PostgreSQL Source Code git master
multixact.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * multixact.c
4 * PostgreSQL multi-transaction-log manager
5 *
6 * The pg_multixact manager is a pg_xact-like manager that stores an array of
7 * MultiXactMember for each MultiXactId. It is a fundamental part of the
8 * shared-row-lock implementation. Each MultiXactMember is comprised of a
9 * TransactionId and a set of flag bits. The name is a bit historical:
10 * originally, a MultiXactId consisted of more than one TransactionId (except
11 * in rare corner cases), hence "multi". Nowadays, however, it's perfectly
12 * legitimate to have MultiXactIds that only include a single Xid.
13 *
14 * The meaning of the flag bits is opaque to this module, but they are mostly
15 * used in heapam.c to identify lock modes that each of the member transactions
16 * is holding on any given tuple. This module just contains support to store
17 * and retrieve the arrays.
18 *
19 * We use two SLRU areas, one for storing the offsets at which the data
20 * starts for each MultiXactId in the other one. This trick allows us to
21 * store variable length arrays of TransactionIds. (We could alternatively
22 * use one area containing counts and TransactionIds, with valid MultiXactId
23 * values pointing at slots containing counts; but that way seems less robust
24 * since it would get completely confused if someone inquired about a bogus
25 * MultiXactId that pointed to an intermediate slot containing an XID.)
26 *
27 * XLOG interactions: this module generates a record whenever a new OFFSETs or
28 * MEMBERs page is initialized to zeroes, as well as an
29 * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
30 * This module ignores the WAL rule "write xlog before data," because it
31 * suffices that actions recording a MultiXactId in a heap xmax do follow that
32 * rule. The only way for the MXID to be referenced from any data page is for
33 * heap_lock_tuple() or heap_update() to have put it there, and each generates
34 * an XLOG record that must follow ours. The normal LSN interlock between the
35 * data page and that XLOG record will ensure that our XLOG record reaches
36 * disk first. If the SLRU members/offsets data reaches disk sooner than the
37 * XLOG records, we do not care; after recovery, no xmax will refer to it. On
38 * the flip side, to ensure that all referenced entries _do_ reach disk, this
39 * module's XLOG records completely rebuild the data entered since the last
40 * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk
41 * before each checkpoint is considered complete.
42 *
43 * Like clog.c, and unlike subtrans.c, we have to preserve state across
44 * crashes and ensure that MXID and offset numbering increases monotonically
45 * across a crash. We do this in the same way as it's done for transaction
46 * IDs: the WAL record is guaranteed to contain evidence of every MXID we
47 * could need to worry about, and we just make sure that at the end of
48 * replay, the next-MXID and next-offset counters are at least as large as
49 * anything we saw during replay.
50 *
51 * We are able to remove segments no longer necessary by carefully tracking
52 * each table's used values: during vacuum, any multixact older than a certain
53 * value is removed; the cutoff value is stored in pg_class. The minimum value
54 * across all tables in each database is stored in pg_database, and the global
55 * minimum across all databases is part of pg_control and is kept in shared
56 * memory. Whenever that minimum is advanced, the SLRUs are truncated.
57 *
58 * When new multixactid values are to be created, care is taken that the
59 * counter does not fall within the wraparound horizon considering the global
60 * minimum value.
61 *
62 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
63 * Portions Copyright (c) 1994, Regents of the University of California
64 *
65 * src/backend/access/transam/multixact.c
66 *
67 *-------------------------------------------------------------------------
68 */
69#include "postgres.h"
70
71#include "access/multixact.h"
72#include "access/slru.h"
73#include "access/twophase.h"
75#include "access/xlog.h"
76#include "access/xloginsert.h"
77#include "access/xlogutils.h"
78#include "miscadmin.h"
79#include "pg_trace.h"
80#include "pgstat.h"
82#include "storage/pmsignal.h"
83#include "storage/proc.h"
84#include "storage/procarray.h"
85#include "utils/guc_hooks.h"
87#include "utils/lsyscache.h"
88#include "utils/memutils.h"
89
90
91/*
92 * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is
93 * used everywhere else in Postgres.
94 *
95 * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
96 * MultiXact page numbering also wraps around at
97 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
98 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need
99 * take no explicit notice of that fact in this module, except when comparing
100 * segment and page numbers in TruncateMultiXact (see
101 * MultiXactOffsetPagePrecedes).
102 */
103
104/* We need four bytes per offset */
105#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
106
107static inline int64
109{
110 return multi / MULTIXACT_OFFSETS_PER_PAGE;
111}
112
113static inline int
115{
116 return multi % MULTIXACT_OFFSETS_PER_PAGE;
117}
118
119static inline int64
121{
123}
124
125/*
126 * The situation for members is a bit more complex: we store one byte of
127 * additional flag bits for each TransactionId. To do this without getting
128 * into alignment issues, we store four bytes of flags, and then the
129 * corresponding 4 Xids. We call each such 5-word (20-byte) set a "group", and
130 * groups are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
131 * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
132 * performance) trumps space efficiency here.
133 *
134 * Note that the "offset" macros work with byte offset, not array indexes, so
135 * arithmetic must be done using "char *" pointers.
136 */
137/* We need eight bits per xact, so one xact fits in a byte */
138#define MXACT_MEMBER_BITS_PER_XACT 8
139#define MXACT_MEMBER_FLAGS_PER_BYTE 1
140#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
141
142/* how many full bytes of flags are there in a group? */
143#define MULTIXACT_FLAGBYTES_PER_GROUP 4
144#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
145 (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
146/* size in bytes of a complete group */
147#define MULTIXACT_MEMBERGROUP_SIZE \
148 (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
149#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
150#define MULTIXACT_MEMBERS_PER_PAGE \
151 (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
152
153/*
154 * Because the number of items per page is not a divisor of the last item
155 * number (member 0xFFFFFFFF), the last segment does not use the maximum number
156 * of pages, and moreover the last used page therein does not use the same
157 * number of items as previous pages. (Another way to say it is that the
158 * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
159 * has some empty space after that item.)
160 *
161 * This constant is the number of members in the last page of the last segment.
162 */
163#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
164 ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
165
166/* page in which a member is to be found */
167static inline int64
169{
170 return offset / MULTIXACT_MEMBERS_PER_PAGE;
171}
172
173static inline int64
175{
177}
178
179/* Location (byte offset within page) of flag word for a given member */
180static inline int
182{
184 int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
185 int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
186
187 return byteoff;
188}
189
190static inline int
192{
193 int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
194 int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
195
196 return bshift;
197}
198
199/* Location (byte offset within page) of TransactionId of given member */
200static inline int
202{
203 int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
204
205 return MXOffsetToFlagsOffset(offset) +
207 member_in_group * sizeof(TransactionId);
208}
209
210/*
 * Multixact members wraparound thresholds, both expressed as fractions of
 * MaxMultiXactOffset: the "safe" threshold is one half of it, and the
 * "danger" threshold is three quarters of it (the maximum minus one
 * quarter).
 */
211#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2)
212#define MULTIXACT_MEMBER_DANGER_THRESHOLD \
213 (MaxMultiXactOffset - MaxMultiXactOffset / 4)
214
215static inline MultiXactId
217{
218 return multi == FirstMultiXactId ? MaxMultiXactId : multi - 1;
219}
220
221/*
222 * Links to shared-memory data structures for MultiXact control
223 */
226
227#define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
228#define MultiXactMemberCtl (&MultiXactMemberCtlData)
229
230/*
231 * MultiXact state shared across all backends. All this state is protected
232 * by MultiXactGenLock. (We also use SLRU bank's lock of MultiXactOffset and
233 * MultiXactMember to guard accesses to the two sets of SLRU buffers. For
234 * concurrency's sake, we avoid holding more than one of these locks at a
235 * time.)
236 */
237typedef struct MultiXactStateData
238{
239 /* next-to-be-assigned MultiXactId */
241
242 /* next-to-be-assigned offset */
244
245 /* Have we completed multixact startup? */
247
248 /*
249 * Oldest multixact that is still potentially referenced by a relation.
250 * Anything older than this should not be consulted. These values are
251 * updated by vacuum.
252 */
255
256 /*
257 * Oldest multixact offset that is potentially referenced by a multixact
258 * referenced by a relation. We don't always know this value, so there's
259 * a flag here to indicate whether or not we currently do.
260 */
263
264 /* support for anti-wraparound measures */
269
270 /* support for members anti-wraparound measures */
271 MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */
272
273 /*
274 * Per-backend data starts here. We have two arrays stored in the area
275 * immediately following the MultiXactStateData struct. Each is indexed by
276 * ProcNumber.
277 *
278 * In both arrays, there's a slot for all normal backends
279 * (0..MaxBackends-1) followed by a slot for max_prepared_xacts prepared
280 * transactions.
281 *
282 * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
283 * transaction(s) could possibly be a member of, or InvalidMultiXactId
284 * when the backend has no live transaction that could possibly be a
285 * member of a MultiXact. Each backend sets its entry to the current
286 * nextMXact counter just before first acquiring a shared lock in a given
287 * transaction, and clears it at transaction end. (This works because only
288 * during or after acquiring a shared lock could an XID possibly become a
289 * member of a MultiXact, and that MultiXact would have to be created
290 * during or after the lock acquisition.)
291 *
292 * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
293 * current transaction(s) think is potentially live, or InvalidMultiXactId
294 * when not in a transaction or not in a transaction that's paid any
295 * attention to MultiXacts yet. This is computed when first needed in a
296 * given transaction, and cleared at transaction end. We can compute it
297 * as the minimum of the valid OldestMemberMXactId[] entries at the time
298 * we compute it (using nextMXact if none are valid). Each backend is
299 * required not to attempt to access any SLRU data for MultiXactIds older
300 * than its own OldestVisibleMXactId[] setting; this is necessary because
301 * the relevant SLRU data can be concurrently truncated away.
302 *
303 * The oldest valid value among all of the OldestMemberMXactId[] and
304 * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
305 * possible value still having any live member transaction -- OldestMxact.
306 * Any value older than that is typically removed from tuple headers, or
307 * "frozen" via being replaced with a new xmax. VACUUM can sometimes even
308 * remove an individual MultiXact xmax whose value is >= its OldestMxact
309 * cutoff, though typically only when no individual member XID is still
310 * running. See FreezeMultiXactId for full details.
311 *
312 * Whenever VACUUM advances relminmxid, then either its OldestMxact cutoff
313 * or the oldest extant Multi remaining in the table is used as the new
314 * pg_class.relminmxid value (whichever is earlier). The minimum of all
315 * relminmxid values in each database is stored in pg_database.datminmxid.
316 * In turn, the minimum of all of those values is stored in pg_control.
317 * This is used as the truncation point for pg_multixact when unneeded
318 * segments get removed by vac_truncate_clog() during vacuuming.
319 */
322
323/*
324 * Size of OldestMemberMXactId and OldestVisibleMXactId arrays.
325 */
326#define MaxOldestSlot (MaxBackends + max_prepared_xacts)
327
328/* Pointers to the state data in shared memory */
332
333
334/*
335 * Definitions for the backend-local MultiXactId cache.
336 *
337 * We use this cache to store known MultiXacts, so we don't need to go to
338 * SLRU areas every time.
339 *
340 * The cache lasts for the duration of a single transaction, the rationale
341 * for this being that most entries will contain our own TransactionId and
342 * so they will be uninteresting by the time our next transaction starts.
343 * (XXX not clear that this is correct --- other members of the MultiXact
344 * could hang around longer than we did. However, it's not clear what a
345 * better policy for flushing old cache entries would be.) FIXME actually
346 * this is plain wrong now that multixacts may contain update Xids.
347 *
348 * We allocate the cache entries in a memory context that is deleted at
349 * transaction end, so we don't need to do retail freeing of entries.
350 */
351typedef struct mXactCacheEnt
352{
358
359#define MAX_CACHE_ENTRIES 256
362
/*
 * Debug logging wrappers.  With MULTIXACT_DEBUG defined, debug_elogN simply
 * forwards its N arguments to elog().  Without it, every debug_elogN expands
 * to nothing, so the call and its argument expressions disappear at compile
 * time (the arguments are not evaluated).
 */
363#ifdef MULTIXACT_DEBUG
364#define debug_elog2(a,b) elog(a,b)
365#define debug_elog3(a,b,c) elog(a,b,c)
366#define debug_elog4(a,b,c,d) elog(a,b,c,d)
367#define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
368#define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
369#else
370#define debug_elog2(a,b)
371#define debug_elog3(a,b,c)
372#define debug_elog4(a,b,c,d)
373#define debug_elog5(a,b,c,d,e)
374#define debug_elog6(a,b,c,d,e,f)
375#endif
376
377/* internal MultiXactId management */
378static void MultiXactIdSetOldestVisible(void);
379static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
380 int nmembers, MultiXactMember *members);
381static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
382
383/* MultiXact cache management */
384static int mxactMemberComparator(const void *arg1, const void *arg2);
385static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
386static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
387static void mXactCachePut(MultiXactId multi, int nmembers,
388 MultiXactMember *members);
389
390/* management of SLRU infrastructure */
391static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2);
392static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2);
393static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
394 MultiXactOffset offset2);
395static void ExtendMultiXactOffset(MultiXactId multi);
396static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
397static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
398 MultiXactOffset start, uint32 distance);
399static bool SetOffsetVacuumLimit(bool is_startup);
400static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
401static void WriteMTruncateXlogRec(Oid oldestMultiDB,
402 MultiXactId startTruncOff,
403 MultiXactId endTruncOff,
404 MultiXactOffset startTruncMemb,
405 MultiXactOffset endTruncMemb);
406
407
408/*
409 * MultiXactIdCreate
410 * Construct a MultiXactId representing two TransactionIds.
411 *
412 * The two XIDs must be different, or be requesting different statuses.
413 *
414 * NB - we don't worry about our local MultiXactId cache here, because that
415 * is handled by the lower-level routines.
416 */
419 TransactionId xid2, MultiXactStatus status2)
420{
421 MultiXactId newMulti;
422 MultiXactMember members[2];
423
426
427 Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
428
429 /* MultiXactIdSetOldestMember() must have been called already. */
431
432 /*
433 * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
434 * are still running. In typical usage, xid2 will be our own XID and the
435 * caller just did a check on xid1, so it'd be wasted effort.
436 */
437
438 members[0].xid = xid1;
439 members[0].status = status1;
440 members[1].xid = xid2;
441 members[1].status = status2;
442
443 newMulti = MultiXactIdCreateFromMembers(2, members);
444
445 debug_elog3(DEBUG2, "Create: %s",
446 mxid_to_string(newMulti, 2, members));
447
448 return newMulti;
449}
450
451/*
452 * MultiXactIdExpand
453 * Add a TransactionId to a pre-existing MultiXactId.
454 *
455 * If the TransactionId is already a member of the passed MultiXactId with the
456 * same status, just return it as-is.
457 *
458 * Note that we do NOT actually modify the membership of a pre-existing
459 * MultiXactId; instead we create a new one. This is necessary to avoid
460 * a race condition against code trying to wait for one MultiXactId to finish;
461 * see notes in heapam.c.
462 *
463 * NB - we don't worry about our local MultiXactId cache here, because that
464 * is handled by the lower-level routines.
465 *
466 * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
467 * one upgraded by pg_upgrade from a cluster older than this feature) are not
468 * passed in.
469 */
472{
473 MultiXactId newMulti;
474 MultiXactMember *members;
475 MultiXactMember *newMembers;
476 int nmembers;
477 int i;
478 int j;
479
482
483 /* MultiXactIdSetOldestMember() must have been called already. */
485
486 debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
487 multi, xid, mxstatus_to_string(status));
488
489 /*
490 * Note: we don't allow for old multis here. The reason is that the only
491 * caller of this function does a check that the multixact is no longer
492 * running.
493 */
494 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
495
496 if (nmembers < 0)
497 {
498 MultiXactMember member;
499
500 /*
501 * The MultiXactId is obsolete. This can only happen if all the
502 * MultiXactId members stop running between the caller checking and
503 * passing it to us. It would be better to return that fact to the
504 * caller, but it would complicate the API and it's unlikely to happen
505 * too often, so just deal with it by creating a singleton MultiXact.
506 */
507 member.xid = xid;
508 member.status = status;
509 newMulti = MultiXactIdCreateFromMembers(1, &member);
510
511 debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
512 multi, newMulti);
513 return newMulti;
514 }
515
516 /*
517 * If the TransactionId is already a member of the MultiXactId with the
518 * same status, just return the existing MultiXactId.
519 */
520 for (i = 0; i < nmembers; i++)
521 {
522 if (TransactionIdEquals(members[i].xid, xid) &&
523 (members[i].status == status))
524 {
525 debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
526 xid, multi);
527 pfree(members);
528 return multi;
529 }
530 }
531
532 /*
533 * Determine which of the members of the MultiXactId are still of
534 * interest. This is any running transaction, and also any transaction
535 * that grabbed something stronger than just a lock and was committed. (An
536 * update that aborted is of no interest here; and having more than one
537 * update Xid in a multixact would cause errors elsewhere.)
538 *
539 * Removing dead members is not just an optimization: freezing of tuples
540 * whose Xmax are multis depends on this behavior.
541 *
542 * Note we have the same race condition here as above: j could be 0 at the
543 * end of the loop.
544 */
545 newMembers = (MultiXactMember *)
546 palloc(sizeof(MultiXactMember) * (nmembers + 1));
547
548 for (i = 0, j = 0; i < nmembers; i++)
549 {
550 if (TransactionIdIsInProgress(members[i].xid) ||
551 (ISUPDATE_from_mxstatus(members[i].status) &&
552 TransactionIdDidCommit(members[i].xid)))
553 {
554 newMembers[j].xid = members[i].xid;
555 newMembers[j++].status = members[i].status;
556 }
557 }
558
559 newMembers[j].xid = xid;
560 newMembers[j++].status = status;
561 newMulti = MultiXactIdCreateFromMembers(j, newMembers);
562
563 pfree(members);
564 pfree(newMembers);
565
566 debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
567
568 return newMulti;
569}
570
571/*
572 * MultiXactIdIsRunning
573 * Returns whether a MultiXactId is "running".
574 *
575 * We return true if at least one member of the given MultiXactId is still
576 * running. Note that a "false" result is certain not to change,
577 * because it is not legal to add members to an existing MultiXactId.
578 *
579 * Caller is expected to have verified that the multixact does not come from
580 * a pg_upgraded share-locked tuple.
581 */
582bool
583MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
584{
585 MultiXactMember *members;
586 int nmembers;
587 int i;
588
589 debug_elog3(DEBUG2, "IsRunning %u?", multi);
590
591 /*
592 * "false" here means we assume our callers have checked that the given
593 * multi cannot possibly come from a pg_upgraded database.
594 */
595 nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly);
596
597 if (nmembers <= 0)
598 {
599 debug_elog2(DEBUG2, "IsRunning: no members");
600 return false;
601 }
602
603 /*
604 * Checking for myself is cheap compared to looking in shared memory;
605 * return true if any live subtransaction of the current top-level
606 * transaction is a member.
607 *
608 * This is not needed for correctness, it's just a fast path.
609 */
610 for (i = 0; i < nmembers; i++)
611 {
612 if (TransactionIdIsCurrentTransactionId(members[i].xid))
613 {
614 debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
615 pfree(members);
616 return true;
617 }
618 }
619
620 /*
621 * This could be made faster by having another entry point in procarray.c,
622 * walking the PGPROC array only once for all the members. But in most
623 * cases nmembers should be small enough that it doesn't much matter.
624 */
625 for (i = 0; i < nmembers; i++)
626 {
627 if (TransactionIdIsInProgress(members[i].xid))
628 {
629 debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
630 i, members[i].xid);
631 pfree(members);
632 return true;
633 }
634 }
635
636 pfree(members);
637
638 debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
639
640 return false;
641}
642
643/*
644 * MultiXactIdSetOldestMember
645 * Save the oldest MultiXactId this transaction could be a member of.
646 *
647 * We set the OldestMemberMXactId for a given transaction the first time it's
648 * going to do some operation that might require a MultiXactId (tuple lock,
649 * update or delete). We need to do this even if we end up using a
650 * TransactionId instead of a MultiXactId, because there is a chance that
651 * another transaction would add our XID to a MultiXactId.
652 *
653 * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
654 * be called just before doing any such possibly-MultiXactId-able operation.
655 */
656void
658{
660 {
661 MultiXactId nextMXact;
662
663 /*
664 * You might think we don't need to acquire a lock here, since
665 * fetching and storing of TransactionIds is probably atomic, but in
666 * fact we do: suppose we pick up nextMXact and then lose the CPU for
667 * a long time. Someone else could advance nextMXact, and then
668 * another someone else could compute an OldestVisibleMXactId that
669 * would be after the value we are going to store when we get control
670 * back. Which would be wrong.
671 *
672 * Note that a shared lock is sufficient, because it's enough to stop
673 * someone from advancing nextMXact; and nobody else could be trying
674 * to write to our OldestMember entry, only reading (and we assume
675 * storing it is atomic.)
676 */
677 LWLockAcquire(MultiXactGenLock, LW_SHARED);
678
679 /*
680 * We have to beware of the possibility that nextMXact is in the
681 * wrapped-around state. We don't fix the counter itself here, but we
682 * must be sure to store a valid value in our array entry.
683 */
684 nextMXact = MultiXactState->nextMXact;
685 if (nextMXact < FirstMultiXactId)
686 nextMXact = FirstMultiXactId;
687
689
690 LWLockRelease(MultiXactGenLock);
691
692 debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
693 MyProcNumber, nextMXact);
694 }
695}
696
697/*
698 * MultiXactIdSetOldestVisible
699 * Save the oldest MultiXactId this transaction considers possibly live.
700 *
701 * We set the OldestVisibleMXactId for a given transaction the first time
702 * it's going to inspect any MultiXactId. Once we have set this, we are
703 * guaranteed that SLRU data for MultiXactIds >= our own OldestVisibleMXactId
704 * won't be truncated away.
705 *
706 * The value to set is the oldest of nextMXact and all the valid per-backend
707 * OldestMemberMXactId[] entries. Because of the locking we do, we can be
708 * certain that no subsequent call to MultiXactIdSetOldestMember can set
709 * an OldestMemberMXactId[] entry older than what we compute here. Therefore
710 * there is no live transaction, now or later, that can be a member of any
711 * MultiXactId older than the OldestVisibleMXactId we compute here.
712 */
713static void
715{
717 {
718 MultiXactId oldestMXact;
719 int i;
720
721 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
722
723 /*
724 * We have to beware of the possibility that nextMXact is in the
725 * wrapped-around state. We don't fix the counter itself here, but we
726 * must be sure to store a valid value in our array entry.
727 */
728 oldestMXact = MultiXactState->nextMXact;
729 if (oldestMXact < FirstMultiXactId)
730 oldestMXact = FirstMultiXactId;
731
732 for (i = 0; i < MaxOldestSlot; i++)
733 {
734 MultiXactId thisoldest = OldestMemberMXactId[i];
735
736 if (MultiXactIdIsValid(thisoldest) &&
737 MultiXactIdPrecedes(thisoldest, oldestMXact))
738 oldestMXact = thisoldest;
739 }
740
741 OldestVisibleMXactId[MyProcNumber] = oldestMXact;
742
743 LWLockRelease(MultiXactGenLock);
744
745 debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
746 MyProcNumber, oldestMXact);
747 }
748}
749
750/*
751 * ReadNextMultiXactId
752 * Return the next MultiXactId to be assigned, but don't allocate it
753 */
756{
757 MultiXactId mxid;
758
759 /* XXX we could presumably do this without a lock. */
760 LWLockAcquire(MultiXactGenLock, LW_SHARED);
762 LWLockRelease(MultiXactGenLock);
763
764 if (mxid < FirstMultiXactId)
765 mxid = FirstMultiXactId;
766
767 return mxid;
768}
769
770/*
771 * ReadMultiXactIdRange
772 * Get the range of IDs that may still be referenced by a relation.
773 */
774void
776{
777 LWLockAcquire(MultiXactGenLock, LW_SHARED);
780 LWLockRelease(MultiXactGenLock);
781
782 if (*oldest < FirstMultiXactId)
783 *oldest = FirstMultiXactId;
784 if (*next < FirstMultiXactId)
786}
787
788
789/*
790 * MultiXactIdCreateFromMembers
791 * Make a new MultiXactId from the specified set of members
792 *
793 * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
794 * given TransactionIds as members. Returns the newly created MultiXactId.
795 *
796 * NB: the passed members[] array will be sorted in-place.
797 */
800{
801 MultiXactId multi;
802 MultiXactOffset offset;
804
805 debug_elog3(DEBUG2, "Create: %s",
806 mxid_to_string(InvalidMultiXactId, nmembers, members));
807
808 /*
809 * See if the same set of members already exists in our cache; if so, just
810 * re-use that MultiXactId. (Note: it might seem that looking in our
811 * cache is insufficient, and we ought to search disk to see if a
812 * duplicate definition already exists. But since we only ever create
813 * MultiXacts containing our own XID, in most cases any such MultiXacts
814 * were in fact created by us, and so will be in our cache. There are
815 * corner cases where someone else added us to a MultiXact without our
816 * knowledge, but it's not worth checking for.)
817 */
818 multi = mXactCacheGetBySet(nmembers, members);
819 if (MultiXactIdIsValid(multi))
820 {
821 debug_elog2(DEBUG2, "Create: in cache!");
822 return multi;
823 }
824
825 /* Verify that there is at most one update Xid among the given members. */
826 {
827 int i;
828 bool has_update = false;
829
830 for (i = 0; i < nmembers; i++)
831 {
832 if (ISUPDATE_from_mxstatus(members[i].status))
833 {
834 if (has_update)
835 elog(ERROR, "new multixact has more than one updating member: %s",
836 mxid_to_string(InvalidMultiXactId, nmembers, members));
837 has_update = true;
838 }
839 }
840 }
841
842 /* Load the injection point before entering the critical section */
843 INJECTION_POINT_LOAD("multixact-create-from-members");
844
845 /*
846 * Assign the MXID and offsets range to use, and make sure there is space
847 * in the OFFSETs and MEMBERs files. NB: this routine does
848 * START_CRIT_SECTION().
849 *
850 * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
851 * that we've called MultiXactIdSetOldestMember here. This is because
852 * this routine is used in some places to create new MultiXactIds of which
853 * the current backend is not a member, notably during freezing of multis
854 * in vacuum. During vacuum, in particular, it would be unacceptable to
855 * keep OldestMulti set, in case it runs for long.
856 */
857 multi = GetNewMultiXactId(nmembers, &offset);
858
859 INJECTION_POINT_CACHED("multixact-create-from-members", NULL);
860
861 /* Make an XLOG entry describing the new MXID. */
862 xlrec.mid = multi;
863 xlrec.moff = offset;
864 xlrec.nmembers = nmembers;
865
866 /*
867 * XXX Note: there's a lot of padding space in MultiXactMember. We could
868 * find a more compact representation of this Xlog record -- perhaps all
869 * the status flags in one XLogRecData, then all the xids in another one?
870 * Not clear that it's worth the trouble though.
871 */
874 XLogRegisterData(members, nmembers * sizeof(MultiXactMember));
875
876 (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
877
878 /* Now enter the information into the OFFSETs and MEMBERs logs */
879 RecordNewMultiXact(multi, offset, nmembers, members);
880
881 /* Done with critical section */
883
884 /* Store the new MultiXactId in the local cache, too */
885 mXactCachePut(multi, nmembers, members);
886
887 debug_elog2(DEBUG2, "Create: all done");
888
889 return multi;
890}
891
892/*
893 * RecordNewMultiXact
894 * Write info about a new multixact into the offsets and members files
895 *
896 * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
897 * use it.
898 */
899static void
901 int nmembers, MultiXactMember *members)
902{
903 int64 pageno;
904 int64 prev_pageno;
905 int entryno;
906 int slotno;
907 MultiXactOffset *offptr;
909 int64 next_pageno;
910 int next_entryno;
911 MultiXactOffset *next_offptr;
912 MultiXactOffset next_offset;
913 LWLock *lock;
914 LWLock *prevlock = NULL;
915
916 /* position of this multixid in the offsets SLRU area */
917 pageno = MultiXactIdToOffsetPage(multi);
918 entryno = MultiXactIdToOffsetEntry(multi);
919
920 /* position of the next multixid */
921 next = multi + 1;
924 next_pageno = MultiXactIdToOffsetPage(next);
925 next_entryno = MultiXactIdToOffsetEntry(next);
926
927 /*
928 * Set the starting offset of this multixid's members.
929 *
930 * In the common case, it was already set by the previous
931 * RecordNewMultiXact call, as this was the next multixid of the previous
932 * multixid. But if multiple backends are generating multixids
933 * concurrently, we might race ahead and get called before the previous
934 * multixid.
935 */
938
939 /*
940 * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
941 * to complain about if there's any I/O error. This is kinda bogus, but
942 * since the errors will always give the full pathname, it should be clear
943 * enough that a MultiXactId is really involved. Perhaps someday we'll
944 * take the trouble to generalize the slru.c error reporting code.
945 */
946 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
947 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
948 offptr += entryno;
949
950 if (*offptr != offset)
951 {
952 /* should already be set to the correct value, or not at all */
953 Assert(*offptr == 0);
954 *offptr = offset;
955 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
956 }
957
958 /*
959 * Set the next multixid's offset to the end of this multixid's members.
960 */
961 if (next_pageno == pageno)
962 {
963 next_offptr = offptr + 1;
964 }
965 else
966 {
967 /* must be the first entry on the page */
968 Assert(next_entryno == 0 || next == FirstMultiXactId);
969
970 /* Swap the lock for a lock on the next page */
971 LWLockRelease(lock);
972 lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
974
975 slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
976 next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
977 next_offptr += next_entryno;
978 }
979
980 /* Like in GetNewMultiXactId(), skip over offset 0 */
981 next_offset = offset + nmembers;
982 if (next_offset == 0)
983 next_offset = 1;
984 if (*next_offptr != next_offset)
985 {
986 /* should already be set to the correct value, or not at all */
987 Assert(*next_offptr == 0);
988 *next_offptr = next_offset;
989 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
990 }
991
992 /* Release MultiXactOffset SLRU lock. */
993 LWLockRelease(lock);
994
995 prev_pageno = -1;
996
997 for (int i = 0; i < nmembers; i++, offset++)
998 {
999 TransactionId *memberptr;
1000 uint32 *flagsptr;
1001 uint32 flagsval;
1002 int bshift;
1003 int flagsoff;
1004 int memberoff;
1005
1006 Assert(members[i].status <= MultiXactStatusUpdate);
1007
1008 pageno = MXOffsetToMemberPage(offset);
1009 memberoff = MXOffsetToMemberOffset(offset);
1010 flagsoff = MXOffsetToFlagsOffset(offset);
1011 bshift = MXOffsetToFlagsBitShift(offset);
1012
1013 if (pageno != prev_pageno)
1014 {
1015 /*
1016 * The MultiXactMember SLRU page has changed, so check whether this new
1017 * page falls into a different SLRU bank; if so, release the old bank's
1018 * lock and acquire the lock on the new bank.
1019 */
1021 if (lock != prevlock)
1022 {
1023 if (prevlock != NULL)
1024 LWLockRelease(prevlock);
1025
1027 prevlock = lock;
1028 }
1029 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1030 prev_pageno = pageno;
1031 }
1032
1033 memberptr = (TransactionId *)
1034 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1035
1036 *memberptr = members[i].xid;
1037
1038 flagsptr = (uint32 *)
1039 (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1040
1041 flagsval = *flagsptr;
1042 flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
1043 flagsval |= (members[i].status << bshift);
1044 *flagsptr = flagsval;
1045
1046 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
1047 }
1048
1049 if (prevlock != NULL)
1050 LWLockRelease(prevlock);
1051}
1052
1053/*
1054 * GetNewMultiXactId
1055 * Get the next MultiXactId.
1056 *
1057 * Also, reserve the needed amount of space in the "members" area. The
1058 * starting offset of the reserved space is returned in *offset.
1059 *
1060 * This may generate XLOG records for expansion of the offsets and/or members
1061 * files. Unfortunately, we have to do that while holding MultiXactGenLock
1062 * to avoid race conditions --- the XLOG record for zeroing a page must appear
1063 * before any backend can possibly try to store data in that page!
1064 *
1065 * We start a critical section before advancing the shared counters. The
1066 * caller must end the critical section after writing SLRU data.
1067 */
1068static MultiXactId
1070{
1071 MultiXactId result;
1072 MultiXactOffset nextOffset;
1073
1074 debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
1075
1076 /* safety check, we should never get this far in a HS standby */
1077 if (RecoveryInProgress())
1078 elog(ERROR, "cannot assign MultiXactIds during recovery");
1079
1080 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1081
1082 /* Handle wraparound of the nextMXact counter */
1085
1086 /* Assign the MXID */
1087 result = MultiXactState->nextMXact;
1088
1089 /*----------
1090 * Check to see if it's safe to assign another MultiXactId. This protects
1091 * against catastrophic data loss due to multixact wraparound. The basic
1092 * rules are:
1093 *
1094 * If we're past multiVacLimit or the safe threshold for member storage
1095 * space, or we don't know what the safe threshold for member storage is,
1096 * start trying to force autovacuum cycles.
1097 * If we're past multiWarnLimit, start issuing warnings.
1098 * If we're past multiStopLimit, refuse to create new MultiXactIds.
1099 *
1100 * Note these are pretty much the same protections in GetNewTransactionId.
1101 *----------
1102 */
1104 {
1105 /*
1106 * For safety's sake, we release MultiXactGenLock while sending
1107 * signals, warnings, etc. This is not so much because we care about
1108 * preserving concurrency in this situation, as to avoid any
1109 * possibility of deadlock while doing get_database_name(). First,
1110 * copy all the shared values we'll need in this path.
1111 */
1112 MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
1113 MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
1114 MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
1115 Oid oldest_datoid = MultiXactState->oldestMultiXactDB;
1116
1117 LWLockRelease(MultiXactGenLock);
1118
1119 if (IsUnderPostmaster &&
1120 !MultiXactIdPrecedes(result, multiStopLimit))
1121 {
1122 char *oldest_datname = get_database_name(oldest_datoid);
1123
1124 /*
1125 * Immediately kick autovacuum into action as we're already in
1126 * ERROR territory.
1127 */
1129
1130 /* complain even if that DB has disappeared */
1131 if (oldest_datname)
1132 ereport(ERROR,
1133 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1134 errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database \"%s\"",
1135 oldest_datname),
1136 errhint("Execute a database-wide VACUUM in that database.\n"
1137 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1138 else
1139 ereport(ERROR,
1140 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1141 errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database with OID %u",
1142 oldest_datoid),
1143 errhint("Execute a database-wide VACUUM in that database.\n"
1144 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1145 }
1146
1147 /*
1148 * To avoid swamping the postmaster with signals, we issue the autovac
1149 * request only once per 64K multis generated. This still gives
1150 * plenty of chances before we get into real trouble.
1151 */
1152 if (IsUnderPostmaster && (result % 65536) == 0)
1154
1155 if (!MultiXactIdPrecedes(result, multiWarnLimit))
1156 {
1157 char *oldest_datname = get_database_name(oldest_datoid);
1158
1159 /* complain even if that DB has disappeared */
1160 if (oldest_datname)
1162 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
1163 "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
1164 multiWrapLimit - result,
1165 oldest_datname,
1166 multiWrapLimit - result),
1167 errhint("Execute a database-wide VACUUM in that database.\n"
1168 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1169 else
1171 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
1172 "database with OID %u must be vacuumed before %u more MultiXactIds are used",
1173 multiWrapLimit - result,
1174 oldest_datoid,
1175 multiWrapLimit - result),
1176 errhint("Execute a database-wide VACUUM in that database.\n"
1177 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1178 }
1179
1180 /* Re-acquire lock and start over */
1181 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1182 result = MultiXactState->nextMXact;
1183 if (result < FirstMultiXactId)
1184 result = FirstMultiXactId;
1185 }
1186
1187 /*
1188 * Make sure there is room for the next MXID in the file. Assigning this
1189 * MXID sets the next MXID's offset already.
1190 */
1191 ExtendMultiXactOffset(result + 1);
1192
1193 /*
1194 * Reserve the members space, similarly to above. Also, be careful not to
1195 * return zero as the starting offset for any multixact. See
1196 * GetMultiXactIdMembers() for motivation.
1197 */
1198 nextOffset = MultiXactState->nextOffset;
1199 if (nextOffset == 0)
1200 {
1201 *offset = 1;
1202 nmembers++; /* allocate member slot 0 too */
1203 }
1204 else
1205 *offset = nextOffset;
1206
1207 /*----------
1208 * Protect against overrun of the members space as well, with the
1209 * following rules:
1210 *
1211 * If we're past offsetStopLimit, refuse to generate more multis.
1212 * If we're close to offsetStopLimit, emit a warning.
1213 *
1214 * Arbitrarily, we start emitting warnings when we're 20 segments or less
1215 * from offsetStopLimit.
1216 *
1217 * Note we haven't updated the shared state yet, so if we fail at this
1218 * point, the multixact ID we grabbed can still be used by the next guy.
1219 *
1220 * Note that there is no point in forcing autovacuum runs here: the
1221 * multixact freeze settings would have to be reduced for that to have any
1222 * effect.
1223 *----------
1224 */
1225#define OFFSET_WARN_SEGMENTS 20
1228 nmembers))
1229 {
1230 /* see comment in the corresponding offsets wraparound case */
1232
1233 ereport(ERROR,
1234 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1235 errmsg("multixact \"members\" limit exceeded"),
1236 errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
1237 "This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
1238 MultiXactState->offsetStopLimit - nextOffset - 1,
1239 nmembers,
1240 MultiXactState->offsetStopLimit - nextOffset - 1),
1241 errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.",
1243 }
1244
1245 /*
1246 * Check whether we should kick autovacuum into action, to prevent members
1247 * wraparound. NB we use a much larger window to trigger autovacuum than
1248 * just the warning limit. The warning is just a measure of last resort -
1249 * this is in line with GetNewTransactionId's behaviour.
1250 */
1254 {
1255 /*
1256 * To avoid swamping the postmaster with signals, we issue the autovac
1257 * request only when crossing a segment boundary. With default
1258 * compilation settings that's roughly after 50k members. This still
1259 * gives plenty of chances before we get into real trouble.
1260 */
1261 if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
1262 (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
1264 }
1265
1268 nextOffset,
1271 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1272 errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used",
1273 "database with OID %u must be vacuumed before %d more multixact members are used",
1274 MultiXactState->offsetStopLimit - nextOffset + nmembers,
1276 MultiXactState->offsetStopLimit - nextOffset + nmembers),
1277 errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.")));
1278
1279 ExtendMultiXactMember(nextOffset, nmembers);
1280
1281 /*
1282 * Critical section from here until caller has written the data into the
1283 * just-reserved SLRU space; we don't want to error out with a partly
1284 * written MultiXact structure. (In particular, failing to write our
1285 * start offset after advancing nextMXact would effectively corrupt the
1286 * previous MultiXact.)
1287 */
1289
1290 /*
1291 * Advance counters. As in GetNewTransactionId(), this must not happen
1292 * until after file extension has succeeded!
1293 *
1294 * We don't care about MultiXactId wraparound here; it will be handled by
1295 * the next iteration. But note that nextMXact may be InvalidMultiXactId
1296 * or the first value on a segment-beginning page after this routine
1297 * exits, so anyone else looking at the variable must be prepared to deal
1298 * with either case. Similarly, nextOffset may be zero, but we won't use
1299 * that as the actual start offset of the next multixact.
1300 */
1302
1303 MultiXactState->nextOffset += nmembers;
1304
1305 LWLockRelease(MultiXactGenLock);
1306
1307 debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
1308 return result;
1309}
1310
1311/*
1312 * GetMultiXactIdMembers
1313 * Return the set of MultiXactMembers that make up a MultiXactId
1314 *
1315 * Return value is the number of members found, or -1 if there are none,
1316 * and *members is set to a newly palloc'ed array of members. It's the
1317 * caller's responsibility to free it when done with it.
1318 *
1319 * from_pgupgrade must be passed as true if and only if the multixact
1320 * corresponds to a value from a tuple that was locked in a 9.2-or-older
1321 * installation and later pg_upgrade'd (that is, the infomask is
1322 * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members
1323 * can still be running, so we return -1 just like for an empty multixact
1324 * without any further checking. It would be wrong to try to resolve such a
1325 * multixact: either the multixact is within the current valid multixact
1326 * range, in which case the returned result would be bogus, or outside that
1327 * range, in which case an error would be raised.
1328 *
1329 * In all other cases, the passed multixact must be within the known valid
1330 * range, that is, greater than or equal to oldestMultiXactId, and less than
1331 * nextMXact. Otherwise, an error is raised.
1332 *
1333 * isLockOnly must be set to true if caller is certain that the given multi
1334 * is used only to lock tuples; can be false without loss of correctness,
1335 * but passing true means we can return quickly without checking for
1336 * old updates.
1337 */
1338int
1340 bool from_pgupgrade, bool isLockOnly)
1341{
1342 int64 pageno;
1343 int64 prev_pageno;
1344 int entryno;
1345 int slotno;
1346 MultiXactOffset *offptr;
1347 MultiXactOffset offset;
1348 int length;
1349 int truelength;
1350 MultiXactId oldestMXact;
1351 MultiXactId nextMXact;
1352 MultiXactMember *ptr;
1353 LWLock *lock;
1354
1355 debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
1356
1357 if (!MultiXactIdIsValid(multi) || from_pgupgrade)
1358 {
1359 *members = NULL;
1360 return -1;
1361 }
1362
1363 /* See if the MultiXactId is in the local cache */
1364 length = mXactCacheGetById(multi, members);
1365 if (length >= 0)
1366 {
1367 debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
1368 mxid_to_string(multi, length, *members));
1369 return length;
1370 }
1371
1372 /* Set our OldestVisibleMXactId[] entry if we didn't already */
1374
1375 /*
1376 * If we know the multi is used only for locking and not for updates, then
1377 * we can skip checking if the value is older than our oldest visible
1378 * multi. It cannot possibly still be running.
1379 */
1380 if (isLockOnly &&
1382 {
1383 debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
1384 *members = NULL;
1385 return -1;
1386 }
1387
1388 /*
1389 * We check known limits on MultiXact before resorting to the SLRU area.
1390 *
1391 * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
1392 * useful; it has already been removed, or will be removed shortly, by
1393 * truncation. If one is passed, an error is raised.
1394 *
1395 * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
1396 * implies undetected ID wraparound has occurred. This raises a hard
1397 * error.
1398 *
1399 * Shared lock is enough here since we aren't modifying any global state.
1400 * Acquire it just long enough to grab the current counter values.
1401 */
1402 LWLockAcquire(MultiXactGenLock, LW_SHARED);
1403
1404 oldestMXact = MultiXactState->oldestMultiXactId;
1405 nextMXact = MultiXactState->nextMXact;
1406
1407 LWLockRelease(MultiXactGenLock);
1408
1409 if (MultiXactIdPrecedes(multi, oldestMXact))
1410 ereport(ERROR,
1411 (errcode(ERRCODE_INTERNAL_ERROR),
1412 errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
1413 multi)));
1414
1415 if (!MultiXactIdPrecedes(multi, nextMXact))
1416 ereport(ERROR,
1417 (errcode(ERRCODE_INTERNAL_ERROR),
1418 errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
1419 multi)));
1420
1421 /*
1422 * Find out the offset at which we need to start reading MultiXactMembers
1423 * and the number of members in the multixact. We determine the latter as
1424 * the difference between this multixact's starting offset and the next
1425 * one's. However, there is one corner case to worry about:
1426 *
1427 * Because GetNewMultiXactId skips over offset zero, to reserve zero
1428 * to mean "unset", there is an ambiguity near the point of offset
1429 * wraparound. If we see next multixact's offset is one, is that our
1430 * multixact's actual endpoint, or did it end at zero with a subsequent
1431 * increment? We handle this using the knowledge that if the zero'th
1432 * member slot wasn't filled, it'll contain zero, and zero isn't a valid
1433 * transaction ID so it can't be a multixact member. Therefore, if we
1434 * read a zero from the members array, just ignore it.
1435 */
1436 pageno = MultiXactIdToOffsetPage(multi);
1437 entryno = MultiXactIdToOffsetEntry(multi);
1438
1439 /* Acquire the bank lock for the page we need. */
1442
1443 /* read this multi's offset */
1444 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
1445 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1446 offptr += entryno;
1447 offset = *offptr;
1448
1449 Assert(offset != 0);
1450
1451 /* read next multi's offset */
1452 {
1453 MultiXactId tmpMXact;
1454 MultiXactOffset nextMXOffset;
1455
1456 /* handle wraparound if needed */
1457 tmpMXact = multi + 1;
1458 if (tmpMXact < FirstMultiXactId)
1459 tmpMXact = FirstMultiXactId;
1460
1461 prev_pageno = pageno;
1462
1463 pageno = MultiXactIdToOffsetPage(tmpMXact);
1464 entryno = MultiXactIdToOffsetEntry(tmpMXact);
1465
1466 if (pageno != prev_pageno)
1467 {
1468 LWLock *newlock;
1469
1470 /*
1471 * Since we're going to access a different SLRU page, if this page
1472 * falls under a different bank, release the old bank's lock and
1473 * acquire the lock of the new bank.
1474 */
1475 newlock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1476 if (newlock != lock)
1477 {
1478 LWLockRelease(lock);
1479 LWLockAcquire(newlock, LW_EXCLUSIVE);
1480 lock = newlock;
1481 }
1482 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
1483 }
1484
1485 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1486 offptr += entryno;
1487 nextMXOffset = *offptr;
1488
1489 if (nextMXOffset == 0)
1490 ereport(ERROR,
1492 errmsg("MultiXact %u has invalid next offset",
1493 multi)));
1494
1495 length = nextMXOffset - offset;
1496 }
1497
1498 LWLockRelease(lock);
1499 lock = NULL;
1500
1501 /* read the members */
1502 ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
1503
1504 truelength = 0;
1505 prev_pageno = -1;
1506 for (int i = 0; i < length; i++, offset++)
1507 {
1508 TransactionId *xactptr;
1509 uint32 *flagsptr;
1510 int flagsoff;
1511 int bshift;
1512 int memberoff;
1513
1514 pageno = MXOffsetToMemberPage(offset);
1515 memberoff = MXOffsetToMemberOffset(offset);
1516
1517 if (pageno != prev_pageno)
1518 {
1519 LWLock *newlock;
1520
1521 /*
1522 * Since we're going to access a different SLRU page, if this page
1523 * falls under a different bank, release the old bank's lock and
1524 * acquire the lock of the new bank.
1525 */
1526 newlock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
1527 if (newlock != lock)
1528 {
1529 if (lock)
1530 LWLockRelease(lock);
1531 LWLockAcquire(newlock, LW_EXCLUSIVE);
1532 lock = newlock;
1533 }
1534
1535 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1536 prev_pageno = pageno;
1537 }
1538
1539 xactptr = (TransactionId *)
1540 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1541
1542 if (!TransactionIdIsValid(*xactptr))
1543 {
1544 /* Corner case: we must be looking at unused slot zero */
1545 Assert(offset == 0);
1546 continue;
1547 }
1548
1549 flagsoff = MXOffsetToFlagsOffset(offset);
1550 bshift = MXOffsetToFlagsBitShift(offset);
1551 flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1552
1553 ptr[truelength].xid = *xactptr;
1554 ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
1555 truelength++;
1556 }
1557
1558 LWLockRelease(lock);
1559
1560 /* A multixid with zero members should not happen */
1561 Assert(truelength > 0);
1562
1563 /*
1564 * Copy the result into the local cache.
1565 */
1566 mXactCachePut(multi, truelength, ptr);
1567
1568 debug_elog3(DEBUG2, "GetMembers: no cache for %s",
1569 mxid_to_string(multi, truelength, ptr));
1570 *members = ptr;
1571 return truelength;
1572}
1573
1574/*
1575 * mxactMemberComparator
1576 * qsort comparison function for MultiXactMember
1577 *
1578 * We can't use wraparound comparison for XIDs because that does not respect
1579 * the triangle inequality! Any old sort order will do.
1580 */
1581static int
1582mxactMemberComparator(const void *arg1, const void *arg2)
1583{
1584 MultiXactMember member1 = *(const MultiXactMember *) arg1;
1585 MultiXactMember member2 = *(const MultiXactMember *) arg2;
1586
1587 if (member1.xid > member2.xid)
1588 return 1;
1589 if (member1.xid < member2.xid)
1590 return -1;
1591 if (member1.status > member2.status)
1592 return 1;
1593 if (member1.status < member2.status)
1594 return -1;
1595 return 0;
1596}
1597
1598/*
1599 * mXactCacheGetBySet
1600 * returns a MultiXactId from the cache based on the set of
1601 * TransactionIds that compose it, or InvalidMultiXactId if
1602 * none matches.
1603 *
1604 * This is helpful, for example, if two transactions want to lock a huge
1605 * table. By using the cache, the second will use the same MultiXactId
1606 * for the majority of tuples, thus keeping MultiXactId usage low (saving
1607 * both I/O and wraparound issues).
1608 *
1609 * NB: the passed members array will be sorted in-place.
1610 */
1611static MultiXactId
1613{
1614 dlist_iter iter;
1615
1616 debug_elog3(DEBUG2, "CacheGet: looking for %s",
1617 mxid_to_string(InvalidMultiXactId, nmembers, members));
1618
1619 /* sort the array so comparison is easy */
1620 qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1621
1623 {
1625 iter.cur);
1626
1627 if (entry->nmembers != nmembers)
1628 continue;
1629
1630 /*
1631 * We assume the cache entries are sorted, and that the unused bits in
1632 * "status" are zeroed.
1633 */
1634 if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
1635 {
1636 debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
1638 return entry->multi;
1639 }
1640 }
1641
1642 debug_elog2(DEBUG2, "CacheGet: not found :-(");
1643 return InvalidMultiXactId;
1644}
1645
1646/*
1647 * mXactCacheGetById
1648 * returns the composing MultiXactMember set from the cache for a
1649 * given MultiXactId, if present.
1650 *
1651 * If successful, *xids is set to the address of a palloc'd copy of the
1652 * MultiXactMember set. Return value is number of members, or -1 on failure.
1653 */
1654static int
1656{
1657 dlist_iter iter;
1658
1659 debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
1660
1662 {
1664 iter.cur);
1665
1666 if (entry->multi == multi)
1667 {
1668 MultiXactMember *ptr;
1669 Size size;
1670
1671 size = sizeof(MultiXactMember) * entry->nmembers;
1672 ptr = (MultiXactMember *) palloc(size);
1673
1674 memcpy(ptr, entry->members, size);
1675
1676 debug_elog3(DEBUG2, "CacheGet: found %s",
1677 mxid_to_string(multi,
1678 entry->nmembers,
1679 entry->members));
1680
1681 /*
1682 * Note we modify the list while not using a modifiable iterator.
1683 * This is acceptable only because we exit the iteration
1684 * immediately afterwards.
1685 */
1687
1688 *members = ptr;
1689 return entry->nmembers;
1690 }
1691 }
1692
1693 debug_elog2(DEBUG2, "CacheGet: not found");
1694 return -1;
1695}
1696
1697/*
1698 * mXactCachePut
1699 * Add a new MultiXactId and its composing set into the local cache.
1700 */
1701static void
1702mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
1703{
1704 mXactCacheEnt *entry;
1705
1706 debug_elog3(DEBUG2, "CachePut: storing %s",
1707 mxid_to_string(multi, nmembers, members));
1708
1709 if (MXactContext == NULL)
1710 {
1711 /* The cache only lives as long as the current transaction */
1712 debug_elog2(DEBUG2, "CachePut: initializing memory context");
1714 "MultiXact cache context",
1716 }
1717
1718 entry = (mXactCacheEnt *)
1720 offsetof(mXactCacheEnt, members) +
1721 nmembers * sizeof(MultiXactMember));
1722
1723 entry->multi = multi;
1724 entry->nmembers = nmembers;
1725 memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
1726
1727 /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
1728 qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1729
1730 dclist_push_head(&MXactCache, &entry->node);
1732 {
1733 dlist_node *node;
1734
1737
1738 entry = dclist_container(mXactCacheEnt, node, node);
1739 debug_elog3(DEBUG2, "CachePut: pruning cached multi %u",
1740 entry->multi);
1741
1742 pfree(entry);
1743 }
1744}
1745
1746char *
1748{
1749 switch (status)
1750 {
1752 return "keysh";
1754 return "sh";
1756 return "fornokeyupd";
1758 return "forupd";
1760 return "nokeyupd";
1762 return "upd";
1763 default:
1764 elog(ERROR, "unrecognized multixact status %d", status);
1765 return "";
1766 }
1767}
1768
1769char *
1770mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
1771{
1772 static char *str = NULL;
1774 int i;
1775
1776 if (str != NULL)
1777 pfree(str);
1778
1780
1781 appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
1782 mxstatus_to_string(members[0].status));
1783
1784 for (i = 1; i < nmembers; i++)
1785 appendStringInfo(&buf, ", %u (%s)", members[i].xid,
1786 mxstatus_to_string(members[i].status));
1787
1790 pfree(buf.data);
1791 return str;
1792}
1793
1794/*
1795 * AtEOXact_MultiXact
1796 * Handle transaction end for MultiXact
1797 *
1798 * This is called at top transaction commit or abort (we don't care which).
1799 */
1800void
1802{
1803 /*
1804 * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
1805 * which should only be valid while within a transaction.
1806 *
1807 * We assume that storing a MultiXactId is atomic and so we need not take
1808 * MultiXactGenLock to do this.
1809 */
1812
1813 /*
1814 * Discard the local MultiXactId cache. Since MXactContext was created as
1815 * a child of TopTransactionContext, we needn't delete it explicitly.
1816 */
1817 MXactContext = NULL;
1819}
1820
1821/*
1822 * AtPrepare_MultiXact
1823 * Save multixact state at 2PC transaction prepare
1824 *
1825 * In this phase, we only store our OldestMemberMXactId value in the two-phase
1826 * state file.
1827 */
1828void
1830{
1832
1833 if (MultiXactIdIsValid(myOldestMember))
1835 &myOldestMember, sizeof(MultiXactId));
1836}
1837
1838/*
1839 * PostPrepare_MultiXact
1840 * Clean up after successful PREPARE TRANSACTION
1841 */
1842void
1844{
1845 MultiXactId myOldestMember;
1846
1847 /*
1848 * Transfer our OldestMemberMXactId value to the slot reserved for the
1849 * prepared transaction.
1850 */
1851 myOldestMember = OldestMemberMXactId[MyProcNumber];
1852 if (MultiXactIdIsValid(myOldestMember))
1853 {
1854 ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
1855
1856 /*
1857 * Even though storing MultiXactId is atomic, acquire lock to make
1858 * sure others see both changes, not just the reset of the slot of the
1859 * current backend. Using a volatile pointer might suffice, but this
1860 * isn't a hot spot.
1861 */
1862 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1863
1864 OldestMemberMXactId[dummyProcNumber] = myOldestMember;
1866
1867 LWLockRelease(MultiXactGenLock);
1868 }
1869
1870 /*
1871 * We don't need to transfer OldestVisibleMXactId value, because the
1872 * transaction is not going to be looking at any more multixacts once it's
1873 * prepared.
1874 *
1875 * We assume that storing a MultiXactId is atomic and so we need not take
1876 * MultiXactGenLock to do this.
1877 */
1879
1880 /*
1881 * Discard the local MultiXactId cache like in AtEOXact_MultiXact.
1882 */
1883 MXactContext = NULL;
1885}
1886
1887/*
1888 * multixact_twophase_recover
1889 * Recover the state of a prepared transaction at startup
1890 */
1891void
1893 void *recdata, uint32 len)
1894{
1895 ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
1896 MultiXactId oldestMember;
1897
1898 /*
1899 * Get the oldest member XID from the state file record, and set it in the
1900 * OldestMemberMXactId slot reserved for this prepared transaction.
1901 */
1902 Assert(len == sizeof(MultiXactId));
1903 oldestMember = *((MultiXactId *) recdata);
1904
1905 OldestMemberMXactId[dummyProcNumber] = oldestMember;
1906}
1907
1908/*
1909 * multixact_twophase_postcommit
1910 * Similar to AtEOXact_MultiXact but for COMMIT PREPARED
1911 */
1912void
1914 void *recdata, uint32 len)
1915{
1916 ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, true);
1917
1918 Assert(len == sizeof(MultiXactId));
1919
1920 OldestMemberMXactId[dummyProcNumber] = InvalidMultiXactId;
1921}
1922
1923/*
1924 * multixact_twophase_postabort
1925 * This is actually just the same as the COMMIT case.
1926 */
1927void
1929 void *recdata, uint32 len)
1930{
1931 multixact_twophase_postcommit(fxid, info, recdata, len);
1932}
1933
1934/*
1935 * Initialization of shared memory for MultiXact. We use two SLRU areas,
1936 * thus double memory. Also, reserve space for the shared MultiXactState
1937 * struct and the per-backend MultiXactId arrays (two of those, too).
1938 */
1939Size
1941{
1942 Size size;
1943
1944 /* We need 2*MaxOldestSlot perBackendXactIds[] entries */
1945#define SHARED_MULTIXACT_STATE_SIZE \
1946 add_size(offsetof(MultiXactStateData, perBackendXactIds), \
1947 mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
1948
1952
1953 return size;
1954}
1955
1956void
1958{
1959 bool found;
1960
1961 debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");
1962
1965
1967 "multixact_offset", multixact_offset_buffers, 0,
1968 "pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER,
1969 LWTRANCHE_MULTIXACTOFFSET_SLRU,
1971 false);
1974 "multixact_member", multixact_member_buffers, 0,
1975 "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER,
1976 LWTRANCHE_MULTIXACTMEMBER_SLRU,
1978 false);
1979 /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */
1980
1981 /* Initialize our shared state struct */
1982 MultiXactState = ShmemInitStruct("Shared MultiXact State",
1984 &found);
1985 if (!IsUnderPostmaster)
1986 {
1987 Assert(!found);
1988
1989 /* Make sure we zero out the per-backend state */
1991 }
1992 else
1993 Assert(found);
1994
1995 /*
1996 * Set up array pointers.
1997 */
2000}
2001
2002/*
2003 * GUC check_hook for multixact_offset_buffers
2004 */
2005bool
2007{
 /*
  * Delegate entirely to check_slru_buffers, handing it this GUC's name and
  * the proposed value; its result is returned unchanged.
  */
2008 return check_slru_buffers("multixact_offset_buffers", newval);
2009}
2010
2011/*
2012 * GUC check_hook for multixact_member_buffers
2013 */
2014bool
2016{
 /*
  * Delegate entirely to check_slru_buffers, handing it this GUC's name and
  * the proposed value; its result is returned unchanged.
  */
2017 return check_slru_buffers("multixact_member_buffers", newval);
2018}
2019
2020/*
2021 * This func must be called ONCE on system install. It creates the initial
2022 * MultiXact segments. (The MultiXacts directories are assumed to have been
2023 * created by initdb, and MultiXactShmemInit must have been called already.)
2024 */
2025void
2027{
2028 /* Zero the initial pages and flush them to disk */
2031}
2032
2033/*
2034 * MaybeExtendOffsetSlru
2035 * Extend the offsets SLRU area, if necessary
2036 *
2037 * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might
2038 * contain files that are shorter than necessary; this would occur if the old
2039 * installation had used multixacts beyond the first page (files cannot be
2040 * copied, because the on-disk representation is different). pg_upgrade would
2041 * update pg_control to set the next offset value to be at that position, so
2042 * that tuples marked as locked by such MultiXacts would be seen as visible
2043 * without having to consult multixact. However, trying to create and use a
2044 * new MultiXactId would result in an error because the page on which the new
2045 * value would reside does not exist. This routine is in charge of creating
2046 * such pages.
2047 */
2048static void
2050{
2051 int64 pageno;
2052 LWLock *lock;
2053
2056
2058
2060 {
2061 int slotno;
2062
2063 /*
2064 * Fortunately for us, SimpleLruWritePage is already prepared to deal
2065 * with creating a new segment file even if the page we're writing is
2066 * not the first in it, so this is enough.
2067 */
2068 slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2070 }
2071
2072 LWLockRelease(lock);
2073}
2074
2075/*
2076 * This must be called ONCE during postmaster or standalone-backend startup.
2077 *
2078 * StartupXLOG has already established nextMXact/nextOffset by calling
2079 * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
2080 * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
2081 * replayed WAL.
2082 */
2083void
2085{
2088 int64 pageno;
2089
2090 /*
2091 * Initialize offset's idea of the latest page number.
2092 */
2093 pageno = MultiXactIdToOffsetPage(multi);
2094 pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2095 pageno);
2096
2097 /*
2098 * Initialize member's idea of the latest page number.
2099 */
2100 pageno = MXOffsetToMemberPage(offset);
2101 pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2102 pageno);
2103}
2104
2105/*
2106 * This must be called ONCE at the end of startup/recovery.
2107 */
2108void
2110{
2111 MultiXactId nextMXact;
2112 MultiXactOffset offset;
2113 MultiXactId oldestMXact;
2114 Oid oldestMXactDB;
2115 int64 pageno;
2116 int entryno;
2117 int flagsoff;
2118
2119 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2120 nextMXact = MultiXactState->nextMXact;
2121 offset = MultiXactState->nextOffset;
2122 oldestMXact = MultiXactState->oldestMultiXactId;
2123 oldestMXactDB = MultiXactState->oldestMultiXactDB;
2124 LWLockRelease(MultiXactGenLock);
2125
2126 /* Clean up offsets state */
2127
2128 /*
2129 * (Re-)Initialize our idea of the latest page number for offsets.
2130 */
2131 pageno = MultiXactIdToOffsetPage(nextMXact);
2132 pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2133 pageno);
2134
2135 /*
2136 * Set the offset of nextMXact on the offsets page. This is normally done
2137 * in RecordNewMultiXact() of the previous multixact, but let's be sure
2138 * the next page exists, if the nextMXact was reset with pg_resetwal for
2139 * example.
2140 *
2141 * Zero out the remainder of the page. See notes in TrimCLOG() for
2142 * background. Unlike CLOG, some WAL record covers every pg_multixact
2143 * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
2144 * xlog before data," nextMXact successors may carry obsolete, nonzero
2145 * offset values.
2146 */
2147 entryno = MultiXactIdToOffsetEntry(nextMXact);
2148 {
2149 int slotno;
2150 MultiXactOffset *offptr;
2152
2154 if (entryno == 0)
2155 slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2156 else
2157 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2158 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2159 offptr += entryno;
2160
2161 *offptr = offset;
2162 if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
2163 MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));
2164
2165 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
2166 LWLockRelease(lock);
2167 }
2168
2169 /*
2170 * And the same for members.
2171 *
2172 * (Re-)Initialize our idea of the latest page number for members.
2173 */
2174 pageno = MXOffsetToMemberPage(offset);
2175 pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2176 pageno);
2177
2178 /*
2179 * Zero out the remainder of the current members page. See notes in
2180 * TrimCLOG() for motivation.
2181 */
2182 flagsoff = MXOffsetToFlagsOffset(offset);
2183 if (flagsoff != 0)
2184 {
2185 int slotno;
2186 TransactionId *xidptr;
2187 int memberoff;
2189
2191 memberoff = MXOffsetToMemberOffset(offset);
2192 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
2193 xidptr = (TransactionId *)
2194 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
2195
2196 MemSet(xidptr, 0, BLCKSZ - memberoff);
2197
2198 /*
2199 * Note: we don't need to zero out the flag bits in the remaining
2200 * members of the current group, because they are always reset before
2201 * writing.
2202 */
2203
2204 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
2205 LWLockRelease(lock);
2206 }
2207
2208 /* signal that we're officially up */
2209 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2211 LWLockRelease(MultiXactGenLock);
2212
2213 /* Now compute how far away the next members wraparound is. */
2214 SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
2215}
2216
2217/*
2218 * Get the MultiXact data to save in a checkpoint record
2219 */
2220void
2222 MultiXactId *nextMulti,
2223 MultiXactOffset *nextMultiOffset,
2224 MultiXactId *oldestMulti,
2225 Oid *oldestMultiDB)
2226{
2227 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2228 *nextMulti = MultiXactState->nextMXact;
2229 *nextMultiOffset = MultiXactState->nextOffset;
2230 *oldestMulti = MultiXactState->oldestMultiXactId;
2231 *oldestMultiDB = MultiXactState->oldestMultiXactDB;
2232 LWLockRelease(MultiXactGenLock);
2233
2235 "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
2236 *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
2237}
2238
2239/*
2240 * Perform a checkpoint --- either during shutdown, or on-the-fly
2241 */
2242void
2244{
2245 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
2246
2247 /*
2248 * Write dirty MultiXact pages to disk. This may result in sync requests
2249 * queued for later handling by ProcessSyncRequests(), as part of the
2250 * checkpoint.
2251 */
2254
2255 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
2256}
2257
2258/*
2259 * Set the next-to-be-assigned MultiXactId and offset
2260 *
2261 * This is used when we can determine the correct next ID/offset exactly
2262 * from a checkpoint record. Although this is only called during bootstrap
2263 * and XLog replay, we take the lock in case any hot-standby backends are
2264 * examining the values.
2265 */
2266void
2268 MultiXactOffset nextMultiOffset)
2269{
2270 debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
2271 nextMulti, nextMultiOffset);
2272 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2273 MultiXactState->nextMXact = nextMulti;
2274 MultiXactState->nextOffset = nextMultiOffset;
2275 LWLockRelease(MultiXactGenLock);
2276
2277 /*
2278 * During a binary upgrade, make sure that the offsets SLRU is large
2279 * enough to contain the next value that would be created.
2280 *
2281 * We need to do this pretty early during the first startup in binary
2282 * upgrade mode: before StartupMultiXact() in fact, because this routine
2283 * is called even before that by StartupXLOG(). And we can't do it
2284 * earlier than at this point, because during that first call of this
2285 * routine we determine the MultiXactState->nextMXact value that
2286 * MaybeExtendOffsetSlru needs.
2287 */
2288 if (IsBinaryUpgrade)
2290}
2291
2292/*
2293 * Determine the last safe MultiXactId to allocate given the currently oldest
2294 * datminmxid (ie, the oldest MultiXactId that might exist in any database
2295 * of our cluster), and the OID of the (or a) database with that value.
2296 *
2297 * is_startup is true when we are just starting the cluster, false when we
2298 * are updating state in a running cluster. This only affects log messages.
2299 */
2300void
2301SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
2302 bool is_startup)
2303{
2304 MultiXactId multiVacLimit;
2305 MultiXactId multiWarnLimit;
2306 MultiXactId multiStopLimit;
2307 MultiXactId multiWrapLimit;
2308 MultiXactId curMulti;
2309 bool needs_offset_vacuum;
2310
2311 Assert(MultiXactIdIsValid(oldest_datminmxid));
2312
2313 /*
2314 * We pretend that a wrap will happen halfway through the multixact ID
2315 * space, but that's not really true, because multixacts wrap differently
2316 * from transaction IDs. Note that, separately from any concern about
2317 * multixact IDs wrapping, we must ensure that multixact members do not
2318 * wrap. Limits for that are set in SetOffsetVacuumLimit, not here.
2319 */
2320 multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
2321 if (multiWrapLimit < FirstMultiXactId)
2322 multiWrapLimit += FirstMultiXactId;
2323
2324 /*
2325 * We'll refuse to continue assigning MultiXactIds once we get within 3M
2326 * multi of data loss. See SetTransactionIdLimit.
2327 */
2328 multiStopLimit = multiWrapLimit - 3000000;
2329 if (multiStopLimit < FirstMultiXactId)
2330 multiStopLimit -= FirstMultiXactId;
2331
2332 /*
2333 * We'll start complaining loudly when we get within 40M multis of data
2334 * loss. This is kind of arbitrary, but if you let your gas gauge get
2335 * down to 2% of full, would you be looking for the next gas station? We
2336 * need to be fairly liberal about this number because there are lots of
2337 * scenarios where most transactions are done by automatic clients that
2338 * won't pay attention to warnings. (No, we're not gonna make this
2339 * configurable. If you know enough to configure it, you know enough to
2340 * not get in this kind of trouble in the first place.)
2341 */
2342 multiWarnLimit = multiWrapLimit - 40000000;
2343 if (multiWarnLimit < FirstMultiXactId)
2344 multiWarnLimit -= FirstMultiXactId;
2345
2346 /*
2347 * We'll start trying to force autovacuums when oldest_datminmxid gets to
2348 * be more than autovacuum_multixact_freeze_max_age mxids old.
2349 *
2350 * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
2351 * so that we don't have to worry about dealing with on-the-fly changes in
2352 * its value. See SetTransactionIdLimit.
2353 */
2354 multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age;
2355 if (multiVacLimit < FirstMultiXactId)
2356 multiVacLimit += FirstMultiXactId;
2357
2358 /* Grab lock for just long enough to set the new limit values */
2359 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2360 MultiXactState->oldestMultiXactId = oldest_datminmxid;
2361 MultiXactState->oldestMultiXactDB = oldest_datoid;
2362 MultiXactState->multiVacLimit = multiVacLimit;
2363 MultiXactState->multiWarnLimit = multiWarnLimit;
2364 MultiXactState->multiStopLimit = multiStopLimit;
2365 MultiXactState->multiWrapLimit = multiWrapLimit;
2366 curMulti = MultiXactState->nextMXact;
2367 LWLockRelease(MultiXactGenLock);
2368
2369 /* Log the info */
2371 (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u",
2372 multiWrapLimit, oldest_datoid)));
2373
2374 /*
2375 * Computing the actual limits is only possible once the data directory is
2376 * in a consistent state. There's no need to compute the limits while
2377 * still replaying WAL - no decisions about new multis are made even
2378 * though multixact creations might be replayed. So we'll only do further
2379 * checks after TrimMultiXact() has been called.
2380 */
2382 return;
2383
2385
2386 /* Set limits for offset vacuum. */
2387 needs_offset_vacuum = SetOffsetVacuumLimit(is_startup);
2388
2389 /*
2390 * If past the autovacuum force point, immediately signal an autovac
2391 * request. The reason for this is that autovac only processes one
2392 * database per invocation. Once it's finished cleaning up the oldest
2393 * database, it'll call here, and we'll signal the postmaster to start
2394 * another iteration immediately if there are still any old databases.
2395 */
2396 if ((MultiXactIdPrecedes(multiVacLimit, curMulti) ||
2397 needs_offset_vacuum) && IsUnderPostmaster)
2399
2400 /* Give an immediate warning if past the wrap warn point */
2401 if (MultiXactIdPrecedes(multiWarnLimit, curMulti))
2402 {
2403 char *oldest_datname;
2404
2405 /*
2406 * We can be called when not inside a transaction, for example during
2407 * StartupXLOG(). In such a case we cannot do database access, so we
2408 * must just report the oldest DB's OID.
2409 *
2410 * Note: it's also possible that get_database_name fails and returns
2411 * NULL, for example because the database just got dropped. We'll
2412 * still warn, even though the warning might now be unnecessary.
2413 */
2414 if (IsTransactionState())
2415 oldest_datname = get_database_name(oldest_datoid);
2416 else
2417 oldest_datname = NULL;
2418
2419 if (oldest_datname)
2421 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
2422 "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
2423 multiWrapLimit - curMulti,
2424 oldest_datname,
2425 multiWrapLimit - curMulti),
2426 errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2427 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2428 else
2430 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
2431 "database with OID %u must be vacuumed before %u more MultiXactIds are used",
2432 multiWrapLimit - curMulti,
2433 oldest_datoid,
2434 multiWrapLimit - curMulti),
2435 errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2436 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2437 }
2438}
2439
2440/*
2441 * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
2442 * and similarly nextOffset is at least minMultiOffset.
2443 *
2444 * This is used when we can determine minimum safe values from an XLog
2445 * record (either an on-line checkpoint or an mxact creation log entry).
2446 * Although this is only called during XLog replay, we take the lock in case
2447 * any hot-standby backends are examining the values.
2448 */
2449void
2451 MultiXactOffset minMultiOffset)
2452{
2453 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2455 {
2456 debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
2457 MultiXactState->nextMXact = minMulti;
2458 }
2460 {
2461 debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
2462 minMultiOffset);
2463 MultiXactState->nextOffset = minMultiOffset;
2464 }
2465 LWLockRelease(MultiXactGenLock);
2466}
2467
2468/*
2469 * Update our oldestMultiXactId value, but only if it's more recent than what
2470 * we had.
2471 *
2472 * This may only be called during WAL replay.
2473 */
2474void
2475MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
2476{
2478
2480 SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false);
2481}
2482
2483/*
2484 * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
2485 *
2486 * NB: this is called while holding MultiXactGenLock. We want it to be very
2487 * fast most of the time; even when it's not so fast, no actual I/O need
2488 * happen unless we're forced to write out a dirty log or xlog page to make
2489 * room in shared memory.
2490 */
2491static void
2493{
2494 int64 pageno;
2495 LWLock *lock;
2496
2497 /*
2498 * No work except at first MultiXactId of a page. But beware: just after
2499 * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
2500 */
2501 if (MultiXactIdToOffsetEntry(multi) != 0 &&
2502 multi != FirstMultiXactId)
2503 return;
2504
2505 pageno = MultiXactIdToOffsetPage(multi);
2507
2509
2510 /* Zero the page and make a WAL entry about it */
2513 pageno);
2514
2515 LWLockRelease(lock);
2516}
2517
2518/*
2519 * Make sure that MultiXactMember has room for the members of a newly-
2520 * allocated MultiXactId.
2521 *
2522 * Like the above routine, this is called while holding MultiXactGenLock;
2523 * same comments apply.
2524 */
2525static void
2527{
2528 /*
2529 * It's possible that the members span more than one page of the members
2530 * file, so we loop to ensure we consider each page. The coding is not
2531 * optimal if the members span several pages, but that seems unusual
2532 * enough to not worry much about.
2533 */
2534 while (nmembers > 0)
2535 {
2536 int flagsoff;
2537 int flagsbit;
2539
2540 /*
2541 * Only zero when at first entry of a page.
2542 */
2543 flagsoff = MXOffsetToFlagsOffset(offset);
2544 flagsbit = MXOffsetToFlagsBitShift(offset);
2545 if (flagsoff == 0 && flagsbit == 0)
2546 {
2547 int64 pageno;
2548 LWLock *lock;
2549
2550 pageno = MXOffsetToMemberPage(offset);
2552
2554
2555 /* Zero the page and make a WAL entry about it */
2557 XLogSimpleInsertInt64(RM_MULTIXACT_ID,
2559
2560 LWLockRelease(lock);
2561 }
2562
2563 /*
2564 * Compute the number of items till end of current page. Careful: if
2565 * addition of unsigned ints wraps around, we're at the last page of
2566 * the last segment; since that page holds a different number of items
2567 * than other pages, we need to do it differently.
2568 */
2569 if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset)
2570 {
2571 /*
2572 * This is the last page of the last segment; we can compute the
2573 * number of items left to allocate in it without modulo
2574 * arithmetic.
2575 */
2576 difference = MaxMultiXactOffset - offset + 1;
2577 }
2578 else
2580
2581 /*
2582 * Advance to next page, taking care to properly handle the wraparound
2583 * case. OK if nmembers goes negative.
2584 */
2585 nmembers -= difference;
2586 offset += difference;
2587 }
2588}
2589
2590/*
2591 * GetOldestMultiXactId
2592 *
2593 * Return the oldest MultiXactId that's possibly still seen as live by
2594 * any running transaction. Older ones might still exist on disk, but they no
2595 * longer have any running member transaction.
2596 *
2597 * It's not safe to truncate MultiXact SLRU segments on the value returned by
2598 * this function; however, it can be set as the new relminmxid for any table
2599 * that VACUUM knows has no remaining MXIDs < the same value. It is only safe
2600 * to truncate SLRUs when no table can possibly still have a referencing MXID.
2601 */
2604{
2605 MultiXactId oldestMXact;
2606 MultiXactId nextMXact;
2607 int i;
2608
2609 /*
2610 * This is the oldest valid value among all the OldestMemberMXactId[] and
2611 * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
2612 */
2613 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2614
2615 /*
2616 * We have to beware of the possibility that nextMXact is in the
2617 * wrapped-around state. We don't fix the counter itself here, but we
2618 * must be sure to use a valid value in our calculation.
2619 */
2620 nextMXact = MultiXactState->nextMXact;
2621 if (nextMXact < FirstMultiXactId)
2622 nextMXact = FirstMultiXactId;
2623
 /*
  * Start from nextMXact as the running minimum, then lower it for every
  * slot whose member or visible entry is valid and precedes the current
  * minimum according to MultiXactIdPrecedes. Both arrays are scanned over
  * the same MaxOldestSlot index range, and the whole scan happens while
  * MultiXactGenLock is held in shared mode.
  */
2624 oldestMXact = nextMXact;
2625 for (i = 0; i < MaxOldestSlot; i++)
2626 {
2627 MultiXactId thisoldest;
2628
2629 thisoldest = OldestMemberMXactId[i];
2630 if (MultiXactIdIsValid(thisoldest) &&
2631 MultiXactIdPrecedes(thisoldest, oldestMXact))
2632 oldestMXact = thisoldest;
2633 thisoldest = OldestVisibleMXactId[i];
2634 if (MultiXactIdIsValid(thisoldest) &&
2635 MultiXactIdPrecedes(thisoldest, oldestMXact))
2636 oldestMXact = thisoldest;
2637 }
2638
2639 LWLockRelease(MultiXactGenLock);
2640
 /* The result is a local copy taken under the lock, returned after release. */
2641 return oldestMXact;
2642}
2643
2644/*
2645 * Determine how aggressively we need to vacuum in order to prevent member
2646 * wraparound.
2647 *
2648 * To do so determine what's the oldest member offset and install the limit
2649 * info in MultiXactState, where it can be used to prevent overrun of old data
2650 * in the members SLRU area.
2651 *
2652 * The return value is true if emergency autovacuum is required and false
2653 * otherwise.
2654 */
2655static bool
2656SetOffsetVacuumLimit(bool is_startup)
2657{
2658 MultiXactId oldestMultiXactId;
2659 MultiXactId nextMXact;
2660 MultiXactOffset oldestOffset = 0; /* placate compiler */
2661 MultiXactOffset prevOldestOffset;
2662 MultiXactOffset nextOffset;
2663 bool oldestOffsetKnown = false;
2664 bool prevOldestOffsetKnown;
2665 MultiXactOffset offsetStopLimit = 0;
2666 MultiXactOffset prevOffsetStopLimit;
2667
2668 /*
2669 * NB: Have to prevent concurrent truncation, we might otherwise try to
2670 * lookup an oldestMulti that's concurrently getting truncated away.
2671 */
2672 LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
2673
2674 /* Read relevant fields from shared memory. */
2675 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2676 oldestMultiXactId = MultiXactState->oldestMultiXactId;
2677 nextMXact = MultiXactState->nextMXact;
2678 nextOffset = MultiXactState->nextOffset;
2679 prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2680 prevOldestOffset = MultiXactState->oldestOffset;
2681 prevOffsetStopLimit = MultiXactState->offsetStopLimit;
2683 LWLockRelease(MultiXactGenLock);
2684
2685 /*
2686 * Determine the offset of the oldest multixact. Normally, we can read
2687 * the offset from the multixact itself, but there's an important special
2688 * case: if there are no multixacts in existence at all, oldestMXact
2689 * obviously can't point to one. It will instead point to the multixact
2690 * ID that will be assigned the next time one is needed.
2691 */
2692 if (oldestMultiXactId == nextMXact)
2693 {
2694 /*
2695 * When the next multixact gets created, it will be stored at the next
2696 * offset.
2697 */
2698 oldestOffset = nextOffset;
2699 oldestOffsetKnown = true;
2700 }
2701 else
2702 {
2703 /*
2704 * Figure out where the oldest existing multixact's offsets are
2705 * stored. Due to bugs in early releases of PostgreSQL 9.3.X and 9.4.X,
2706 * the supposedly-earliest multixact might not really exist. We are
2707 * careful not to fail in that case.
2708 */
2709 oldestOffsetKnown =
2710 find_multixact_start(oldestMultiXactId, &oldestOffset);
2711
2712 if (oldestOffsetKnown)
2714 (errmsg_internal("oldest MultiXactId member is at offset %u",
2715 oldestOffset)));
2716 else
2717 ereport(LOG,
2718 (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
2719 oldestMultiXactId)));
2720 }
2721
2722 LWLockRelease(MultiXactTruncationLock);
2723
2724 /*
2725 * If we can, compute limits (and install them in MultiXactState) to prevent
2726 * overrun of old data in the members SLRU area. We can only do so if the
2727 * oldest offset is known though.
2728 */
2729 if (oldestOffsetKnown)
2730 {
2731 /* move back to start of the corresponding segment */
2732 offsetStopLimit = oldestOffset - (oldestOffset %
2734
2735 /* always leave one segment before the wraparound point */
2737
2738 if (!prevOldestOffsetKnown && !is_startup)
2739 ereport(LOG,
2740 (errmsg("MultiXact member wraparound protections are now enabled")));
2741
2743 (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u",
2744 offsetStopLimit, oldestMultiXactId)));
2745 }
2746 else if (prevOldestOffsetKnown)
2747 {
2748 /*
2749 * If we failed to get the oldest offset this time, but we have a
2750 * value from a previous pass through this function, use the old
2751 * values rather than automatically forcing an emergency autovacuum
2752 * cycle again.
2753 */
2754 oldestOffset = prevOldestOffset;
2755 oldestOffsetKnown = true;
2756 offsetStopLimit = prevOffsetStopLimit;
2757 }
2758
2759 /* Install the computed values */
2760 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2761 MultiXactState->oldestOffset = oldestOffset;
2762 MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
2763 MultiXactState->offsetStopLimit = offsetStopLimit;
2764 LWLockRelease(MultiXactGenLock);
2765
2766 /*
2767 * Do we need an emergency autovacuum? If we're not sure, assume yes.
2768 */
2769 return !oldestOffsetKnown ||
2770 (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
2771}
2772
2773/*
2774 * Return whether adding "distance" to "start" would move past "boundary".
2775 *
2776 * We use this to determine whether the addition is "wrapping around" the
2777 * boundary point, hence the name. The reason we don't want to use the regular
2778 * 2^31-modulo arithmetic here is that we want to be able to use the whole of
2779 * the 2^32-1 space here, allowing for more multixacts than would fit
2780 * otherwise.
2781 */
2782static bool
2784 uint32 distance)
2785{
2786 MultiXactOffset finish;
2787
2788 /*
2789 * Note that offset number 0 is not used (see GetMultiXactIdMembers), so
2790 * if the addition wraps around the UINT_MAX boundary, skip that value.
2791 */
2792 finish = start + distance;
2793 if (finish < start)
2794 finish++;
2795
2796 /*-----------------------------------------------------------------------
2797 * When the boundary is numerically greater than the starting point, any
2798 * value numerically between the two is not wrapped:
2799 *
2800 * <----S----B---->
2801 * [---) = F wrapped past B (and UINT_MAX)
2802 * [---) = F not wrapped
2803 * [----] = F wrapped past B
2804 *
2805 * When the boundary is numerically less than the starting point (i.e. the
2806 * UINT_MAX wraparound occurs somewhere in between) then all values in
2807 * between are wrapped:
2808 *
2809 * <----B----S---->
2810 * [---) = F not wrapped past B (but wrapped past UINT_MAX)
2811 * [---) = F wrapped past B (and UINT_MAX)
2812 * [----] = F not wrapped
2813 *-----------------------------------------------------------------------
2814 */
 /*
  * start < boundary: report a wrap when finish landed at or beyond the
  * boundary, or when the addition overflowed (finish ended up below start).
  */
2815 if (start < boundary)
2816 return finish >= boundary || finish < start;
 /*
  * start >= boundary: report a wrap only when finish lies in the half-open
  * range [boundary, start), i.e. it overflowed and then reached the boundary.
  */
2817 else
2818 return finish >= boundary && finish < start;
2819}
2820
2821/*
2822 * Find the starting offset of the given MultiXactId.
2823 *
2824 * Returns false if the file containing the multi does not exist on disk.
2825 * Otherwise, returns true and sets *result to the starting member offset.
2826 *
2827 * This function does not prevent concurrent truncation, so if that's
2828 * required, the caller has to protect against that.
2829 */
2830static bool
2832{
2833 MultiXactOffset offset;
2834 int64 pageno;
2835 int entryno;
2836 int slotno;
2837 MultiXactOffset *offptr;
2838
2840
2841 pageno = MultiXactIdToOffsetPage(multi);
2842 entryno = MultiXactIdToOffsetEntry(multi);
2843
2844 /*
2845 * Write out dirty data, so PhysicalPageExists can work correctly.
2846 */
2849
2851 return false;
2852
2853 /* lock is acquired by SimpleLruReadPage_ReadOnly */
2854 slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
2855 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2856 offptr += entryno;
2857 offset = *offptr;
2859
2860 *result = offset;
2861 return true;
2862}
2863
2864/*
2865 * GetMultiXactInfo
2866 *
2867 * Returns information about the current MultiXact state, as of:
2868 * multixacts: Number of MultiXacts (nextMultiXactId - oldestMultiXactId)
2869 * members: Number of member entries (nextOffset - oldestOffset)
2870 * oldestMultiXactId: Oldest MultiXact ID still in use
2871 * oldestOffset: Oldest offset still in use
2872 *
2873 * Returns false if unable to determine, the oldest offset being unknown.
2874 */
2875bool
2877 MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset)
2878{
2879 MultiXactOffset nextOffset;
2880 MultiXactId nextMultiXactId;
2881 bool oldestOffsetKnown;
2882
 /*
  * Read every needed MultiXactState field during a single shared hold of
  * MultiXactGenLock. Two of the output parameters are written directly
  * here and may be overwritten below if the oldest offset is unknown.
  */
2883 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2884 nextOffset = MultiXactState->nextOffset;
2885 *oldestMultiXactId = MultiXactState->oldestMultiXactId;
2886 nextMultiXactId = MultiXactState->nextMXact;
2887 *oldestOffset = MultiXactState->oldestOffset;
2888 oldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2889 LWLockRelease(MultiXactGenLock);
2890
 /*
  * Failure path: all four outputs are set to zero/invalid, so a caller
  * that ignores the false return does not see partially filled values.
  */
2891 if (!oldestOffsetKnown)
2892 {
2893 *members = 0;
2894 *multixacts = 0;
2895 *oldestMultiXactId = InvalidMultiXactId;
2896 *oldestOffset = 0;
2897 return false;
2898 }
2899
 /* Both counts are plain differences between the "next" and "oldest" values read above. */
2900 *members = nextOffset - *oldestOffset;
2901 *multixacts = nextMultiXactId - *oldestMultiXactId;
2902 return true;
2903}
2904
2905/*
2906 * Multixact members can be removed once the multixacts that refer to them
2907 * are older than every datminmxid. autovacuum_multixact_freeze_max_age and
2908 * vacuum_multixact_freeze_table_age work together to make sure we never have
2909 * too many multixacts; we hope that, at least under normal circumstances,
2910 * this will also be sufficient to keep us from using too many offsets.
2911 * However, if the average multixact has many members, we might exhaust the
2912 * members space while still using few enough members that these limits fail
2913 * to trigger relminmxid advancement by VACUUM. At that point, we'd have no
2914 * choice but to start failing multixact-creating operations with an error.
2915 *
2916 * To prevent that, if more than a threshold portion of the members space is
2917 * used, we effectively reduce autovacuum_multixact_freeze_max_age
2918 * to a value just less than the number of multixacts in use. We hope that
2919 * this will quickly trigger autovacuuming on the table or tables with the
2920 * oldest relminmxid, thus allowing datminmxid values to advance and removing
2921 * some members.
2922 *
2923 * As the fraction of the member space currently in use grows, we become
2924 * more aggressive in clamping this value. That not only causes autovacuum
2925 * to ramp up, but also makes any manual vacuums the user issues more
2926 * aggressive. This happens because vacuum_get_cutoffs() will clamp the
2927 * freeze table and the minimum freeze age cutoffs based on the effective
2928 * autovacuum_multixact_freeze_max_age this function returns. In the worst
2929 * case, we'll clamp the freeze_max_age to zero, and every vacuum of any
2930 * table will freeze every multixact.
2931 */
2932int
2934{
2935 MultiXactOffset members;
2936 uint32 multixacts;
2937 uint32 victim_multixacts;
2938 double fraction;
2939 int result;
2940 MultiXactId oldestMultiXactId;
2941 MultiXactOffset oldestOffset;
2942
2943 /* If we can't determine member space utilization, assume the worst. */
2944 if (!GetMultiXactInfo(&multixacts, &members, &oldestMultiXactId, &oldestOffset))
2945 return 0;
2946
2947 /* If member space utilization is low, no special action is required. */
2948 if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD)
2950
2951 /*
2952 * Compute a target for relminmxid advancement. The number of multixacts
2953 * we try to eliminate from the system is based on how far we are past
2954 * MULTIXACT_MEMBER_SAFE_THRESHOLD.
2955 */
2956 fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) /
2958 victim_multixacts = multixacts * fraction;
2959
2960 /* fraction could be > 1.0, but lowest possible freeze age is zero */
2961 if (victim_multixacts > multixacts)
2962 return 0;
2963 result = multixacts - victim_multixacts;
2964
2965 /*
2966 * Clamp to autovacuum_multixact_freeze_max_age, so that we never make
2967 * autovacuum less aggressive than it would otherwise be.
2968 */
2970}
2971
2972typedef struct mxtruncinfo
2973{
2976
2977/*
2978 * SlruScanDirectory callback
2979 * This callback determines the earliest existing page number.
2980 */
2981static bool
2983{
2984 mxtruncinfo *trunc = (mxtruncinfo *) data;
2985
2986 if (trunc->earliestExistingPage == -1 ||
2987 ctl->PagePrecedes(segpage, trunc->earliestExistingPage))
2988 {
2989 trunc->earliestExistingPage = segpage;
2990 }
2991
2992 return false; /* keep going */
2993}
2994
2995
2996/*
2997 * Delete members segments [oldest, newOldest)
2998 *
2999 * The members SLRU can, in contrast to the offsets one, be filled to almost
3000 * the full range at once. This means SimpleLruTruncate() can't trivially be
3001 * used - instead the to-be-deleted range is computed using the offsets
3002 * SLRU. Cf. TruncateMultiXact().
3003 */
3004static void
3006{
3008 int64 startsegment = MXOffsetToMemberSegment(oldestOffset);
3009 int64 endsegment = MXOffsetToMemberSegment(newOldestOffset);
3010 int64 segment = startsegment;
3011
3012 /*
3013 * Delete all the segments but the last one. The last segment can still
3014 * contain, possibly partially, valid data.
3015 */
3016 while (segment != endsegment)
3017 {
3018 elog(DEBUG2, "truncating multixact members segment %" PRIx64,
3019 segment);
3021
3022 /* move to next segment, handling wraparound correctly */
3023 if (segment == maxsegment)
3024 segment = 0;
3025 else
3026 segment += 1;
3027 }
3028}
3029
3030/*
3031 * Delete offsets segments [oldest, newOldest)
3032 */
3033static void
3035{
3036 /*
3037 * We step back one multixact to avoid passing a cutoff page that hasn't
3038 * been created yet in the rare case that oldestMulti would be the first
3039 * item on a page and oldestMulti == nextMulti. In that case, if we
3040 * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
3041 * detection.
3042 */
3045}
3046
3047/*
3048 * Remove all MultiXactOffset and MultiXactMember segments before the oldest
3049 * ones still of interest.
3050 *
3051 * This is only called on a primary as part of vacuum (via
3052 * vac_truncate_clog()). During recovery truncation is done by replaying
3053 * truncation WAL records logged here.
3054 *
3055 * newOldestMulti is the oldest currently required multixact, newOldestMultiDB
3056 * is one of the databases preventing newOldestMulti from increasing.
3057 */
3058void
3059TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
3060{
3061 MultiXactId oldestMulti;
3062 MultiXactId nextMulti;
3063 MultiXactOffset newOldestOffset;
3064 MultiXactOffset oldestOffset;
3065 MultiXactOffset nextOffset;
3066 mxtruncinfo trunc;
3067 MultiXactId earliest;
3068
3071
3072 /*
3073 * We can only allow one truncation to happen at once. Otherwise parts of
3074 * members might vanish while we're doing lookups or similar. There's no
3075 * need to have an interlock with creating new multis or such, since those
3076 * are constrained by the limits (which only grow, never shrink).
3077 */
3078 LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3079
3080 LWLockAcquire(MultiXactGenLock, LW_SHARED);
3081 nextMulti = MultiXactState->nextMXact;
3082 nextOffset = MultiXactState->nextOffset;
3083 oldestMulti = MultiXactState->oldestMultiXactId;
3084 LWLockRelease(MultiXactGenLock);
3085 Assert(MultiXactIdIsValid(oldestMulti));
3086
3087 /*
3088 * Make sure to only attempt truncation if there are values to truncate
3089 * away. In normal processing values shouldn't go backwards, but there are
3090 * some corner cases (due to bugs) where that's possible.
3091 */
3092 if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
3093 {
3094 LWLockRelease(MultiXactTruncationLock);
3095 return;
3096 }
3097
3098 /*
3099 * Note we can't just plow ahead with the truncation; it's possible that
3100 * there are no segments to truncate, which is a problem because we are
3101 * going to attempt to read the offsets page to determine where to
3102 * truncate the members SLRU. So we first scan the directory to determine
3103 * the earliest offsets page number that we can read without error.
3104 *
3105 * When nextMXact is less than one segment away from multiWrapLimit,
3106 * SlruScanDirCbFindEarliest can find some early segment other than the
3107 * actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST)
3108 * returns false, because not all pairs of entries have the same answer.)
3109 * That can also arise when an earlier truncation attempt failed unlink()
3110 * or returned early from this function. The only consequence is
3111 * returning early, which wastes space that we could have liberated.
3112 *
3113 * NB: It's also possible that the page that oldestMulti is on has already
3114 * been truncated away, and we crashed before updating oldestMulti.
3115 */
3116 trunc.earliestExistingPage = -1;
3119 if (earliest < FirstMultiXactId)
3120 earliest = FirstMultiXactId;
3121
3122 /* If there's nothing to remove, we can bail out early. */
3123 if (MultiXactIdPrecedes(oldestMulti, earliest))
3124 {
3125 LWLockRelease(MultiXactTruncationLock);
3126 return;
3127 }
3128
3129 /*
3130 * First, compute the safe truncation point for MultiXactMember. This is
3131 * the starting offset of the oldest multixact.
3132 *
3133 * Hopefully, find_multixact_start will always work here, because we've
3134 * already checked that it doesn't precede the earliest MultiXact on disk.
3135 * But if it fails, don't truncate anything, and log a message.
3136 */
3137 if (oldestMulti == nextMulti)
3138 {
3139 /* there are NO MultiXacts */
3140 oldestOffset = nextOffset;
3141 }
3142 else if (!find_multixact_start(oldestMulti, &oldestOffset))
3143 {
3144 ereport(LOG,
3145 (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation",
3146 oldestMulti, earliest)));
3147 LWLockRelease(MultiXactTruncationLock);
3148 return;
3149 }
3150
3151 /*
3152 * Second, compute the point up to which to truncate. Look up the
3153 * member offset corresponding to newOldestMulti for that.
3154 */
3155 if (newOldestMulti == nextMulti)
3156 {
3157 /* there are NO MultiXacts */
3158 newOldestOffset = nextOffset;
3159 }
3160 else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
3161 {
3162 ereport(LOG,
3163 (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
3164 newOldestMulti)));
3165 LWLockRelease(MultiXactTruncationLock);
3166 return;
3167 }
3168
3169 elog(DEBUG1, "performing multixact truncation: "
3170 "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), "
3171 "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")",
3172 oldestMulti, newOldestMulti,
3173 MultiXactIdToOffsetSegment(oldestMulti),
3174 MultiXactIdToOffsetSegment(newOldestMulti),
3175 oldestOffset, newOldestOffset,
3176 MXOffsetToMemberSegment(oldestOffset),
3177 MXOffsetToMemberSegment(newOldestOffset));
3178
3179 /*
3180 * Do truncation, and the WAL logging of the truncation, in a critical
3181 * section. That way offsets/members cannot get out of sync anymore, i.e.
3182 * once consistent the newOldestMulti will always exist in members, even
3183 * if we crashed in the wrong moment.
3184 */
3186
3187 /*
3188 * Prevent checkpoints from being scheduled concurrently. This is critical
3189 * because otherwise a truncation record might not be replayed after a
3190 * crash/basebackup, even though the state of the data directory would
3191 * require it.
3192 */
3195
3196 /* WAL log truncation */
3197 WriteMTruncateXlogRec(newOldestMultiDB,
3198 oldestMulti, newOldestMulti,
3199 oldestOffset, newOldestOffset);
3200
3201 /*
3202 * Update in-memory limits before performing the truncation, while inside
3203 * the critical section: Have to do it before truncation, to prevent
3204 * concurrent lookups of those values. Has to be inside the critical
3205 * section as otherwise a future call to this function would error out,
3206 * while looking up the oldest member in offsets, if our caller crashes
3207 * before updating the limits.
3208 */
3209 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
3210 MultiXactState->oldestMultiXactId = newOldestMulti;
3211 MultiXactState->oldestMultiXactDB = newOldestMultiDB;
3212 LWLockRelease(MultiXactGenLock);
3213
3214 /* First truncate members */
3215 PerformMembersTruncation(oldestOffset, newOldestOffset);
3216
3217 /* Then offsets */
3218 PerformOffsetsTruncation(oldestMulti, newOldestMulti);
3219
3220 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
3221
3223 LWLockRelease(MultiXactTruncationLock);
3224}
3225
3226/*
3227 * Decide whether a MultiXactOffset page number is "older" for truncation
3228 * purposes. Analogous to CLOGPagePrecedes().
3229 *
3230 * Offsetting the values is optional, because MultiXactIdPrecedes() has
3231 * translational symmetry.
3232 */
3233static bool
3235{
3236 MultiXactId multi1;
3237 MultiXactId multi2;
3238
3239 multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
3240 multi1 += FirstMultiXactId + 1;
3241 multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
3242 multi2 += FirstMultiXactId + 1;
3243
3244 return (MultiXactIdPrecedes(multi1, multi2) &&
3245 MultiXactIdPrecedes(multi1,
3246 multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1));
3247}
3248
3249/*
3250 * Decide whether a MultiXactMember page number is "older" for truncation
3251 * purposes. There is no "invalid offset number" so use the numbers verbatim.
3252 */
3253static bool
3255{
3256 MultiXactOffset offset1;
3257 MultiXactOffset offset2;
3258
3259 offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
3260 offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
3261
3262 return (MultiXactOffsetPrecedes(offset1, offset2) &&
3264 offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1));
3265}
3266
3267/*
3268 * Decide which of two MultiXactIds is earlier.
3269 *
3270 * XXX do we need to do something special for InvalidMultiXactId?
3271 * (Doesn't look like it.)
3272 */
3273bool
3275{
3276 int32 diff = (int32) (multi1 - multi2);
3277
3278 return (diff < 0);
3279}
3280
3281/*
3282 * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
3283 *
3284 * XXX do we need to do something special for InvalidMultiXactId?
3285 * (Doesn't look like it.)
3286 */
3287bool
3289{
3290 int32 diff = (int32) (multi1 - multi2);
3291
3292 return (diff <= 0);
3293}
3294
3295
3296/*
3297 * Decide which of two offsets is earlier.
3298 */
3299static bool
3301{
3302 int32 diff = (int32) (offset1 - offset2);
3303
3304 return (diff < 0);
3305}
3306
3307/*
3308 * Write a TRUNCATE xlog record
3309 *
3310 * We must flush the xlog record to disk before returning --- see notes in
3311 * TruncateCLOG().
3312 */
3313static void
3315 MultiXactId startTruncOff, MultiXactId endTruncOff,
3316 MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
3317{
3318 XLogRecPtr recptr;
3320
3321 xlrec.oldestMultiDB = oldestMultiDB;
3322
3323 xlrec.startTruncOff = startTruncOff;
3324 xlrec.endTruncOff = endTruncOff;
3325
3326 xlrec.startTruncMemb = startTruncMemb;
3327 xlrec.endTruncMemb = endTruncMemb;
3328
3331 recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID);
3332 XLogFlush(recptr);
3333}
3334
3335/*
3336 * MULTIXACT resource manager's routines
3337 */
3338void
3340{
3341 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
3342
3343 /* Backup blocks are not used in multixact records */
3345
3346 if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
3347 {
3348 int64 pageno;
3349
3350 memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
3352 }
3353 else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
3354 {
3355 int64 pageno;
3356
3357 memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
3359 }
3360 else if (info == XLOG_MULTIXACT_CREATE_ID)
3361 {
3362 xl_multixact_create *xlrec =
3364 TransactionId max_xid;
3365 int i;
3366
3367 /* Store the data back into the SLRU files */
3368 RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
3369 xlrec->members);
3370
3371 /* Make sure nextMXact/nextOffset are beyond what this record has */
3372 MultiXactAdvanceNextMXact(xlrec->mid + 1,
3373 xlrec->moff + xlrec->nmembers);
3374
3375 /*
3376 * Make sure nextXid is beyond any XID mentioned in the record. This
3377 * should be unnecessary, since any XID found here ought to have other
3378 * evidence in the XLOG, but let's be safe.
3379 */
3380 max_xid = XLogRecGetXid(record);
3381 for (i = 0; i < xlrec->nmembers; i++)
3382 {
3383 if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
3384 max_xid = xlrec->members[i].xid;
3385 }
3386
3388 }
3389 else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
3390 {
3392 int64 pageno;
3393
3394 memcpy(&xlrec, XLogRecGetData(record),
3396
3397 elog(DEBUG1, "replaying multixact truncation: "
3398 "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), "
3399 "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")",
3400 xlrec.startTruncOff, xlrec.endTruncOff,
3403 xlrec.startTruncMemb, xlrec.endTruncMemb,
3406
3407 /* should not be required, but more than cheap enough */
3408 LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3409
3410 /*
3411 * Advance the horizon values, so they're current at the end of
3412 * recovery.
3413 */
3414 SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false);
3415
3417
3418 /*
3419 * During XLOG replay, latest_page_number isn't necessarily set up
3420 * yet; insert a suitable value to bypass the sanity test in
3421 * SimpleLruTruncate.
3422 */
3423 pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff);
3424 pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
3425 pageno);
3427
3428 LWLockRelease(MultiXactTruncationLock);
3429 }
3430 else
3431 elog(PANIC, "multixact_redo: unknown op code %u", info);
3432}
3433
3434/*
3435 * Entrypoint for sync.c to sync offsets files.
3436 */
3437int
3438multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
3439{
3440 return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path);
3441}
3442
3443/*
3444 * Entrypoint for sync.c to sync members files.
3445 */
3446int
3447multixactmemberssyncfiletag(const FileTag *ftag, char *path)
3448{
3449 return SlruSyncFileTag(MultiXactMemberCtl, ftag, path);
3450}
static void pg_atomic_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition: atomics.h:483
int autovacuum_multixact_freeze_max_age
Definition: autovacuum.c:130
static int32 next
Definition: blutils.c:224
#define Min(x, y)
Definition: c.h:1006
uint8_t uint8
Definition: c.h:539
int64_t int64
Definition: c.h:538
uint32 MultiXactOffset
Definition: c.h:672
TransactionId MultiXactId
Definition: c.h:670
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:475
int32_t int32
Definition: c.h:537
uint16_t uint16
Definition: c.h:540
uint32_t uint32
Definition: c.h:541
#define MemSet(start, val, len)
Definition: c.h:1022
uint32 TransactionId
Definition: c.h:660
size_t Size
Definition: c.h:613
int errmsg_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1193
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
int errdetail_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1308
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define LOG
Definition: elog.h:31
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
Datum difference(PG_FUNCTION_ARGS)
int multixact_offset_buffers
Definition: globals.c:163
bool IsBinaryUpgrade
Definition: globals.c:121
ProcNumber MyProcNumber
Definition: globals.c:90
bool IsUnderPostmaster
Definition: globals.c:120
int multixact_member_buffers
Definition: globals.c:162
#define newval
GucSource
Definition: guc.h:112
Assert(PointerIsAligned(start, uint64))
return str start
const char * str
#define dclist_container(type, membername, ptr)
Definition: ilist.h:947
static uint32 dclist_count(const dclist_head *head)
Definition: ilist.h:932
static void dclist_move_head(dclist_head *head, dlist_node *node)
Definition: ilist.h:808
static dlist_node * dclist_tail_node(dclist_head *head)
Definition: ilist.h:920
static void dclist_delete_from(dclist_head *head, dlist_node *node)
Definition: ilist.h:763
#define DCLIST_STATIC_INIT(name)
Definition: ilist.h:282
static void dclist_push_head(dclist_head *head, dlist_node *node)
Definition: ilist.h:693
static void dclist_init(dclist_head *head)
Definition: ilist.h:671
#define dclist_foreach(iter, lhead)
Definition: ilist.h:970
#define INJECTION_POINT_CACHED(name, arg)
#define INJECTION_POINT_LOAD(name)
int j
Definition: isn.c:78
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
char * get_database_name(Oid dbid)
Definition: lsyscache.c:1259
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1174
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1894
@ LW_SHARED
Definition: lwlock.h:113
@ LW_EXCLUSIVE
Definition: lwlock.h:112
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition: mcxt.c:1746
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1229
MemoryContext TopTransactionContext
Definition: mcxt.c:171
void pfree(void *pointer)
Definition: mcxt.c:1594
MemoryContext TopMemoryContext
Definition: mcxt.c:166
void * palloc(Size size)
Definition: mcxt.c:1365
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_SMALL_SIZES
Definition: memutils.h:170
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
static void WriteMTruncateXlogRec(Oid oldestMultiDB, MultiXactId startTruncOff, MultiXactId endTruncOff, MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
Definition: multixact.c:3314
static MultiXactId PreviousMultiXactId(MultiXactId multi)
Definition: multixact.c:216
static SlruCtlData MultiXactOffsetCtlData
Definition: multixact.c:224
void MultiXactShmemInit(void)
Definition: multixact.c:1957
#define MULTIXACT_MEMBER_SAFE_THRESHOLD
Definition: multixact.c:211
static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2)
Definition: multixact.c:3254
static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
Definition: multixact.c:1069
static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
Definition: multixact.c:1655
MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
Definition: multixact.c:471
static int64 MXOffsetToMemberPage(MultiXactOffset offset)
Definition: multixact.c:168
#define MXACT_MEMBER_BITS_PER_XACT
Definition: multixact.c:138
static int64 MultiXactIdToOffsetSegment(MultiXactId multi)
Definition: multixact.c:120
static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
Definition: multixact.c:2526
void ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next)
Definition: multixact.c:775
static void PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti)
Definition: multixact.c:3034
#define MXACT_MEMBER_XACT_BITMASK
Definition: multixact.c:140
#define MULTIXACT_FLAGBYTES_PER_GROUP
Definition: multixact.c:143
bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
Definition: multixact.c:3274
char * mxstatus_to_string(MultiXactStatus status)
Definition: multixact.c:1747
void multixact_redo(XLogReaderState *record)
Definition: multixact.c:3339
#define MULTIXACT_OFFSETS_PER_PAGE
Definition: multixact.c:105
void multixact_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len)
Definition: multixact.c:1913
#define debug_elog5(a, b, c, d, e)
Definition: multixact.c:373
static void MultiXactIdSetOldestVisible(void)
Definition: multixact.c:714
int multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
Definition: multixact.c:3438
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result)
Definition: multixact.c:2831
void PostPrepare_MultiXact(FullTransactionId fxid)
Definition: multixact.c:1843
void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset)
Definition: multixact.c:2267
#define MultiXactMemberCtl
Definition: multixact.c:228
static bool SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition: multixact.c:2982
void AtPrepare_MultiXact(void)
Definition: multixact.c:1829
static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, uint32 distance)
Definition: multixact.c:2783
bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
Definition: multixact.c:3288
void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
Definition: multixact.c:2475
static int MultiXactIdToOffsetEntry(MultiXactId multi)
Definition: multixact.c:114
static void mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
Definition: multixact.c:1702
static void MaybeExtendOffsetSlru(void)
Definition: multixact.c:2049
bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
Definition: multixact.c:583
void MultiXactIdSetOldestMember(void)
Definition: multixact.c:657
static void PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset)
Definition: multixact.c:3005
static MemoryContext MXactContext
Definition: multixact.c:361
#define SHARED_MULTIXACT_STATE_SIZE
static MultiXactId * OldestVisibleMXactId
Definition: multixact.c:331
struct mxtruncinfo mxtruncinfo
static int mxactMemberComparator(const void *arg1, const void *arg2)
Definition: multixact.c:1582
struct MultiXactStateData MultiXactStateData
static void ExtendMultiXactOffset(MultiXactId multi)
Definition: multixact.c:2492
Size MultiXactShmemSize(void)
Definition: multixact.c:1940
#define MULTIXACT_MEMBERGROUPS_PER_PAGE
Definition: multixact.c:149
#define MultiXactOffsetCtl
Definition: multixact.c:227
static int MXOffsetToMemberOffset(MultiXactOffset offset)
Definition: multixact.c:201
void MultiXactGetCheckptMulti(bool is_shutdown, MultiXactId *nextMulti, MultiXactOffset *nextMultiOffset, MultiXactId *oldestMulti, Oid *oldestMultiDB)
Definition: multixact.c:2221
void SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, bool is_startup)
Definition: multixact.c:2301
static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nmembers, MultiXactMember *members)
Definition: multixact.c:900
int multixactmemberssyncfiletag(const FileTag *ftag, char *path)
Definition: multixact.c:3447
#define MAX_CACHE_ENTRIES
Definition: multixact.c:359
static int64 MultiXactIdToOffsetPage(MultiXactId multi)
Definition: multixact.c:108
MultiXactId GetOldestMultiXactId(void)
Definition: multixact.c:2603
void CheckPointMultiXact(void)
Definition: multixact.c:2243
#define MaxOldestSlot
Definition: multixact.c:326
MultiXactId MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
Definition: multixact.c:799
struct mXactCacheEnt mXactCacheEnt
static int64 MXOffsetToMemberSegment(MultiXactOffset offset)
Definition: multixact.c:174
static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members)
Definition: multixact.c:1612
static dclist_head MXactCache
Definition: multixact.c:360
void TrimMultiXact(void)
Definition: multixact.c:2109
#define debug_elog3(a, b, c)
Definition: multixact.c:371
char * mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
Definition: multixact.c:1770
#define MULTIXACT_MEMBERGROUP_SIZE
Definition: multixact.c:147
#define debug_elog4(a, b, c, d)
Definition: multixact.c:372
void multixact_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len)
Definition: multixact.c:1928
static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2)
Definition: multixact.c:3234
static bool SetOffsetVacuumLimit(bool is_startup)
Definition: multixact.c:2656
static int MXOffsetToFlagsOffset(MultiXactOffset offset)
Definition: multixact.c:181
int MultiXactMemberFreezeThreshold(void)
Definition: multixact.c:2933
void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOffset)
Definition: multixact.c:2450
static MultiXactId * OldestMemberMXactId
Definition: multixact.c:330
#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE
Definition: multixact.c:163
static MultiXactStateData * MultiXactState
Definition: multixact.c:329
#define MULTIXACT_MEMBERS_PER_MEMBERGROUP
Definition: multixact.c:144
#define OFFSET_WARN_SEGMENTS
MultiXactId ReadNextMultiXactId(void)
Definition: multixact.c:755
void BootStrapMultiXact(void)
Definition: multixact.c:2026
#define debug_elog6(a, b, c, d, e, f)
Definition: multixact.c:374
void multixact_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len)
Definition: multixact.c:1892
#define MULTIXACT_MEMBERS_PER_PAGE
Definition: multixact.c:150
MultiXactId MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, TransactionId xid2, MultiXactStatus status2)
Definition: multixact.c:418
void TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
Definition: multixact.c:3059
#define MULTIXACT_MEMBER_DANGER_THRESHOLD
Definition: multixact.c:212
static int MXOffsetToFlagsBitShift(MultiXactOffset offset)
Definition: multixact.c:191
bool check_multixact_offset_buffers(int *newval, void **extra, GucSource source)
Definition: multixact.c:2006
static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
Definition: multixact.c:3300
bool GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset)
Definition: multixact.c:2876
bool check_multixact_member_buffers(int *newval, void **extra, GucSource source)
Definition: multixact.c:2015
void AtEOXact_MultiXact(void)
Definition: multixact.c:1801
static SlruCtlData MultiXactMemberCtlData
Definition: multixact.c:225
#define debug_elog2(a, b)
Definition: multixact.c:370
void StartupMultiXact(void)
Definition: multixact.c:2084
int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, bool from_pgupgrade, bool isLockOnly)
Definition: multixact.c:1339
#define MultiXactIdIsValid(multi)
Definition: multixact.h:29
#define XLOG_MULTIXACT_ZERO_MEM_PAGE
Definition: multixact.h:70
#define XLOG_MULTIXACT_ZERO_OFF_PAGE
Definition: multixact.h:69
#define FirstMultiXactId
Definition: multixact.h:26
MultiXactStatus
Definition: multixact.h:39
@ MultiXactStatusForShare
Definition: multixact.h:41
@ MultiXactStatusForNoKeyUpdate
Definition: multixact.h:42
@ MultiXactStatusNoKeyUpdate
Definition: multixact.h:45
@ MultiXactStatusUpdate
Definition: multixact.h:47
@ MultiXactStatusForUpdate
Definition: multixact.h:43
@ MultiXactStatusForKeyShare
Definition: multixact.h:40
#define ISUPDATE_from_mxstatus(status)
Definition: multixact.h:53
#define InvalidMultiXactId
Definition: multixact.h:25
#define XLOG_MULTIXACT_TRUNCATE_ID
Definition: multixact.h:72
#define SizeOfMultiXactCreate
Definition: multixact.h:82
#define SizeOfMultiXactTruncate
Definition: multixact.h:97
#define XLOG_MULTIXACT_CREATE_ID
Definition: multixact.h:71
#define MaxMultiXactOffset
Definition: multixact.h:31
#define MaxMultiXactId
Definition: multixact.h:27
struct MultiXactMember MultiXactMember
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:42
#define SLRU_PAGES_PER_SEGMENT
const void size_t len
const void * data
static char * filename
Definition: pg_dumpall.c:120
static rewind_source * source
Definition: pg_rewind.c:89
static char * buf
Definition: pg_test_fsync.c:72
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:165
@ PMSIGNAL_START_AUTOVAC_LAUNCHER
Definition: pmsignal.h:39
#define qsort(a, b, c, d)
Definition: port.h:500
unsigned int Oid
Definition: postgres_ext.h:32
#define DELAY_CHKPT_START
Definition: proc.h:135
bool TransactionIdIsInProgress(TransactionId xid)
Definition: procarray.c:1402
int ProcNumber
Definition: procnumber.h:24
tree ctl
Definition: radixtree.h:1838
Size add_size(Size s1, Size s2)
Definition: shmem.c:495
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:389
void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, const char *subdir, int buffer_tranche_id, int bank_tranche_id, SyncRequestHandler sync_handler, bool long_segment_names)
Definition: slru.c:252
int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid)
Definition: slru.c:630
void SimpleLruWritePage(SlruCtl ctl, int slotno)
Definition: slru.c:757
void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
Definition: slru.c:1347
bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno)
Definition: slru.c:771
void SlruDeleteSegment(SlruCtl ctl, int64 segno)
Definition: slru.c:1551
bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
Definition: slru.c:1816
int SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok, TransactionId xid)
Definition: slru.c:527
int SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
Definition: slru.c:1856
int SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
Definition: slru.c:375
void SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno)
Definition: slru.c:444
void SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
Definition: slru.c:1433
Size SimpleLruShmemSize(int nslots, int nlsns)
Definition: slru.c:198
bool check_slru_buffers(const char *name, int *newval)
Definition: slru.c:355
static LWLock * SimpleLruGetBankLock(SlruCtl ctl, int64 pageno)
Definition: slru.h:160
#define SlruPagePrecedesUnitTests(ctl, per_page)
Definition: slru.h:185
PGPROC * MyProc
Definition: proc.c:67
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:145
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:242
void initStringInfo(StringInfo str)
Definition: stringinfo.c:97
Definition: sync.h:51
Definition: lwlock.h:42
TransactionId xid
Definition: multixact.h:59
MultiXactStatus status
Definition: multixact.h:60
MultiXactId multiWrapLimit
Definition: multixact.c:268
MultiXactId multiStopLimit
Definition: multixact.c:267
MultiXactId multiWarnLimit
Definition: multixact.c:266
MultiXactId multiVacLimit
Definition: multixact.c:265
MultiXactOffset offsetStopLimit
Definition: multixact.c:271
MultiXactOffset nextOffset
Definition: multixact.c:243
MultiXactId nextMXact
Definition: multixact.c:240
MultiXactId oldestMultiXactId
Definition: multixact.c:253
MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER]
Definition: multixact.c:320
MultiXactOffset oldestOffset
Definition: multixact.c:261
int delayChkptFlags
Definition: proc.h:257
dlist_node * cur
Definition: ilist.h:179
MultiXactId multi
Definition: multixact.c:353
dlist_node node
Definition: multixact.c:355
MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]
Definition: multixact.c:356
int64 earliestExistingPage
Definition: multixact.c:2974
MultiXactId mid
Definition: multixact.h:76
MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]
Definition: multixact.h:79
MultiXactOffset moff
Definition: multixact.h:77
MultiXactId endTruncOff
Definition: multixact.h:90
MultiXactOffset startTruncMemb
Definition: multixact.h:93
MultiXactOffset endTruncMemb
Definition: multixact.h:94
MultiXactId startTruncOff
Definition: multixact.h:89
@ SYNC_HANDLER_MULTIXACT_MEMBER
Definition: sync.h:41
@ SYNC_HANDLER_MULTIXACT_OFFSET
Definition: sync.h:40
bool TransactionIdDidCommit(TransactionId transactionId)
Definition: transam.c:126
#define TransactionIdEquals(id1, id2)
Definition: transam.h:43
#define TransactionIdIsValid(xid)
Definition: transam.h:41
static bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.h:263
ProcNumber TwoPhaseGetDummyProcNumber(FullTransactionId fxid, bool lock_held)
Definition: twophase.c:908
void RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, const void *data, uint32 len)
Definition: twophase.c:1271
#define TWOPHASE_RM_MULTIXACT_ID
Definition: twophase_rmgr.h:29
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition: varsup.c:304
bool IsTransactionState(void)
Definition: xact.c:388
bool TransactionIdIsCurrentTransactionId(TransactionId xid)
Definition: xact.c:942
bool RecoveryInProgress(void)
Definition: xlog.c:6406
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2783
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogSimpleInsertInt64(RmgrId rmid, uint8 info, int64 value)
Definition: xloginsert.c:543
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:478
void XLogRegisterData(const void *data, uint32 len)
Definition: xloginsert.c:368
void XLogBeginInsert(void)
Definition: xloginsert.c:152
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:409
#define XLogRecGetData(decoder)
Definition: xlogreader.h:414
#define XLogRecGetXid(decoder)
Definition: xlogreader.h:411
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:416
bool InRecovery
Definition: xlogutils.c:50