PostgreSQL Source Code  git master
predicate.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * predicate.c
4  * POSTGRES predicate locking
5  * to support full serializable transaction isolation
6  *
7  *
8  * The approach taken is to implement Serializable Snapshot Isolation (SSI)
9  * as initially described in this paper:
10  *
11  * Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008.
12  * Serializable isolation for snapshot databases.
13  * In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD
14  * international conference on Management of data,
15  * pages 729-738, New York, NY, USA. ACM.
16  * http://doi.acm.org/10.1145/1376616.1376690
17  *
18  * and further elaborated in Cahill's doctoral thesis:
19  *
20  * Michael James Cahill. 2009.
21  * Serializable Isolation for Snapshot Databases.
22  * Sydney Digital Theses.
23  * University of Sydney, School of Information Technologies.
24  * http://hdl.handle.net/2123/5353
25  *
26  *
27  * Predicate locks for Serializable Snapshot Isolation (SSI) are SIREAD
28  * locks, which are so different from normal locks that a distinct set of
29  * structures is required to handle them. They are needed to detect
30  * rw-conflicts when the read happens before the write. (When the write
31  * occurs first, the reading transaction can check for a conflict by
32  * examining the MVCC data.)
33  *
34  * (1) Besides tuples actually read, they must cover ranges of tuples
35  * which would have been read based on the predicate. This will
36  * require modelling the predicates through locks against database
37  * objects such as pages, index ranges, or entire tables.
38  *
39  * (2) They must be kept in RAM for quick access. Because of this, it
40  * isn't possible to always maintain tuple-level granularity -- when
41  * the space allocated to store these approaches exhaustion, a
42  * request for a lock may need to scan for situations where a single
43  * transaction holds many fine-grained locks which can be coalesced
44  * into a single coarser-grained lock.
45  *
46  * (3) They never block anything; they are more like flags than locks
47  * in that regard; although they refer to database objects and are
48  * used to identify rw-conflicts with normal write locks.
49  *
50  * (4) While they are associated with a transaction, they must survive
51  * a successful COMMIT of that transaction, and remain until all
52  * overlapping transactions complete. This even means that they
53  * must survive termination of the transaction's process. If a
54  * top level transaction is rolled back, however, it is immediately
55  * flagged so that it can be ignored, and its SIREAD locks can be
56  * released any time after that.
57  *
58  * (5) The only transactions which create SIREAD locks or check for
59  * conflicts with them are serializable transactions.
60  *
61  * (6) When a write lock for a top level transaction is found to cover
62  * an existing SIREAD lock for the same transaction, the SIREAD lock
63  * can be deleted.
64  *
65  * (7) A write from a serializable transaction must ensure that an xact
66  * record exists for the transaction, with the same lifespan (until
67  * all concurrent transaction complete or the transaction is rolled
68  * back) so that rw-dependencies to that transaction can be
69  * detected.
70  *
71  * We use an optimization for read-only transactions. Under certain
72  * circumstances, a read-only transaction's snapshot can be shown to
73  * never have conflicts with other transactions. This is referred to
74  * as a "safe" snapshot (and one known not to be is "unsafe").
75  * However, it can't be determined whether a snapshot is safe until
76  * all concurrent read/write transactions complete.
77  *
78  * Once a read-only transaction is known to have a safe snapshot, it
79  * can release its predicate locks and exempt itself from further
80  * predicate lock tracking. READ ONLY DEFERRABLE transactions run only
81  * on safe snapshots, waiting as necessary for one to be available.
82  *
83  *
84  * Lightweight locks to manage access to the predicate locking shared
85  * memory objects must be taken in this order, and should be released in
86  * reverse order:
87  *
88  * SerializableFinishedListLock
89  * - Protects the list of transactions which have completed but which
90  * may yet matter because they overlap still-active transactions.
91  *
92  * SerializablePredicateListLock
93  * - Protects the linked list of locks held by a transaction. Note
94  * that the locks themselves are also covered by the partition
95  * locks of their respective lock targets; this lock only affects
96  * the linked list connecting the locks related to a transaction.
97  * - All transactions share this single lock (with no partitioning).
98  * - There is never a need for a process other than the one running
99  * an active transaction to walk the list of locks held by that
100  * transaction, except parallel query workers sharing the leader's
101  * transaction. In the parallel case, an extra per-sxact lock is
102  * taken; see below.
103  * - It is relatively infrequent that another process needs to
104  * modify the list for a transaction, but it does happen for such
105  * things as index page splits for pages with predicate locks and
106  * freeing of predicate locked pages by a vacuum process. When
107  * removing a lock in such cases, the lock itself contains the
108  * pointers needed to remove it from the list. When adding a
109  * lock in such cases, the lock can be added using the anchor in
110  * the transaction structure. Neither requires walking the list.
111  * - Cleaning up the list for a terminated transaction is sometimes
112  * not done on a retail basis, in which case no lock is required.
113  * - Due to the above, a process accessing its active transaction's
114  * list always uses a shared lock, regardless of whether it is
115  * walking or maintaining the list. This improves concurrency
116  * for the common access patterns.
117  * - A process which needs to alter the list of a transaction other
118  * than its own active transaction must acquire an exclusive
119  * lock.
120  *
121  * SERIALIZABLEXACT's member 'perXactPredicateListLock'
122  * - Protects the linked list of predicate locks held by a transaction.
123  * Only needed for parallel mode, where multiple backends share the
124  * same SERIALIZABLEXACT object. Not needed if
125  * SerializablePredicateListLock is held exclusively.
126  *
127  * PredicateLockHashPartitionLock(hashcode)
128  * - The same lock protects a target, all locks on that target, and
129  * the linked list of locks on the target.
130  * - When more than one is needed, acquire in ascending address order.
131  * - When all are needed (rare), acquire in ascending index order with
132  * PredicateLockHashPartitionLockByIndex(index).
133  *
134  * SerializableXactHashLock
135  * - Protects both PredXact and SerializableXidHash.
136  *
137  * SerialControlLock
138  * - Protects SerialControlData members
139  *
140  * SerialSLRULock
141  * - Protects SerialSlruCtl
142  *
143  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
144  * Portions Copyright (c) 1994, Regents of the University of California
145  *
146  *
147  * IDENTIFICATION
148  * src/backend/storage/lmgr/predicate.c
149  *
150  *-------------------------------------------------------------------------
151  */
152 /*
153  * INTERFACE ROUTINES
154  *
155  * housekeeping for setting up shared memory predicate lock structures
156  * InitPredicateLocks(void)
157  * PredicateLockShmemSize(void)
158  *
159  * predicate lock reporting
160  * GetPredicateLockStatusData(void)
161  * PageIsPredicateLocked(Relation relation, BlockNumber blkno)
162  *
163  * predicate lock maintenance
164  * GetSerializableTransactionSnapshot(Snapshot snapshot)
165  * SetSerializableTransactionSnapshot(Snapshot snapshot,
166  * VirtualTransactionId *sourcevxid)
167  * RegisterPredicateLockingXid(void)
168  * PredicateLockRelation(Relation relation, Snapshot snapshot)
169  * PredicateLockPage(Relation relation, BlockNumber blkno,
170  * Snapshot snapshot)
171  * PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot,
172  * TransactionId tuple_xid)
173  * PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
174  * BlockNumber newblkno)
175  * PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
176  * BlockNumber newblkno)
177  * TransferPredicateLocksToHeapRelation(Relation relation)
178  * ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
179  *
180  * conflict detection (may also trigger rollback)
181  * CheckForSerializableConflictOut(Relation relation, TransactionId xid,
182  * Snapshot snapshot)
183  * CheckForSerializableConflictIn(Relation relation, ItemPointer tid,
184  * BlockNumber blkno)
185  * CheckTableForSerializableConflictIn(Relation relation)
186  *
187  * final rollback checking
188  * PreCommit_CheckForSerializationFailure(void)
189  *
190  * two-phase commit support
191  * AtPrepare_PredicateLocks(void);
192  * PostPrepare_PredicateLocks(TransactionId xid);
193  * PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit);
194  * predicatelock_twophase_recover(TransactionId xid, uint16 info,
195  * void *recdata, uint32 len);
196  */
197 
198 #include "postgres.h"
199 
200 #include "access/parallel.h"
201 #include "access/slru.h"
202 #include "access/subtrans.h"
203 #include "access/transam.h"
204 #include "access/twophase.h"
205 #include "access/twophase_rmgr.h"
206 #include "access/xact.h"
207 #include "access/xlog.h"
208 #include "miscadmin.h"
209 #include "pgstat.h"
210 #include "port/pg_lfind.h"
211 #include "storage/bufmgr.h"
212 #include "storage/predicate.h"
214 #include "storage/proc.h"
215 #include "storage/procarray.h"
216 #include "utils/guc_hooks.h"
217 #include "utils/rel.h"
218 #include "utils/snapmgr.h"
219 
220 /* Uncomment the next line to test the graceful degradation code. */
221 /* #define TEST_SUMMARIZE_SERIAL */
222 
223 /*
224  * Test the most selective fields first, for performance.
225  *
226  * a is covered by b if all of the following hold:
227  * 1) a.database = b.database
228  * 2) a.relation = b.relation
229  * 3) b.offset is invalid (b is page-granularity or higher)
230  * 4) either of the following:
231  * 4a) a.offset is valid (a is tuple-granularity) and a.page = b.page
232  * or 4b) a.offset is invalid and b.page is invalid (a is
233  * page-granularity and b is relation-granularity
234  */
235 #define TargetTagIsCoveredBy(covered_target, covering_target) \
236  ((GET_PREDICATELOCKTARGETTAG_RELATION(covered_target) == /* (2) */ \
237  GET_PREDICATELOCKTARGETTAG_RELATION(covering_target)) \
238  && (GET_PREDICATELOCKTARGETTAG_OFFSET(covering_target) == \
239  InvalidOffsetNumber) /* (3) */ \
240  && (((GET_PREDICATELOCKTARGETTAG_OFFSET(covered_target) != \
241  InvalidOffsetNumber) /* (4a) */ \
242  && (GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \
243  GET_PREDICATELOCKTARGETTAG_PAGE(covered_target))) \
244  || ((GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \
245  InvalidBlockNumber) /* (4b) */ \
246  && (GET_PREDICATELOCKTARGETTAG_PAGE(covered_target) \
247  != InvalidBlockNumber))) \
248  && (GET_PREDICATELOCKTARGETTAG_DB(covered_target) == /* (1) */ \
249  GET_PREDICATELOCKTARGETTAG_DB(covering_target)))
250 
251 /*
252  * The predicate locking target and lock shared hash tables are partitioned to
253  * reduce contention. To determine which partition a given target belongs to,
254  * compute the tag's hash code with PredicateLockTargetTagHashCode(), then
255  * apply one of these macros.
256  * NB: NUM_PREDICATELOCK_PARTITIONS must be a power of 2!
257  */
258 #define PredicateLockHashPartition(hashcode) \
259  ((hashcode) % NUM_PREDICATELOCK_PARTITIONS)
260 #define PredicateLockHashPartitionLock(hashcode) \
261  (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + \
262  PredicateLockHashPartition(hashcode)].lock)
263 #define PredicateLockHashPartitionLockByIndex(i) \
264  (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + (i)].lock)
265 
266 #define NPREDICATELOCKTARGETENTS() \
267  mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
268 
269 #define SxactIsOnFinishedList(sxact) (!dlist_node_is_detached(&(sxact)->finishedLink))
270 
271 /*
272  * Note that a sxact is marked "prepared" once it has passed
273  * PreCommit_CheckForSerializationFailure, even if it isn't using
274  * 2PC. This is the point at which it can no longer be aborted.
275  *
276  * The PREPARED flag remains set after commit, so SxactIsCommitted
277  * implies SxactIsPrepared.
278  */
279 #define SxactIsCommitted(sxact) (((sxact)->flags & SXACT_FLAG_COMMITTED) != 0)
280 #define SxactIsPrepared(sxact) (((sxact)->flags & SXACT_FLAG_PREPARED) != 0)
281 #define SxactIsRolledBack(sxact) (((sxact)->flags & SXACT_FLAG_ROLLED_BACK) != 0)
282 #define SxactIsDoomed(sxact) (((sxact)->flags & SXACT_FLAG_DOOMED) != 0)
283 #define SxactIsReadOnly(sxact) (((sxact)->flags & SXACT_FLAG_READ_ONLY) != 0)
284 #define SxactHasSummaryConflictIn(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_IN) != 0)
285 #define SxactHasSummaryConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_OUT) != 0)
286 /*
287  * The following macro actually means that the specified transaction has a
288  * conflict out *to a transaction which committed ahead of it*. It's hard
289  * to get that into a name of a reasonable length.
290  */
291 #define SxactHasConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_CONFLICT_OUT) != 0)
292 #define SxactIsDeferrableWaiting(sxact) (((sxact)->flags & SXACT_FLAG_DEFERRABLE_WAITING) != 0)
293 #define SxactIsROSafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_SAFE) != 0)
294 #define SxactIsROUnsafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_UNSAFE) != 0)
295 #define SxactIsPartiallyReleased(sxact) (((sxact)->flags & SXACT_FLAG_PARTIALLY_RELEASED) != 0)
296 
297 /*
298  * Compute the hash code associated with a PREDICATELOCKTARGETTAG.
299  *
300  * To avoid unnecessary recomputations of the hash code, we try to do this
301  * just once per function, and then pass it around as needed. Aside from
302  * passing the hashcode to hash_search_with_hash_value(), we can extract
303  * the lock partition number from the hashcode.
304  */
305 #define PredicateLockTargetTagHashCode(predicatelocktargettag) \
306  get_hash_value(PredicateLockTargetHash, predicatelocktargettag)
307 
308 /*
309  * Given a predicate lock tag, and the hash for its target,
310  * compute the lock hash.
311  *
312  * To make the hash code also depend on the transaction, we xor the sxid
313  * struct's address into the hash code, left-shifted so that the
314  * partition-number bits don't change. Since this is only a hash, we
315  * don't care if we lose high-order bits of the address; use an
316  * intermediate variable to suppress cast-pointer-to-int warnings.
317  */
318 #define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash) \
319  ((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) \
320  << LOG2_NUM_PREDICATELOCK_PARTITIONS)
321 
322 
323 /*
324  * The SLRU buffer area through which we access the old xids.
325  */
327 
328 #define SerialSlruCtl (&SerialSlruCtlData)
329 
330 #define SERIAL_PAGESIZE BLCKSZ
331 #define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
332 #define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
333 
334 /*
335  * Set maximum pages based on the number needed to track all transactions.
336  */
337 #define SERIAL_MAX_PAGE (MaxTransactionId / SERIAL_ENTRIESPERPAGE)
338 
339 #define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
340 
341 #define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
342  (SerialSlruCtl->shared->page_buffer[slotno] + \
343  ((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
344 
345 #define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
346 
347 typedef struct SerialControlData
348 {
349  int headPage; /* newest initialized page */
350  TransactionId headXid; /* newest valid Xid in the SLRU */
351  TransactionId tailXid; /* oldest xmin we might be interested in */
353 
355 
357 
358 /*
359  * When the oldest committed transaction on the "finished" list is moved to
360  * SLRU, its predicate locks will be moved to this "dummy" transaction,
361  * collapsing duplicate targets. When a duplicate is found, the later
362  * commitSeqNo is used.
363  */
365 
366 
367 /*
368  * These configuration variables are used to set the predicate lock table size
369  * and to control promotion of predicate locks to coarser granularity in an
370  * attempt to degrade performance (mostly as false positive serialization
371  * failure) gracefully in the face of memory pressure.
372  */
373 int max_predicate_locks_per_xact; /* in guc_tables.c */
374 int max_predicate_locks_per_relation; /* in guc_tables.c */
375 int max_predicate_locks_per_page; /* in guc_tables.c */
376 
377 /*
378  * This provides a list of objects in order to track transactions
379  * participating in predicate locking. Entries in the list are fixed size,
380  * and reside in shared memory. The memory address of an entry must remain
381  * fixed during its lifetime. The list will be protected from concurrent
382  * update externally; no provision is made in this code to manage that. The
383  * number of entries in the list, and the size allowed for each entry is
384  * fixed upon creation.
385  */
387 
388 /*
389  * This provides a pool of RWConflict data elements to use in conflict lists
390  * between transactions.
391  */
393 
394 /*
395  * The predicate locking hash tables are in shared memory.
396  * Each backend keeps pointers to them.
397  */
402 
403 /*
404  * Tag for a dummy entry in PredicateLockTargetHash. By temporarily removing
405  * this entry, you can ensure that there's enough scratch space available for
406  * inserting one entry in the hash table. This is an otherwise-invalid tag.
407  */
408 static const PREDICATELOCKTARGETTAG ScratchTargetTag = {0, 0, 0, 0};
411 
412 /*
413  * The local hash table used to determine when to combine multiple fine-
414  * grained locks into a single courser-grained lock.
415  */
417 
418 /*
419  * Keep a pointer to the currently-running serializable transaction (if any)
420  * for quick reference. Also, remember if we have written anything that could
421  * cause a rw-conflict.
422  */
424 static bool MyXactDidWrite = false;
425 
426 /*
427  * The SXACT_FLAG_RO_UNSAFE optimization might lead us to release
428  * MySerializableXact early. If that happens in a parallel query, the leader
429  * needs to defer the destruction of the SERIALIZABLEXACT until end of
430  * transaction, because the workers still have a reference to it. In that
431  * case, the leader stores it here.
432  */
434 
435 /* local functions */
436 
437 static SERIALIZABLEXACT *CreatePredXact(void);
438 static void ReleasePredXact(SERIALIZABLEXACT *sxact);
439 
440 static bool RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer);
441 static void SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
442 static void SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact, SERIALIZABLEXACT *activeXact);
443 static void ReleaseRWConflict(RWConflict conflict);
444 static void FlagSxactUnsafe(SERIALIZABLEXACT *sxact);
445 
446 static bool SerialPagePrecedesLogically(int64 page1, int64 page2);
447 static void SerialInit(void);
448 static void SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo);
450 static void SerialSetActiveSerXmin(TransactionId xid);
451 
452 static uint32 predicatelock_hash(const void *key, Size keysize);
453 static void SummarizeOldestCommittedSxact(void);
454 static Snapshot GetSafeSnapshot(Snapshot origSnapshot);
456  VirtualTransactionId *sourcevxid,
457  int sourcepid);
458 static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag);
460  PREDICATELOCKTARGETTAG *parent);
461 static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag);
462 static void RemoveScratchTarget(bool lockheld);
463 static void RestoreScratchTarget(bool lockheld);
465  uint32 targettaghash);
466 static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag);
467 static int MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag);
469 static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag);
470 static void CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
471  uint32 targettaghash,
472  SERIALIZABLEXACT *sxact);
473 static void DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash);
475  PREDICATELOCKTARGETTAG newtargettag,
476  bool removeOld);
477 static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag);
478 static void DropAllPredicateLocksFromTable(Relation relation,
479  bool transfer);
480 static void SetNewSxactGlobalXmin(void);
481 static void ClearOldPredicateLocks(void);
482 static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
483  bool summarize);
484 static bool XidIsConcurrent(TransactionId xid);
485 static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag);
486 static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
488  SERIALIZABLEXACT *writer);
489 static void CreateLocalPredicateLockHash(void);
490 static void ReleasePredicateLocksLocal(void);
491 
492 
493 /*------------------------------------------------------------------------*/
494 
495 /*
496  * Does this relation participate in predicate locking? Temporary and system
497  * relations are exempt.
498  */
499 static inline bool
501 {
502  return !(relation->rd_id < FirstUnpinnedObjectId ||
503  RelationUsesLocalBuffers(relation));
504 }
505 
506 /*
507  * When a public interface method is called for a read, this is the test to
508  * see if we should do a quick return.
509  *
510  * Note: this function has side-effects! If this transaction has been flagged
511  * as RO-safe since the last call, we release all predicate locks and reset
512  * MySerializableXact. That makes subsequent calls to return quickly.
513  *
514  * This is marked as 'inline' to eliminate the function call overhead in the
515  * common case that serialization is not needed.
516  */
517 static inline bool
519 {
520  /* Nothing to do if this is not a serializable transaction */
522  return false;
523 
524  /*
525  * Don't acquire locks or conflict when scanning with a special snapshot.
526  * This excludes things like CLUSTER and REINDEX. They use the wholesale
527  * functions TransferPredicateLocksToHeapRelation() and
528  * CheckTableForSerializableConflictIn() to participate in serialization,
529  * but the scans involved don't need serialization.
530  */
531  if (!IsMVCCSnapshot(snapshot))
532  return false;
533 
534  /*
535  * Check if we have just become "RO-safe". If we have, immediately release
536  * all locks as they're not needed anymore. This also resets
537  * MySerializableXact, so that subsequent calls to this function can exit
538  * quickly.
539  *
540  * A transaction is flagged as RO_SAFE if all concurrent R/W transactions
541  * commit without having conflicts out to an earlier snapshot, thus
542  * ensuring that no conflicts are possible for this transaction.
543  */
545  {
546  ReleasePredicateLocks(false, true);
547  return false;
548  }
549 
550  /* Check if the relation doesn't participate in predicate locking */
551  if (!PredicateLockingNeededForRelation(relation))
552  return false;
553 
554  return true; /* no excuse to skip predicate locking */
555 }
556 
557 /*
558  * Like SerializationNeededForRead(), but called on writes.
559  * The logic is the same, but there is no snapshot and we can't be RO-safe.
560  */
561 static inline bool
563 {
564  /* Nothing to do if this is not a serializable transaction */
566  return false;
567 
568  /* Check if the relation doesn't participate in predicate locking */
569  if (!PredicateLockingNeededForRelation(relation))
570  return false;
571 
572  return true; /* no excuse to skip predicate locking */
573 }
574 
575 
576 /*------------------------------------------------------------------------*/
577 
578 /*
579  * These functions are a simple implementation of a list for this specific
580  * type of struct. If there is ever a generalized shared memory list, we
581  * should probably switch to that.
582  */
583 static SERIALIZABLEXACT *
585 {
586  SERIALIZABLEXACT *sxact;
587 
589  return NULL;
590 
591  sxact = dlist_container(SERIALIZABLEXACT, xactLink,
594  return sxact;
595 }
596 
597 static void
599 {
600  Assert(ShmemAddrIsValid(sxact));
601 
602  dlist_delete(&sxact->xactLink);
604 }
605 
606 /*------------------------------------------------------------------------*/
607 
608 /*
609  * These functions manage primitive access to the RWConflict pool and lists.
610  */
611 static bool
613 {
614  dlist_iter iter;
615 
616  Assert(reader != writer);
617 
618  /* Check the ends of the purported conflict first. */
619  if (SxactIsDoomed(reader)
620  || SxactIsDoomed(writer)
621  || dlist_is_empty(&reader->outConflicts)
622  || dlist_is_empty(&writer->inConflicts))
623  return false;
624 
625  /*
626  * A conflict is possible; walk the list to find out.
627  *
628  * The unconstify is needed as we have no const version of
629  * dlist_foreach().
630  */
631  dlist_foreach(iter, &unconstify(SERIALIZABLEXACT *, reader)->outConflicts)
632  {
633  RWConflict conflict =
634  dlist_container(RWConflictData, outLink, iter.cur);
635 
636  if (conflict->sxactIn == writer)
637  return true;
638  }
639 
640  /* No conflict found. */
641  return false;
642 }
643 
644 static void
646 {
647  RWConflict conflict;
648 
649  Assert(reader != writer);
650  Assert(!RWConflictExists(reader, writer));
651 
653  ereport(ERROR,
654  (errcode(ERRCODE_OUT_OF_MEMORY),
655  errmsg("not enough elements in RWConflictPool to record a read/write conflict"),
656  errhint("You might need to run fewer transactions at a time or increase max_connections.")));
657 
659  dlist_delete(&conflict->outLink);
660 
661  conflict->sxactOut = reader;
662  conflict->sxactIn = writer;
663  dlist_push_tail(&reader->outConflicts, &conflict->outLink);
664  dlist_push_tail(&writer->inConflicts, &conflict->inLink);
665 }
666 
667 static void
669  SERIALIZABLEXACT *activeXact)
670 {
671  RWConflict conflict;
672 
673  Assert(roXact != activeXact);
674  Assert(SxactIsReadOnly(roXact));
675  Assert(!SxactIsReadOnly(activeXact));
676 
678  ereport(ERROR,
679  (errcode(ERRCODE_OUT_OF_MEMORY),
680  errmsg("not enough elements in RWConflictPool to record a potential read/write conflict"),
681  errhint("You might need to run fewer transactions at a time or increase max_connections.")));
682 
684  dlist_delete(&conflict->outLink);
685 
686  conflict->sxactOut = activeXact;
687  conflict->sxactIn = roXact;
688  dlist_push_tail(&activeXact->possibleUnsafeConflicts, &conflict->outLink);
689  dlist_push_tail(&roXact->possibleUnsafeConflicts, &conflict->inLink);
690 }
691 
692 static void
694 {
695  dlist_delete(&conflict->inLink);
696  dlist_delete(&conflict->outLink);
698 }
699 
700 static void
702 {
703  dlist_mutable_iter iter;
704 
705  Assert(SxactIsReadOnly(sxact));
706  Assert(!SxactIsROSafe(sxact));
707 
708  sxact->flags |= SXACT_FLAG_RO_UNSAFE;
709 
710  /*
711  * We know this isn't a safe snapshot, so we can stop looking for other
712  * potential conflicts.
713  */
715  {
716  RWConflict conflict =
717  dlist_container(RWConflictData, inLink, iter.cur);
718 
719  Assert(!SxactIsReadOnly(conflict->sxactOut));
720  Assert(sxact == conflict->sxactIn);
721 
722  ReleaseRWConflict(conflict);
723  }
724 }
725 
726 /*------------------------------------------------------------------------*/
727 
728 /*
729  * Decide whether a Serial page number is "older" for truncation purposes.
730  * Analogous to CLOGPagePrecedes().
731  */
732 static bool
733 SerialPagePrecedesLogically(int64 page1, int64 page2)
734 {
735  TransactionId xid1;
736  TransactionId xid2;
737 
738  xid1 = ((TransactionId) page1) * SERIAL_ENTRIESPERPAGE;
739  xid1 += FirstNormalTransactionId + 1;
740  xid2 = ((TransactionId) page2) * SERIAL_ENTRIESPERPAGE;
741  xid2 += FirstNormalTransactionId + 1;
742 
743  return (TransactionIdPrecedes(xid1, xid2) &&
744  TransactionIdPrecedes(xid1, xid2 + SERIAL_ENTRIESPERPAGE - 1));
745 }
746 
747 #ifdef USE_ASSERT_CHECKING
748 static void
749 SerialPagePrecedesLogicallyUnitTests(void)
750 {
751  int per_page = SERIAL_ENTRIESPERPAGE,
752  offset = per_page / 2;
753  int64 newestPage,
754  oldestPage,
755  headPage,
756  targetPage;
757  TransactionId newestXact,
758  oldestXact;
759 
760  /* GetNewTransactionId() has assigned the last XID it can safely use. */
761  newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1; /* nothing special */
762  newestXact = newestPage * per_page + offset;
763  Assert(newestXact / per_page == newestPage);
764  oldestXact = newestXact + 1;
765  oldestXact -= 1U << 31;
766  oldestPage = oldestXact / per_page;
767 
768  /*
769  * In this scenario, the SLRU headPage pertains to the last ~1000 XIDs
770  * assigned. oldestXact finishes, ~2B XIDs having elapsed since it
771  * started. Further transactions cause us to summarize oldestXact to
772  * tailPage. Function must return false so SerialAdd() doesn't zero
773  * tailPage (which may contain entries for other old, recently-finished
774  * XIDs) and half the SLRU. Reaching this requires burning ~2B XIDs in
775  * single-user mode, a negligible possibility.
776  */
777  headPage = newestPage;
778  targetPage = oldestPage;
780 
781  /*
782  * In this scenario, the SLRU headPage pertains to oldestXact. We're
783  * summarizing an XID near newestXact. (Assume few other XIDs used
784  * SERIALIZABLE, hence the minimal headPage advancement. Assume
785  * oldestXact was long-running and only recently reached the SLRU.)
786  * Function must return true to make SerialAdd() create targetPage.
787  *
788  * Today's implementation mishandles this case, but it doesn't matter
789  * enough to fix. Verify that the defect affects just one page by
790  * asserting correct treatment of its prior page. Reaching this case
791  * requires burning ~2B XIDs in single-user mode, a negligible
792  * possibility. Moreover, if it does happen, the consequence would be
793  * mild, namely a new transaction failing in SimpleLruReadPage().
794  */
795  headPage = oldestPage;
796  targetPage = newestPage;
797  Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
798 #if 0
800 #endif
801 }
802 #endif
803 
804 /*
805  * Initialize for the tracking of old serializable committed xids.
806  */
807 static void
809 {
810  bool found;
811 
812  /*
813  * Set up SLRU management of the pg_serial data.
814  */
816  SimpleLruInit(SerialSlruCtl, "serializable",
817  serializable_buffers, 0, "pg_serial",
819  SYNC_HANDLER_NONE, false);
820 #ifdef USE_ASSERT_CHECKING
821  SerialPagePrecedesLogicallyUnitTests();
822 #endif
824 
825  /*
826  * Create or attach to the SerialControl structure.
827  */
829  ShmemInitStruct("SerialControlData", sizeof(SerialControlData), &found);
830 
831  Assert(found == IsUnderPostmaster);
832  if (!found)
833  {
834  /*
835  * Set control information to reflect empty SLRU.
836  */
837  LWLockAcquire(SerialControlLock, LW_EXCLUSIVE);
838  serialControl->headPage = -1;
841  LWLockRelease(SerialControlLock);
842  }
843 }
844 
845 /*
846  * GUC check_hook for serializable_buffers
847  */
848 bool
850 {
851  return check_slru_buffers("serializable_buffers", newval);
852 }
853 
854 /*
855  * Record a committed read write serializable xid and the minimum
856  * commitSeqNo of any transactions to which this xid had a rw-conflict out.
857  * An invalid commitSeqNo means that there were no conflicts out from xid.
858  */
859 static void
860 SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo)
861 {
863  int64 targetPage;
864  int slotno;
865  int64 firstZeroPage;
866  bool isNewPage;
867  LWLock *lock;
868 
870 
871  targetPage = SerialPage(xid);
872  lock = SimpleLruGetBankLock(SerialSlruCtl, targetPage);
873 
874  /*
875  * In this routine, we must hold both SerialControlLock and the SLRU bank
876  * lock simultaneously while making the SLRU data catch up with the new
877  * state that we determine.
878  */
879  LWLockAcquire(SerialControlLock, LW_EXCLUSIVE);
880 
881  /*
882  * If no serializable transactions are active, there shouldn't be anything
883  * to push out to the SLRU. Hitting this assert would mean there's
884  * something wrong with the earlier cleanup logic.
885  */
888 
889  /*
890  * If the SLRU is currently unused, zero out the whole active region from
891  * tailXid to headXid before taking it into use. Otherwise zero out only
892  * any new pages that enter the tailXid-headXid range as we advance
893  * headXid.
894  */
895  if (serialControl->headPage < 0)
896  {
897  firstZeroPage = SerialPage(tailXid);
898  isNewPage = true;
899  }
900  else
901  {
902  firstZeroPage = SerialNextPage(serialControl->headPage);
904  targetPage);
905  }
906 
909  serialControl->headXid = xid;
910  if (isNewPage)
911  serialControl->headPage = targetPage;
912 
914 
915  if (isNewPage)
916  {
917  /* Initialize intervening pages. */
918  while (firstZeroPage != targetPage)
919  {
920  (void) SimpleLruZeroPage(SerialSlruCtl, firstZeroPage);
921  firstZeroPage = SerialNextPage(firstZeroPage);
922  }
923  slotno = SimpleLruZeroPage(SerialSlruCtl, targetPage);
924  }
925  else
926  slotno = SimpleLruReadPage(SerialSlruCtl, targetPage, true, xid);
927 
928  SerialValue(slotno, xid) = minConflictCommitSeqNo;
929  SerialSlruCtl->shared->page_dirty[slotno] = true;
930 
931  LWLockRelease(lock);
932  LWLockRelease(SerialControlLock);
933 }
934 
935 /*
936  * Get the minimum commitSeqNo for any conflict out for the given xid. For
937  * a transaction which exists but has no conflict out, InvalidSerCommitSeqNo
938  * will be returned.
939  */
940 static SerCommitSeqNo
942 {
946  int slotno;
947 
949 
950  LWLockAcquire(SerialControlLock, LW_SHARED);
953  LWLockRelease(SerialControlLock);
954 
956  return 0;
957 
959 
961  || TransactionIdFollows(xid, headXid))
962  return 0;
963 
964  /*
965  * The following function must be called without holding SLRU bank lock,
966  * but will return with that lock held, which must then be released.
967  */
969  SerialPage(xid), xid);
970  val = SerialValue(slotno, xid);
972  return val;
973 }
974 
975 /*
976  * Call this whenever there is a new xmin for active serializable
977  * transactions. We don't need to keep information on transactions which
978  * precede that. InvalidTransactionId means none active, so everything in
979  * the SLRU can be discarded.
980  */
981 static void
983 {
984  LWLockAcquire(SerialControlLock, LW_EXCLUSIVE);
985 
986  /*
987  * When no sxacts are active, nothing overlaps, set the xid values to
988  * invalid to show that there are no valid entries. Don't clear headPage,
989  * though. A new xmin might still land on that page, and we don't want to
990  * repeatedly zero out the same page.
991  */
992  if (!TransactionIdIsValid(xid))
993  {
996  LWLockRelease(SerialControlLock);
997  return;
998  }
999 
1000  /*
1001  * When we're recovering prepared transactions, the global xmin might move
1002  * backwards depending on the order they're recovered. Normally that's not
1003  * OK, but during recovery no serializable transactions will commit, so
1004  * the SLRU is empty and we can get away with it.
1005  */
1006  if (RecoveryInProgress())
1007  {
1011  {
1012  serialControl->tailXid = xid;
1013  }
1014  LWLockRelease(SerialControlLock);
1015  return;
1016  }
1017 
1020 
1021  serialControl->tailXid = xid;
1022 
1023  LWLockRelease(SerialControlLock);
1024 }
1025 
1026 /*
1027  * Perform a checkpoint --- either during shutdown, or on-the-fly
1028  *
1029  * We don't have any data that needs to survive a restart, but this is a
1030  * convenient place to truncate the SLRU.
1031  */
1032 void
1034 {
1035  int truncateCutoffPage;
1036 
1037  LWLockAcquire(SerialControlLock, LW_EXCLUSIVE);
1038 
1039  /* Exit quickly if the SLRU is currently not in use. */
1040  if (serialControl->headPage < 0)
1041  {
1042  LWLockRelease(SerialControlLock);
1043  return;
1044  }
1045 
1047  {
1048  int tailPage;
1049 
1050  tailPage = SerialPage(serialControl->tailXid);
1051 
1052  /*
1053  * It is possible for the tailXid to be ahead of the headXid. This
1054  * occurs if we checkpoint while there are in-progress serializable
1055  * transaction(s) advancing the tail but we are yet to summarize the
1056  * transactions. In this case, we cutoff up to the headPage and the
1057  * next summary will advance the headXid.
1058  */
1060  {
1061  /* We can truncate the SLRU up to the page containing tailXid */
1062  truncateCutoffPage = tailPage;
1063  }
1064  else
1065  truncateCutoffPage = serialControl->headPage;
1066  }
1067  else
1068  {
1069  /*----------
1070  * The SLRU is no longer needed. Truncate to head before we set head
1071  * invalid.
1072  *
1073  * XXX: It's possible that the SLRU is not needed again until XID
1074  * wrap-around has happened, so that the segment containing headPage
1075  * that we leave behind will appear to be new again. In that case it
1076  * won't be removed until XID horizon advances enough to make it
1077  * current again.
1078  *
1079  * XXX: This should happen in vac_truncate_clog(), not in checkpoints.
1080  * Consider this scenario, starting from a system with no in-progress
1081  * transactions and VACUUM FREEZE having maximized oldestXact:
1082  * - Start a SERIALIZABLE transaction.
1083  * - Start, finish, and summarize a SERIALIZABLE transaction, creating
1084  * one SLRU page.
1085  * - Consume XIDs to reach xidStopLimit.
1086  * - Finish all transactions. Due to the long-running SERIALIZABLE
1087  * transaction, earlier checkpoints did not touch headPage. The
1088  * next checkpoint will change it, but that checkpoint happens after
1089  * the end of the scenario.
1090  * - VACUUM to advance XID limits.
1091  * - Consume ~2M XIDs, crossing the former xidWrapLimit.
1092  * - Start, finish, and summarize a SERIALIZABLE transaction.
1093  * SerialAdd() declines to create the targetPage, because headPage
1094  * is not regarded as in the past relative to that targetPage. The
1095  * transaction instigating the summarize fails in
1096  * SimpleLruReadPage().
1097  */
1098  truncateCutoffPage = serialControl->headPage;
1099  serialControl->headPage = -1;
1100  }
1101 
1102  LWLockRelease(SerialControlLock);
1103 
1104  /*
1105  * Truncate away pages that are no longer required. Note that no
1106  * additional locking is required, because this is only called as part of
1107  * a checkpoint, and the validity limits have already been determined.
1108  */
1109  SimpleLruTruncate(SerialSlruCtl, truncateCutoffPage);
1110 
1111  /*
1112  * Write dirty SLRU pages to disk
1113  *
1114  * This is not actually necessary from a correctness point of view. We do
1115  * it merely as a debugging aid.
1116  *
1117  * We're doing this after the truncation to avoid writing pages right
1118  * before deleting the file in which they sit, which would be completely
1119  * pointless.
1120  */
1122 }
1123 
1124 /*------------------------------------------------------------------------*/
1125 
1126 /*
1127  * InitPredicateLocks -- Initialize the predicate locking data structures.
1128  *
1129  * This is called from CreateSharedMemoryAndSemaphores(), which see for
1130  * more comments. In the normal postmaster case, the shared hash tables
1131  * are created here. Backends inherit the pointers
1132  * to the shared tables via fork(). In the EXEC_BACKEND case, each
1133  * backend re-executes this code to obtain pointers to the already existing
1134  * shared hash tables.
1135  */
1136 void
1138 {
1139  HASHCTL info;
1140  long max_table_size;
1141  Size requestSize;
1142  bool found;
1143 
1144 #ifndef EXEC_BACKEND
1146 #endif
1147 
1148  /*
1149  * Compute size of predicate lock target hashtable. Note these
1150  * calculations must agree with PredicateLockShmemSize!
1151  */
1152  max_table_size = NPREDICATELOCKTARGETENTS();
1153 
1154  /*
1155  * Allocate hash table for PREDICATELOCKTARGET structs. This stores
1156  * per-predicate-lock-target information.
1157  */
1158  info.keysize = sizeof(PREDICATELOCKTARGETTAG);
1159  info.entrysize = sizeof(PREDICATELOCKTARGET);
1161 
1162  PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash",
1163  max_table_size,
1164  max_table_size,
1165  &info,
1166  HASH_ELEM | HASH_BLOBS |
1168 
1169  /*
1170  * Reserve a dummy entry in the hash table; we use it to make sure there's
1171  * always one entry available when we need to split or combine a page,
1172  * because running out of space there could mean aborting a
1173  * non-serializable transaction.
1174  */
1175  if (!IsUnderPostmaster)
1176  {
1178  HASH_ENTER, &found);
1179  Assert(!found);
1180  }
1181 
1182  /* Pre-calculate the hash and partition lock of the scratch entry */
1185 
1186  /*
1187  * Allocate hash table for PREDICATELOCK structs. This stores per
1188  * xact-lock-of-a-target information.
1189  */
1190  info.keysize = sizeof(PREDICATELOCKTAG);
1191  info.entrysize = sizeof(PREDICATELOCK);
1192  info.hash = predicatelock_hash;
1194 
1195  /* Assume an average of 2 xacts per target */
1196  max_table_size *= 2;
1197 
1198  PredicateLockHash = ShmemInitHash("PREDICATELOCK hash",
1199  max_table_size,
1200  max_table_size,
1201  &info,
1204 
1205  /*
1206  * Compute size for serializable transaction hashtable. Note these
1207  * calculations must agree with PredicateLockShmemSize!
1208  */
1209  max_table_size = (MaxBackends + max_prepared_xacts);
1210 
1211  /*
1212  * Allocate a list to hold information on transactions participating in
1213  * predicate locking.
1214  *
1215  * Assume an average of 10 predicate locking transactions per backend.
1216  * This allows aggressive cleanup while detail is present before data must
1217  * be summarized for storage in SLRU and the "dummy" transaction.
1218  */
1219  max_table_size *= 10;
1220 
1221  PredXact = ShmemInitStruct("PredXactList",
1223  &found);
1224  Assert(found == IsUnderPostmaster);
1225  if (!found)
1226  {
1227  int i;
1228 
1237  requestSize = mul_size((Size) max_table_size,
1238  sizeof(SERIALIZABLEXACT));
1239  PredXact->element = ShmemAlloc(requestSize);
1240  /* Add all elements to available list, clean. */
1241  memset(PredXact->element, 0, requestSize);
1242  for (i = 0; i < max_table_size; i++)
1243  {
1247  }
1264  }
1265  /* This never changes, so let's keep a local copy. */
1267 
1268  /*
1269  * Allocate hash table for SERIALIZABLEXID structs. This stores per-xid
1270  * information for serializable transactions which have accessed data.
1271  */
1272  info.keysize = sizeof(SERIALIZABLEXIDTAG);
1273  info.entrysize = sizeof(SERIALIZABLEXID);
1274 
1275  SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash",
1276  max_table_size,
1277  max_table_size,
1278  &info,
1279  HASH_ELEM | HASH_BLOBS |
1280  HASH_FIXED_SIZE);
1281 
1282  /*
1283  * Allocate space for tracking rw-conflicts in lists attached to the
1284  * transactions.
1285  *
1286  * Assume an average of 5 conflicts per transaction. Calculations suggest
1287  * that this will prevent resource exhaustion in even the most pessimal
1288  * loads up to max_connections = 200 with all 200 connections pounding the
1289  * database with serializable transactions. Beyond that, there may be
1290  * occasional transactions canceled when trying to flag conflicts. That's
1291  * probably OK.
1292  */
1293  max_table_size *= 5;
1294 
1295  RWConflictPool = ShmemInitStruct("RWConflictPool",
1297  &found);
1298  Assert(found == IsUnderPostmaster);
1299  if (!found)
1300  {
1301  int i;
1302 
1304  requestSize = mul_size((Size) max_table_size,
1306  RWConflictPool->element = ShmemAlloc(requestSize);
1307  /* Add all elements to available list, clean. */
1308  memset(RWConflictPool->element, 0, requestSize);
1309  for (i = 0; i < max_table_size; i++)
1310  {
1313  }
1314  }
1315 
1316  /*
1317  * Create or attach to the header for the list of finished serializable
1318  * transactions.
1319  */
1321  ShmemInitStruct("FinishedSerializableTransactions",
1322  sizeof(dlist_head),
1323  &found);
1324  Assert(found == IsUnderPostmaster);
1325  if (!found)
1327 
1328  /*
1329  * Initialize the SLRU storage for old committed serializable
1330  * transactions.
1331  */
1332  SerialInit();
1333 }
1334 
1335 /*
1336  * Estimate shared-memory space used for predicate lock table
1337  */
1338 Size
1340 {
1341  Size size = 0;
1342  long max_table_size;
1343 
1344  /* predicate lock target hash table */
1345  max_table_size = NPREDICATELOCKTARGETENTS();
1346  size = add_size(size, hash_estimate_size(max_table_size,
1347  sizeof(PREDICATELOCKTARGET)));
1348 
1349  /* predicate lock hash table */
1350  max_table_size *= 2;
1351  size = add_size(size, hash_estimate_size(max_table_size,
1352  sizeof(PREDICATELOCK)));
1353 
1354  /*
1355  * Since NPREDICATELOCKTARGETENTS is only an estimate, add 10% safety
1356  * margin.
1357  */
1358  size = add_size(size, size / 10);
1359 
1360  /* transaction list */
1361  max_table_size = MaxBackends + max_prepared_xacts;
1362  max_table_size *= 10;
1364  size = add_size(size, mul_size((Size) max_table_size,
1365  sizeof(SERIALIZABLEXACT)));
1366 
1367  /* transaction xid table */
1368  size = add_size(size, hash_estimate_size(max_table_size,
1369  sizeof(SERIALIZABLEXID)));
1370 
1371  /* rw-conflict pool */
1372  max_table_size *= 5;
1374  size = add_size(size, mul_size((Size) max_table_size,
1376 
1377  /* Head for list of finished serializable transactions. */
1378  size = add_size(size, sizeof(dlist_head));
1379 
1380  /* Shared memory structures for SLRU tracking of old committed xids. */
1381  size = add_size(size, sizeof(SerialControlData));
1383 
1384  return size;
1385 }
1386 
1387 
1388 /*
1389  * Compute the hash code associated with a PREDICATELOCKTAG.
1390  *
1391  * Because we want to use just one set of partition locks for both the
1392  * PREDICATELOCKTARGET and PREDICATELOCK hash tables, we have to make sure
1393  * that PREDICATELOCKs fall into the same partition number as their
1394  * associated PREDICATELOCKTARGETs. dynahash.c expects the partition number
1395  * to be the low-order bits of the hash code, and therefore a
1396  * PREDICATELOCKTAG's hash code must have the same low-order bits as the
1397  * associated PREDICATELOCKTARGETTAG's hash code. We achieve this with this
1398  * specialized hash function.
1399  */
1400 static uint32
1401 predicatelock_hash(const void *key, Size keysize)
1402 {
1403  const PREDICATELOCKTAG *predicatelocktag = (const PREDICATELOCKTAG *) key;
1404  uint32 targethash;
1405 
1406  Assert(keysize == sizeof(PREDICATELOCKTAG));
1407 
1408  /* Look into the associated target object, and compute its hash code */
1409  targethash = PredicateLockTargetTagHashCode(&predicatelocktag->myTarget->tag);
1410 
1411  return PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash);
1412 }
1413 
1414 
1415 /*
1416  * GetPredicateLockStatusData
1417  * Return a table containing the internal state of the predicate
1418  * lock manager for use in pg_lock_status.
1419  *
1420  * Like GetLockStatusData, this function tries to hold the partition LWLocks
1421  * for as short a time as possible by returning two arrays that simply
1422  * contain the PREDICATELOCKTARGETTAG and SERIALIZABLEXACT for each lock
1423  * table entry. Multiple copies of the same PREDICATELOCKTARGETTAG and
1424  * SERIALIZABLEXACT will likely appear.
1425  */
1428 {
1430  int i;
1431  int els,
1432  el;
1433  HASH_SEQ_STATUS seqstat;
1434  PREDICATELOCK *predlock;
1435 
1437 
1438  /*
1439  * To ensure consistency, take simultaneous locks on all partition locks
1440  * in ascending order, then SerializableXactHashLock.
1441  */
1442  for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
1444  LWLockAcquire(SerializableXactHashLock, LW_SHARED);
1445 
1446  /* Get number of locks and allocate appropriately-sized arrays. */
1448  data->nelements = els;
1449  data->locktags = (PREDICATELOCKTARGETTAG *)
1450  palloc(sizeof(PREDICATELOCKTARGETTAG) * els);
1451  data->xacts = (SERIALIZABLEXACT *)
1452  palloc(sizeof(SERIALIZABLEXACT) * els);
1453 
1454 
1455  /* Scan through PredicateLockHash and copy contents */
1456  hash_seq_init(&seqstat, PredicateLockHash);
1457 
1458  el = 0;
1459 
1460  while ((predlock = (PREDICATELOCK *) hash_seq_search(&seqstat)))
1461  {
1462  data->locktags[el] = predlock->tag.myTarget->tag;
1463  data->xacts[el] = *predlock->tag.myXact;
1464  el++;
1465  }
1466 
1467  Assert(el == els);
1468 
1469  /* Release locks in reverse order */
1470  LWLockRelease(SerializableXactHashLock);
1471  for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
1473 
1474  return data;
1475 }
1476 
1477 /*
1478  * Free up shared memory structures by pushing the oldest sxact (the one at
1479  * the front of the SummarizeOldestCommittedSxact queue) into summary form.
1480  * Each call will free exactly one SERIALIZABLEXACT structure and may also
1481  * free one or more of these structures: SERIALIZABLEXID, PREDICATELOCK,
1482  * PREDICATELOCKTARGET, RWConflictData.
1483  */
1484 static void
1486 {
1487  SERIALIZABLEXACT *sxact;
1488 
1489  LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
1490 
1491  /*
1492  * This function is only called if there are no sxact slots available.
1493  * Some of them must belong to old, already-finished transactions, so
1494  * there should be something in FinishedSerializableTransactions list that
1495  * we can summarize. However, there's a race condition: while we were not
1496  * holding any locks, a transaction might have ended and cleaned up all
1497  * the finished sxact entries already, freeing up their sxact slots. In
1498  * that case, we have nothing to do here. The caller will find one of the
1499  * slots released by the other backend when it retries.
1500  */
1502  {
1503  LWLockRelease(SerializableFinishedListLock);
1504  return;
1505  }
1506 
1507  /*
1508  * Grab the first sxact off the finished list -- this will be the earliest
1509  * commit. Remove it from the list.
1510  */
1511  sxact = dlist_head_element(SERIALIZABLEXACT, finishedLink,
1514 
1515  /* Add to SLRU summary information. */
1516  if (TransactionIdIsValid(sxact->topXid) && !SxactIsReadOnly(sxact))
1517  SerialAdd(sxact->topXid, SxactHasConflictOut(sxact)
1519 
1520  /* Summarize and release the detail. */
1521  ReleaseOneSerializableXact(sxact, false, true);
1522 
1523  LWLockRelease(SerializableFinishedListLock);
1524 }
1525 
1526 /*
1527  * GetSafeSnapshot
1528  * Obtain and register a snapshot for a READ ONLY DEFERRABLE
1529  * transaction. Ensures that the snapshot is "safe", i.e. a
1530  * read-only transaction running on it can execute serializably
1531  * without further checks. This requires waiting for concurrent
1532  * transactions to complete, and retrying with a new snapshot if
1533  * one of them could possibly create a conflict.
1534  *
1535  * As with GetSerializableTransactionSnapshot (which this is a subroutine
1536  * for), the passed-in Snapshot pointer should reference a static data
1537  * area that can safely be passed to GetSnapshotData.
1538  */
1539 static Snapshot
1541 {
1542  Snapshot snapshot;
1543 
1545 
1546  while (true)
1547  {
1548  /*
1549  * GetSerializableTransactionSnapshotInt is going to call
1550  * GetSnapshotData, so we need to provide it the static snapshot area
1551  * our caller passed to us. The pointer returned is actually the same
1552  * one passed to it, but we avoid assuming that here.
1553  */
1554  snapshot = GetSerializableTransactionSnapshotInt(origSnapshot,
1555  NULL, InvalidPid);
1556 
1558  return snapshot; /* no concurrent r/w xacts; it's safe */
1559 
1560  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1561 
1562  /*
1563  * Wait for concurrent transactions to finish. Stop early if one of
1564  * them marked us as conflicted.
1565  */
1569  {
1570  LWLockRelease(SerializableXactHashLock);
1571  ProcWaitForSignal(WAIT_EVENT_SAFE_SNAPSHOT);
1572  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1573  }
1575 
1577  {
1578  LWLockRelease(SerializableXactHashLock);
1579  break; /* success */
1580  }
1581 
1582  LWLockRelease(SerializableXactHashLock);
1583 
1584  /* else, need to retry... */
1585  ereport(DEBUG2,
1587  errmsg_internal("deferrable snapshot was unsafe; trying a new one")));
1588  ReleasePredicateLocks(false, false);
1589  }
1590 
1591  /*
1592  * Now we have a safe snapshot, so we don't need to do any further checks.
1593  */
1595  ReleasePredicateLocks(false, true);
1596 
1597  return snapshot;
1598 }
1599 
1600 /*
1601  * GetSafeSnapshotBlockingPids
1602  * If the specified process is currently blocked in GetSafeSnapshot,
1603  * write the process IDs of all processes that it is blocked by
1604  * into the caller-supplied buffer output[]. The list is truncated at
1605  * output_size, and the number of PIDs written into the buffer is
1606  * returned. Returns zero if the given PID is not currently blocked
1607  * in GetSafeSnapshot.
1608  */
1609 int
1610 GetSafeSnapshotBlockingPids(int blocked_pid, int *output, int output_size)
1611 {
1612  int num_written = 0;
1613  dlist_iter iter;
1614  SERIALIZABLEXACT *blocking_sxact = NULL;
1615 
1616  LWLockAcquire(SerializableXactHashLock, LW_SHARED);
1617 
1618  /* Find blocked_pid's SERIALIZABLEXACT by linear search. */
1620  {
1621  SERIALIZABLEXACT *sxact =
1622  dlist_container(SERIALIZABLEXACT, xactLink, iter.cur);
1623 
1624  if (sxact->pid == blocked_pid)
1625  {
1626  blocking_sxact = sxact;
1627  break;
1628  }
1629  }
1630 
1631  /* Did we find it, and is it currently waiting in GetSafeSnapshot? */
1632  if (blocking_sxact != NULL && SxactIsDeferrableWaiting(blocking_sxact))
1633  {
1634  /* Traverse the list of possible unsafe conflicts collecting PIDs. */
1635  dlist_foreach(iter, &blocking_sxact->possibleUnsafeConflicts)
1636  {
1637  RWConflict possibleUnsafeConflict =
1638  dlist_container(RWConflictData, inLink, iter.cur);
1639 
1640  output[num_written++] = possibleUnsafeConflict->sxactOut->pid;
1641 
1642  if (num_written >= output_size)
1643  break;
1644  }
1645  }
1646 
1647  LWLockRelease(SerializableXactHashLock);
1648 
1649  return num_written;
1650 }
1651 
1652 /*
1653  * Acquire a snapshot that can be used for the current transaction.
1654  *
1655  * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact.
1656  * It should be current for this process and be contained in PredXact.
1657  *
1658  * The passed-in Snapshot pointer should reference a static data area that
1659  * can safely be passed to GetSnapshotData. The return value is actually
1660  * always this same pointer; no new snapshot data structure is allocated
1661  * within this function.
1662  */
1663 Snapshot
1665 {
1667 
1668  /*
1669  * Can't use serializable mode while recovery is still active, as it is,
1670  * for example, on a hot standby. We could get here despite the check in
1671  * check_transaction_isolation() if default_transaction_isolation is set
1672  * to serializable, so phrase the hint accordingly.
1673  */
1674  if (RecoveryInProgress())
1675  ereport(ERROR,
1676  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1677  errmsg("cannot use serializable mode in a hot standby"),
1678  errdetail("default_transaction_isolation is set to \"serializable\"."),
1679  errhint("You can use \"SET default_transaction_isolation = 'repeatable read'\" to change the default.")));
1680 
1681  /*
1682  * A special optimization is available for SERIALIZABLE READ ONLY
1683  * DEFERRABLE transactions -- we can wait for a suitable snapshot and
1684  * thereby avoid all SSI overhead once it's running.
1685  */
1687  return GetSafeSnapshot(snapshot);
1688 
1689  return GetSerializableTransactionSnapshotInt(snapshot,
1690  NULL, InvalidPid);
1691 }
1692 
1693 /*
1694  * Import a snapshot to be used for the current transaction.
1695  *
1696  * This is nearly the same as GetSerializableTransactionSnapshot, except that
1697  * we don't take a new snapshot, but rather use the data we're handed.
1698  *
1699  * The caller must have verified that the snapshot came from a serializable
1700  * transaction; and if we're read-write, the source transaction must not be
1701  * read-only.
1702  */
1703 void
1705  VirtualTransactionId *sourcevxid,
1706  int sourcepid)
1707 {
1709 
1710  /*
1711  * If this is called by parallel.c in a parallel worker, we don't want to
1712  * create a SERIALIZABLEXACT just yet because the leader's
1713  * SERIALIZABLEXACT will be installed with AttachSerializableXact(). We
1714  * also don't want to reject SERIALIZABLE READ ONLY DEFERRABLE in this
1715  * case, because the leader has already determined that the snapshot it
1716  * has passed us is safe. So there is nothing for us to do.
1717  */
1718  if (IsParallelWorker())
1719  return;
1720 
1721  /*
1722  * We do not allow SERIALIZABLE READ ONLY DEFERRABLE transactions to
1723  * import snapshots, since there's no way to wait for a safe snapshot when
1724  * we're using the snap we're told to. (XXX instead of throwing an error,
1725  * we could just ignore the XactDeferrable flag?)
1726  */
1728  ereport(ERROR,
1729  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1730  errmsg("a snapshot-importing transaction must not be READ ONLY DEFERRABLE")));
1731 
1732  (void) GetSerializableTransactionSnapshotInt(snapshot, sourcevxid,
1733  sourcepid);
1734 }
1735 
1736 /*
1737  * Guts of GetSerializableTransactionSnapshot
1738  *
1739  * If sourcevxid is valid, this is actually an import operation and we should
1740  * skip calling GetSnapshotData, because the snapshot contents are already
1741  * loaded up. HOWEVER: to avoid race conditions, we must check that the
1742  * source xact is still running after we acquire SerializableXactHashLock.
1743  * We do that by calling ProcArrayInstallImportedXmin.
1744  */
1745 static Snapshot
1747  VirtualTransactionId *sourcevxid,
1748  int sourcepid)
1749 {
1750  PGPROC *proc;
1751  VirtualTransactionId vxid;
1752  SERIALIZABLEXACT *sxact,
1753  *othersxact;
1754 
1755  /* We only do this for serializable transactions. Once. */
1757 
1759 
1760  /*
1761  * Since all parts of a serializable transaction must use the same
1762  * snapshot, it is too late to establish one after a parallel operation
1763  * has begun.
1764  */
1765  if (IsInParallelMode())
1766  elog(ERROR, "cannot establish serializable snapshot during a parallel operation");
1767 
1768  proc = MyProc;
1769  Assert(proc != NULL);
1770  GET_VXID_FROM_PGPROC(vxid, *proc);
1771 
1772  /*
1773  * First we get the sxact structure, which may involve looping and access
1774  * to the "finished" list to free a structure for use.
1775  *
1776  * We must hold SerializableXactHashLock when taking/checking the snapshot
1777  * to avoid race conditions, for much the same reasons that
1778  * GetSnapshotData takes the ProcArrayLock. Since we might have to
1779  * release SerializableXactHashLock to call SummarizeOldestCommittedSxact,
1780  * this means we have to create the sxact first, which is a bit annoying
1781  * (in particular, an elog(ERROR) in procarray.c would cause us to leak
1782  * the sxact). Consider refactoring to avoid this.
1783  */
1784 #ifdef TEST_SUMMARIZE_SERIAL
1786 #endif
1787  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1788  do
1789  {
1790  sxact = CreatePredXact();
1791  /* If null, push out committed sxact to SLRU summary & retry. */
1792  if (!sxact)
1793  {
1794  LWLockRelease(SerializableXactHashLock);
1796  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1797  }
1798  } while (!sxact);
1799 
1800  /* Get the snapshot, or check that it's safe to use */
1801  if (!sourcevxid)
1802  snapshot = GetSnapshotData(snapshot);
1803  else if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcevxid))
1804  {
1805  ReleasePredXact(sxact);
1806  LWLockRelease(SerializableXactHashLock);
1807  ereport(ERROR,
1808  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1809  errmsg("could not import the requested snapshot"),
1810  errdetail("The source process with PID %d is not running anymore.",
1811  sourcepid)));
1812  }
1813 
1814  /*
1815  * If there are no serializable transactions which are not read-only, we
1816  * can "opt out" of predicate locking and conflict checking for a
1817  * read-only transaction.
1818  *
1819  * The reason this is safe is that a read-only transaction can only become
1820  * part of a dangerous structure if it overlaps a writable transaction
1821  * which in turn overlaps a writable transaction which committed before
1822  * the read-only transaction started. A new writable transaction can
1823  * overlap this one, but it can't meet the other condition of overlapping
1824  * a transaction which committed before this one started.
1825  */
1827  {
1828  ReleasePredXact(sxact);
1829  LWLockRelease(SerializableXactHashLock);
1830  return snapshot;
1831  }
1832 
1833  /* Initialize the structure. */
1834  sxact->vxid = vxid;
1838  dlist_init(&(sxact->outConflicts));
1839  dlist_init(&(sxact->inConflicts));
1841  sxact->topXid = GetTopTransactionIdIfAny();
1843  sxact->xmin = snapshot->xmin;
1844  sxact->pid = MyProcPid;
1845  sxact->pgprocno = MyProcNumber;
1846  dlist_init(&sxact->predicateLocks);
1847  dlist_node_init(&sxact->finishedLink);
1848  sxact->flags = 0;
1849  if (XactReadOnly)
1850  {
1851  dlist_iter iter;
1852 
1853  sxact->flags |= SXACT_FLAG_READ_ONLY;
1854 
1855  /*
1856  * Register all concurrent r/w transactions as possible conflicts; if
1857  * all of them commit without any outgoing conflicts to earlier
1858  * transactions then this snapshot can be deemed safe (and we can run
1859  * without tracking predicate locks).
1860  */
1862  {
1863  othersxact = dlist_container(SERIALIZABLEXACT, xactLink, iter.cur);
1864 
1865  if (!SxactIsCommitted(othersxact)
1866  && !SxactIsDoomed(othersxact)
1867  && !SxactIsReadOnly(othersxact))
1868  {
1869  SetPossibleUnsafeConflict(sxact, othersxact);
1870  }
1871  }
1872 
1873  /*
1874  * If we didn't find any possibly unsafe conflicts because every
1875  * uncommitted writable transaction turned out to be doomed, then we
1876  * can "opt out" immediately. See comments above the earlier check
1877  * for PredXact->WritableSxactCount == 0.
1878  */
1880  {
1881  ReleasePredXact(sxact);
1882  LWLockRelease(SerializableXactHashLock);
1883  return snapshot;
1884  }
1885  }
1886  else
1887  {
1891  }
1892 
1893  /* Maintain serializable global xmin info. */
1895  {
1897  PredXact->SxactGlobalXmin = snapshot->xmin;
1899  SerialSetActiveSerXmin(snapshot->xmin);
1900  }
1901  else if (TransactionIdEquals(snapshot->xmin, PredXact->SxactGlobalXmin))
1902  {
1905  }
1906  else
1907  {
1909  }
1910 
1911  MySerializableXact = sxact;
1912  MyXactDidWrite = false; /* haven't written anything yet */
1913 
1914  LWLockRelease(SerializableXactHashLock);
1915 
1917 
1918  return snapshot;
1919 }
1920 
1921 static void
1923 {
1924  HASHCTL hash_ctl;
1925 
1926  /* Initialize the backend-local hash table of parent locks */
1927  Assert(LocalPredicateLockHash == NULL);
1928  hash_ctl.keysize = sizeof(PREDICATELOCKTARGETTAG);
1929  hash_ctl.entrysize = sizeof(LOCALPREDICATELOCK);
1930  LocalPredicateLockHash = hash_create("Local predicate lock",
1932  &hash_ctl,
1933  HASH_ELEM | HASH_BLOBS);
1934 }
1935 
1936 /*
1937  * Register the top level XID in SerializableXidHash.
1938  * Also store it for easy reference in MySerializableXact.
1939  */
1940 void
1942 {
1943  SERIALIZABLEXIDTAG sxidtag;
1944  SERIALIZABLEXID *sxid;
1945  bool found;
1946 
1947  /*
1948  * If we're not tracking predicate lock data for this transaction, we
1949  * should ignore the request and return quickly.
1950  */
1952  return;
1953 
1954  /* We should have a valid XID and be at the top level. */
1956 
1957  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1958 
1959  /* This should only be done once per transaction. */
1961 
1962  MySerializableXact->topXid = xid;
1963 
1964  sxidtag.xid = xid;
1966  &sxidtag,
1967  HASH_ENTER, &found);
1968  Assert(!found);
1969 
1970  /* Initialize the structure. */
1971  sxid->myXact = MySerializableXact;
1972  LWLockRelease(SerializableXactHashLock);
1973 }
1974 
1975 
1976 /*
1977  * Check whether there are any predicate locks held by any transaction
1978  * for the page at the given block number.
1979  *
1980  * Note that the transaction may be completed but not yet subject to
1981  * cleanup due to overlapping serializable transactions. This must
1982  * return valid information regardless of transaction isolation level.
1983  *
1984  * Also note that this doesn't check for a conflicting relation lock,
1985  * just a lock specifically on the given page.
1986  *
1987  * One use is to support proper behavior during GiST index vacuum.
1988  */
1989 bool
1991 {
1992  PREDICATELOCKTARGETTAG targettag;
1993  uint32 targettaghash;
1994  LWLock *partitionLock;
1995  PREDICATELOCKTARGET *target;
1996 
1998  relation->rd_locator.dbOid,
1999  relation->rd_id,
2000  blkno);
2001 
2002  targettaghash = PredicateLockTargetTagHashCode(&targettag);
2003  partitionLock = PredicateLockHashPartitionLock(targettaghash);
2004  LWLockAcquire(partitionLock, LW_SHARED);
2005  target = (PREDICATELOCKTARGET *)
2007  &targettag, targettaghash,
2008  HASH_FIND, NULL);
2009  LWLockRelease(partitionLock);
2010 
2011  return (target != NULL);
2012 }
2013 
2014 
2015 /*
2016  * Check whether a particular lock is held by this transaction.
2017  *
2018  * Important note: this function may return false even if the lock is
2019  * being held, because it uses the local lock table which is not
2020  * updated if another transaction modifies our lock list (e.g. to
2021  * split an index page). It can also return true when a coarser
2022  * granularity lock that covers this target is being held. Be careful
2023  * to only use this function in circumstances where such errors are
2024  * acceptable!
2025  */
2026 static bool
2028 {
2029  LOCALPREDICATELOCK *lock;
2030 
2031  /* check local hash table */
2033  targettag,
2034  HASH_FIND, NULL);
2035 
2036  if (!lock)
2037  return false;
2038 
2039  /*
2040  * Found entry in the table, but still need to check whether it's actually
2041  * held -- it could just be a parent of some held lock.
2042  */
2043  return lock->held;
2044 }
2045 
2046 /*
2047  * Return the parent lock tag in the lock hierarchy: the next coarser
2048  * lock that covers the provided tag.
2049  *
2050  * Returns true and sets *parent to the parent tag if one exists,
2051  * returns false if none exists.
2052  */
2053 static bool
2055  PREDICATELOCKTARGETTAG *parent)
2056 {
2057  switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
2058  {
2059  case PREDLOCKTAG_RELATION:
2060  /* relation locks have no parent lock */
2061  return false;
2062 
2063  case PREDLOCKTAG_PAGE:
2064  /* parent lock is relation lock */
2068 
2069  return true;
2070 
2071  case PREDLOCKTAG_TUPLE:
2072  /* parent lock is page lock */
2077  return true;
2078  }
2079 
2080  /* not reachable */
2081  Assert(false);
2082  return false;
2083 }
2084 
2085 /*
2086  * Check whether the lock we are considering is already covered by a
2087  * coarser lock for our transaction.
2088  *
2089  * Like PredicateLockExists, this function might return a false
2090  * negative, but it will never return a false positive.
2091  */
2092 static bool
2094 {
2095  PREDICATELOCKTARGETTAG targettag,
2096  parenttag;
2097 
2098  targettag = *newtargettag;
2099 
2100  /* check parents iteratively until no more */
2101  while (GetParentPredicateLockTag(&targettag, &parenttag))
2102  {
2103  targettag = parenttag;
2104  if (PredicateLockExists(&targettag))
2105  return true;
2106  }
2107 
2108  /* no more parents to check; lock is not covered */
2109  return false;
2110 }
2111 
2112 /*
2113  * Remove the dummy entry from the predicate lock target hash, to free up some
2114  * scratch space. The caller must be holding SerializablePredicateListLock,
2115  * and must restore the entry with RestoreScratchTarget() before releasing the
2116  * lock.
2117  *
2118  * If lockheld is true, the caller is already holding the partition lock
2119  * of the partition containing the scratch entry.
2120  */
2121 static void
2122 RemoveScratchTarget(bool lockheld)
2123 {
2124  bool found;
2125 
2126  Assert(LWLockHeldByMe(SerializablePredicateListLock));
2127 
2128  if (!lockheld)
2133  HASH_REMOVE, &found);
2134  Assert(found);
2135  if (!lockheld)
2137 }
2138 
2139 /*
2140  * Re-insert the dummy entry in predicate lock target hash.
2141  */
2142 static void
2143 RestoreScratchTarget(bool lockheld)
2144 {
2145  bool found;
2146 
2147  Assert(LWLockHeldByMe(SerializablePredicateListLock));
2148 
2149  if (!lockheld)
2154  HASH_ENTER, &found);
2155  Assert(!found);
2156  if (!lockheld)
2158 }
2159 
2160 /*
2161  * Check whether the list of related predicate locks is empty for a
2162  * predicate lock target, and remove the target if it is.
2163  */
2164 static void
2166 {
2168 
2169  Assert(LWLockHeldByMe(SerializablePredicateListLock));
2170 
2171  /* Can't remove it until no locks at this target. */
2172  if (!dlist_is_empty(&target->predicateLocks))
2173  return;
2174 
2175  /* Actually remove the target. */
2177  &target->tag,
2178  targettaghash,
2179  HASH_REMOVE, NULL);
2180  Assert(rmtarget == target);
2181 }
2182 
2183 /*
2184  * Delete child target locks owned by this process.
2185  * This implementation is assuming that the usage of each target tag field
2186  * is uniform. No need to make this hard if we don't have to.
2187  *
2188  * We acquire an LWLock in the case of parallel mode, because worker
2189  * backends have access to the leader's SERIALIZABLEXACT. Otherwise,
2190  * we aren't acquiring LWLocks for the predicate lock or lock
2191  * target structures associated with this transaction unless we're going
2192  * to modify them, because no other process is permitted to modify our
2193  * locks.
2194  */
2195 static void
2197 {
2198  SERIALIZABLEXACT *sxact;
2199  PREDICATELOCK *predlock;
2200  dlist_mutable_iter iter;
2201 
2202  LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
2203  sxact = MySerializableXact;
2204  if (IsInParallelMode())
2206 
2207  dlist_foreach_modify(iter, &sxact->predicateLocks)
2208  {
2209  PREDICATELOCKTAG oldlocktag;
2210  PREDICATELOCKTARGET *oldtarget;
2211  PREDICATELOCKTARGETTAG oldtargettag;
2212 
2213  predlock = dlist_container(PREDICATELOCK, xactLink, iter.cur);
2214 
2215  oldlocktag = predlock->tag;
2216  Assert(oldlocktag.myXact == sxact);
2217  oldtarget = oldlocktag.myTarget;
2218  oldtargettag = oldtarget->tag;
2219 
2220  if (TargetTagIsCoveredBy(oldtargettag, *newtargettag))
2221  {
2222  uint32 oldtargettaghash;
2223  LWLock *partitionLock;
2225 
2226  oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
2227  partitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
2228 
2229  LWLockAcquire(partitionLock, LW_EXCLUSIVE);
2230 
2231  dlist_delete(&predlock->xactLink);
2232  dlist_delete(&predlock->targetLink);
2233  rmpredlock = hash_search_with_hash_value
2235  &oldlocktag,
2237  oldtargettaghash),
2238  HASH_REMOVE, NULL);
2239  Assert(rmpredlock == predlock);
2240 
2241  RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
2242 
2243  LWLockRelease(partitionLock);
2244 
2245  DecrementParentLocks(&oldtargettag);
2246  }
2247  }
2248  if (IsInParallelMode())
2250  LWLockRelease(SerializablePredicateListLock);
2251 }
2252 
2253 /*
2254  * Returns the promotion limit for a given predicate lock target. This is the
2255  * max number of descendant locks allowed before promoting to the specified
2256  * tag. Note that the limit includes non-direct descendants (e.g., both tuples
2257  * and pages for a relation lock).
2258  *
2259  * Currently the default limit is 2 for a page lock, and half of the value of
2260  * max_pred_locks_per_transaction - 1 for a relation lock, to match behavior
2261  * of earlier releases when upgrading.
2262  *
2263  * TODO SSI: We should probably add additional GUCs to allow a maximum ratio
2264  * of page and tuple locks based on the pages in a relation, and the maximum
2265  * ratio of tuple locks to tuples in a page. This would provide more
2266  * generally "balanced" allocation of locks to where they are most useful,
2267  * while still allowing the absolute numbers to prevent one relation from
2268  * tying up all predicate lock resources.
2269  */
2270 static int
2272 {
2273  switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
2274  {
2275  case PREDLOCKTAG_RELATION:
2280 
2281  case PREDLOCKTAG_PAGE:
2283 
2284  case PREDLOCKTAG_TUPLE:
2285 
2286  /*
2287  * not reachable: nothing is finer-granularity than a tuple, so we
2288  * should never try to promote to it.
2289  */
2290  Assert(false);
2291  return 0;
2292  }
2293 
2294  /* not reachable */
2295  Assert(false);
2296  return 0;
2297 }
2298 
2299 /*
2300  * For all ancestors of a newly-acquired predicate lock, increment
2301  * their child count in the parent hash table. If any of them have
2302  * more descendants than their promotion threshold, acquire the
2303  * coarsest such lock.
2304  *
2305  * Returns true if a parent lock was acquired and false otherwise.
2306  */
2307 static bool
2309 {
2310  PREDICATELOCKTARGETTAG targettag,
2311  nexttag,
2312  promotiontag;
2313  LOCALPREDICATELOCK *parentlock;
2314  bool found,
2315  promote;
2316 
2317  promote = false;
2318 
2319  targettag = *reqtag;
2320 
2321  /* check parents iteratively */
2322  while (GetParentPredicateLockTag(&targettag, &nexttag))
2323  {
2324  targettag = nexttag;
2326  &targettag,
2327  HASH_ENTER,
2328  &found);
2329  if (!found)
2330  {
2331  parentlock->held = false;
2332  parentlock->childLocks = 1;
2333  }
2334  else
2335  parentlock->childLocks++;
2336 
2337  if (parentlock->childLocks >
2338  MaxPredicateChildLocks(&targettag))
2339  {
2340  /*
2341  * We should promote to this parent lock. Continue to check its
2342  * ancestors, however, both to get their child counts right and to
2343  * check whether we should just go ahead and promote to one of
2344  * them.
2345  */
2346  promotiontag = targettag;
2347  promote = true;
2348  }
2349  }
2350 
2351  if (promote)
2352  {
2353  /* acquire coarsest ancestor eligible for promotion */
2354  PredicateLockAcquire(&promotiontag);
2355  return true;
2356  }
2357  else
2358  return false;
2359 }
2360 
2361 /*
2362  * When releasing a lock, decrement the child count on all ancestor
2363  * locks.
2364  *
2365  * This is called only when releasing a lock via
2366  * DeleteChildTargetLocks (i.e. when a lock becomes redundant because
2367  * we've acquired its parent, possibly due to promotion) or when a new
2368  * MVCC write lock makes the predicate lock unnecessary. There's no
2369  * point in calling it when locks are released at transaction end, as
2370  * this information is no longer needed.
2371  */
2372 static void
2374 {
2375  PREDICATELOCKTARGETTAG parenttag,
2376  nexttag;
2377 
2378  parenttag = *targettag;
2379 
2380  while (GetParentPredicateLockTag(&parenttag, &nexttag))
2381  {
2382  uint32 targettaghash;
2383  LOCALPREDICATELOCK *parentlock,
2384  *rmlock PG_USED_FOR_ASSERTS_ONLY;
2385 
2386  parenttag = nexttag;
2387  targettaghash = PredicateLockTargetTagHashCode(&parenttag);
2388  parentlock = (LOCALPREDICATELOCK *)
2390  &parenttag, targettaghash,
2391  HASH_FIND, NULL);
2392 
2393  /*
2394  * There's a small chance the parent lock doesn't exist in the lock
2395  * table. This can happen if we prematurely removed it because an
2396  * index split caused the child refcount to be off.
2397  */
2398  if (parentlock == NULL)
2399  continue;
2400 
2401  parentlock->childLocks--;
2402 
2403  /*
2404  * Under similar circumstances the parent lock's refcount might be
2405  * zero. This only happens if we're holding that lock (otherwise we
2406  * would have removed the entry).
2407  */
2408  if (parentlock->childLocks < 0)
2409  {
2410  Assert(parentlock->held);
2411  parentlock->childLocks = 0;
2412  }
2413 
2414  if ((parentlock->childLocks == 0) && (!parentlock->held))
2415  {
2416  rmlock = (LOCALPREDICATELOCK *)
2418  &parenttag, targettaghash,
2419  HASH_REMOVE, NULL);
2420  Assert(rmlock == parentlock);
2421  }
2422  }
2423 }
2424 
2425 /*
2426  * Indicate that a predicate lock on the given target is held by the
2427  * specified transaction. Has no effect if the lock is already held.
2428  *
2429  * This updates the lock table and the sxact's lock list, and creates
2430  * the lock target if necessary, but does *not* do anything related to
2431  * granularity promotion or the local lock table. See
2432  * PredicateLockAcquire for that.
2433  */
2434 static void
2436  uint32 targettaghash,
2437  SERIALIZABLEXACT *sxact)
2438 {
2439  PREDICATELOCKTARGET *target;
2440  PREDICATELOCKTAG locktag;
2441  PREDICATELOCK *lock;
2442  LWLock *partitionLock;
2443  bool found;
2444 
2445  partitionLock = PredicateLockHashPartitionLock(targettaghash);
2446 
2447  LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
2448  if (IsInParallelMode())
2450  LWLockAcquire(partitionLock, LW_EXCLUSIVE);
2451 
2452  /* Make sure that the target is represented. */
2453  target = (PREDICATELOCKTARGET *)
2455  targettag, targettaghash,
2456  HASH_ENTER_NULL, &found);
2457  if (!target)
2458  ereport(ERROR,
2459  (errcode(ERRCODE_OUT_OF_MEMORY),
2460  errmsg("out of shared memory"),
2461  errhint("You might need to increase %s.", "max_pred_locks_per_transaction")));
2462  if (!found)
2463  dlist_init(&target->predicateLocks);
2464 
2465  /* We've got the sxact and target, make sure they're joined. */
2466  locktag.myTarget = target;
2467  locktag.myXact = sxact;
2468  lock = (PREDICATELOCK *)
2470  PredicateLockHashCodeFromTargetHashCode(&locktag, targettaghash),
2471  HASH_ENTER_NULL, &found);
2472  if (!lock)
2473  ereport(ERROR,
2474  (errcode(ERRCODE_OUT_OF_MEMORY),
2475  errmsg("out of shared memory"),
2476  errhint("You might need to increase %s.", "max_pred_locks_per_transaction")));
2477 
2478  if (!found)
2479  {
2480  dlist_push_tail(&target->predicateLocks, &lock->targetLink);
2481  dlist_push_tail(&sxact->predicateLocks, &lock->xactLink);
2483  }
2484 
2485  LWLockRelease(partitionLock);
2486  if (IsInParallelMode())
2488  LWLockRelease(SerializablePredicateListLock);
2489 }
2490 
2491 /*
2492  * Acquire a predicate lock on the specified target for the current
2493  * connection if not already held. This updates the local lock table
2494  * and uses it to implement granularity promotion. It will consolidate
2495  * multiple locks into a coarser lock if warranted, and will release
2496  * any finer-grained locks covered by the new one.
2497  */
2498 static void
2500 {
2501  uint32 targettaghash;
2502  bool found;
2503  LOCALPREDICATELOCK *locallock;
2504 
2505  /* Do we have the lock already, or a covering lock? */
2506  if (PredicateLockExists(targettag))
2507  return;
2508 
2509  if (CoarserLockCovers(targettag))
2510  return;
2511 
2512  /* the same hash and LW lock apply to the lock target and the local lock. */
2513  targettaghash = PredicateLockTargetTagHashCode(targettag);
2514 
2515  /* Acquire lock in local table */
2516  locallock = (LOCALPREDICATELOCK *)
2518  targettag, targettaghash,
2519  HASH_ENTER, &found);
2520  locallock->held = true;
2521  if (!found)
2522  locallock->childLocks = 0;
2523 
2524  /* Actually create the lock */
2525  CreatePredicateLock(targettag, targettaghash, MySerializableXact);
2526 
2527  /*
2528  * Lock has been acquired. Check whether it should be promoted to a
2529  * coarser granularity, or whether there are finer-granularity locks to
2530  * clean up.
2531  */
2532  if (CheckAndPromotePredicateLockRequest(targettag))
2533  {
2534  /*
2535  * Lock request was promoted to a coarser-granularity lock, and that
2536  * lock was acquired. It will delete this lock and any of its
2537  * children, so we're done.
2538  */
2539  }
2540  else
2541  {
2542  /* Clean up any finer-granularity locks */
2544  DeleteChildTargetLocks(targettag);
2545  }
2546 }
2547 
2548 
2549 /*
2550  * PredicateLockRelation
2551  *
2552  * Gets a predicate lock at the relation level.
2553  * Skip if not in full serializable transaction isolation level.
2554  * Skip if this is a temporary table.
2555  * Clear any finer-grained predicate locks this session has on the relation.
2556  */
2557 void
2559 {
2561 
2562  if (!SerializationNeededForRead(relation, snapshot))
2563  return;
2564 
2566  relation->rd_locator.dbOid,
2567  relation->rd_id);
2568  PredicateLockAcquire(&tag);
2569 }
2570 
2571 /*
2572  * PredicateLockPage
2573  *
2574  * Gets a predicate lock at the page level.
2575  * Skip if not in full serializable transaction isolation level.
2576  * Skip if this is a temporary table.
2577  * Skip if a coarser predicate lock already covers this page.
2578  * Clear any finer-grained predicate locks this session has on the relation.
2579  */
2580 void
2582 {
2584 
2585  if (!SerializationNeededForRead(relation, snapshot))
2586  return;
2587 
2589  relation->rd_locator.dbOid,
2590  relation->rd_id,
2591  blkno);
2592  PredicateLockAcquire(&tag);
2593 }
2594 
2595 /*
2596  * PredicateLockTID
2597  *
2598  * Gets a predicate lock at the tuple level.
2599  * Skip if not in full serializable transaction isolation level.
2600  * Skip if this is a temporary table.
2601  */
2602 void
2604  TransactionId tuple_xid)
2605 {
2607 
2608  if (!SerializationNeededForRead(relation, snapshot))
2609  return;
2610 
2611  /*
2612  * Return if this xact wrote it.
2613  */
2614  if (relation->rd_index == NULL)
2615  {
2616  /* If we wrote it; we already have a write lock. */
2617  if (TransactionIdIsCurrentTransactionId(tuple_xid))
2618  return;
2619  }
2620 
2621  /*
2622  * Do quick-but-not-definitive test for a relation lock first. This will
2623  * never cause a return when the relation is *not* locked, but will
2624  * occasionally let the check continue when there really *is* a relation
2625  * level lock.
2626  */
2628  relation->rd_locator.dbOid,
2629  relation->rd_id);
2630  if (PredicateLockExists(&tag))
2631  return;
2632 
2634  relation->rd_locator.dbOid,
2635  relation->rd_id,
2638  PredicateLockAcquire(&tag);
2639 }
2640 
2641 
2642 /*
2643  * DeleteLockTarget
2644  *
2645  * Remove a predicate lock target along with any locks held for it.
2646  *
2647  * Caller must hold SerializablePredicateListLock and the
2648  * appropriate hash partition lock for the target.
2649  */
2650 static void
2652 {
2653  dlist_mutable_iter iter;
2654 
2655  Assert(LWLockHeldByMeInMode(SerializablePredicateListLock,
2656  LW_EXCLUSIVE));
2658 
2659  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
2660 
2661  dlist_foreach_modify(iter, &target->predicateLocks)
2662  {
2663  PREDICATELOCK *predlock =
2664  dlist_container(PREDICATELOCK, targetLink, iter.cur);
2665  bool found;
2666 
2667  dlist_delete(&(predlock->xactLink));
2668  dlist_delete(&(predlock->targetLink));
2669 
2672  &predlock->tag,
2674  targettaghash),
2675  HASH_REMOVE, &found);
2676  Assert(found);
2677  }
2678  LWLockRelease(SerializableXactHashLock);
2679 
2680  /* Remove the target itself, if possible. */
2681  RemoveTargetIfNoLongerUsed(target, targettaghash);
2682 }
2683 
2684 
2685 /*
2686  * TransferPredicateLocksToNewTarget
2687  *
2688  * Move or copy all the predicate locks for a lock target, for use by
2689  * index page splits/combines and other things that create or replace
2690  * lock targets. If 'removeOld' is true, the old locks and the target
2691  * will be removed.
2692  *
2693  * Returns true on success, or false if we ran out of shared memory to
2694  * allocate the new target or locks. Guaranteed to always succeed if
2695  * removeOld is set (by using the scratch entry in PredicateLockTargetHash
2696  * for scratch space).
2697  *
2698  * Warning: the "removeOld" option should be used only with care,
2699  * because this function does not (indeed, can not) update other
2700  * backends' LocalPredicateLockHash. If we are only adding new
2701  * entries, this is not a problem: the local lock table is used only
2702  * as a hint, so missing entries for locks that are held are
2703  * OK. Having entries for locks that are no longer held, as can happen
2704  * when using "removeOld", is not in general OK. We can only use it
2705  * safely when replacing a lock with a coarser-granularity lock that
2706  * covers it, or if we are absolutely certain that no one will need to
2707  * refer to that lock in the future.
2708  *
2709  * Caller must hold SerializablePredicateListLock exclusively.
2710  */
2711 static bool
2713  PREDICATELOCKTARGETTAG newtargettag,
2714  bool removeOld)
2715 {
2716  uint32 oldtargettaghash;
2717  LWLock *oldpartitionLock;
2718  PREDICATELOCKTARGET *oldtarget;
2719  uint32 newtargettaghash;
2720  LWLock *newpartitionLock;
2721  bool found;
2722  bool outOfShmem = false;
2723 
2724  Assert(LWLockHeldByMeInMode(SerializablePredicateListLock,
2725  LW_EXCLUSIVE));
2726 
2727  oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
2728  newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
2729  oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
2730  newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
2731 
2732  if (removeOld)
2733  {
2734  /*
2735  * Remove the dummy entry to give us scratch space, so we know we'll
2736  * be able to create the new lock target.
2737  */
2738  RemoveScratchTarget(false);
2739  }
2740 
2741  /*
2742  * We must get the partition locks in ascending sequence to avoid
2743  * deadlocks. If old and new partitions are the same, we must request the
2744  * lock only once.
2745  */
2746  if (oldpartitionLock < newpartitionLock)
2747  {
2748  LWLockAcquire(oldpartitionLock,
2749  (removeOld ? LW_EXCLUSIVE : LW_SHARED));
2750  LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
2751  }
2752  else if (oldpartitionLock > newpartitionLock)
2753  {
2754  LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
2755  LWLockAcquire(oldpartitionLock,
2756  (removeOld ? LW_EXCLUSIVE : LW_SHARED));
2757  }
2758  else
2759  LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
2760 
2761  /*
2762  * Look for the old target. If not found, that's OK; no predicate locks
2763  * are affected, so we can just clean up and return. If it does exist,
2764  * walk its list of predicate locks and move or copy them to the new
2765  * target.
2766  */
2768  &oldtargettag,
2769  oldtargettaghash,
2770  HASH_FIND, NULL);
2771 
2772  if (oldtarget)
2773  {
2774  PREDICATELOCKTARGET *newtarget;
2775  PREDICATELOCKTAG newpredlocktag;
2776  dlist_mutable_iter iter;
2777 
2779  &newtargettag,
2780  newtargettaghash,
2781  HASH_ENTER_NULL, &found);
2782 
2783  if (!newtarget)
2784  {
2785  /* Failed to allocate due to insufficient shmem */
2786  outOfShmem = true;
2787  goto exit;
2788  }
2789 
2790  /* If we created a new entry, initialize it */
2791  if (!found)
2792  dlist_init(&newtarget->predicateLocks);
2793 
2794  newpredlocktag.myTarget = newtarget;
2795 
2796  /*
2797  * Loop through all the locks on the old target, replacing them with
2798  * locks on the new target.
2799  */
2800  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
2801 
2802  dlist_foreach_modify(iter, &oldtarget->predicateLocks)
2803  {
2804  PREDICATELOCK *oldpredlock =
2805  dlist_container(PREDICATELOCK, targetLink, iter.cur);
2806  PREDICATELOCK *newpredlock;
2807  SerCommitSeqNo oldCommitSeqNo = oldpredlock->commitSeqNo;
2808 
2809  newpredlocktag.myXact = oldpredlock->tag.myXact;
2810 
2811  if (removeOld)
2812  {
2813  dlist_delete(&(oldpredlock->xactLink));
2814  dlist_delete(&(oldpredlock->targetLink));
2815 
2818  &oldpredlock->tag,
2820  oldtargettaghash),
2821  HASH_REMOVE, &found);
2822  Assert(found);
2823  }
2824 
2825  newpredlock = (PREDICATELOCK *)
2827  &newpredlocktag,
2829  newtargettaghash),
2831  &found);
2832  if (!newpredlock)
2833  {
2834  /* Out of shared memory. Undo what we've done so far. */
2835  LWLockRelease(SerializableXactHashLock);
2836  DeleteLockTarget(newtarget, newtargettaghash);
2837  outOfShmem = true;
2838  goto exit;
2839  }
2840  if (!found)
2841  {
2842  dlist_push_tail(&(newtarget->predicateLocks),
2843  &(newpredlock->targetLink));
2844  dlist_push_tail(&(newpredlocktag.myXact->predicateLocks),
2845  &(newpredlock->xactLink));
2846  newpredlock->commitSeqNo = oldCommitSeqNo;
2847  }
2848  else
2849  {
2850  if (newpredlock->commitSeqNo < oldCommitSeqNo)
2851  newpredlock->commitSeqNo = oldCommitSeqNo;
2852  }
2853 
2854  Assert(newpredlock->commitSeqNo != 0);
2855  Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
2856  || (newpredlock->tag.myXact == OldCommittedSxact));
2857  }
2858  LWLockRelease(SerializableXactHashLock);
2859 
2860  if (removeOld)
2861  {
2862  Assert(dlist_is_empty(&oldtarget->predicateLocks));
2863  RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
2864  }
2865  }
2866 
2867 
2868 exit:
2869  /* Release partition locks in reverse order of acquisition. */
2870  if (oldpartitionLock < newpartitionLock)
2871  {
2872  LWLockRelease(newpartitionLock);
2873  LWLockRelease(oldpartitionLock);
2874  }
2875  else if (oldpartitionLock > newpartitionLock)
2876  {
2877  LWLockRelease(oldpartitionLock);
2878  LWLockRelease(newpartitionLock);
2879  }
2880  else
2881  LWLockRelease(newpartitionLock);
2882 
2883  if (removeOld)
2884  {
2885  /* We shouldn't run out of memory if we're moving locks */
2886  Assert(!outOfShmem);
2887 
2888  /* Put the scratch entry back */
2889  RestoreScratchTarget(false);
2890  }
2891 
2892  return !outOfShmem;
2893 }
2894 
2895 /*
2896  * Drop all predicate locks of any granularity from the specified relation,
2897  * which can be a heap relation or an index relation. If 'transfer' is true,
2898  * acquire a relation lock on the heap for any transactions with any lock(s)
2899  * on the specified relation.
2900  *
2901  * This requires grabbing a lot of LW locks and scanning the entire lock
2902  * target table for matches. That makes this more expensive than most
2903  * predicate lock management functions, but it will only be called for DDL
2904  * type commands that are expensive anyway, and there are fast returns when
2905  * no serializable transactions are active or the relation is temporary.
2906  *
2907  * We don't use the TransferPredicateLocksToNewTarget function because it
2908  * acquires its own locks on the partitions of the two targets involved,
2909  * and we'll already be holding all partition locks.
2910  *
2911  * We can't throw an error from here, because the call could be from a
2912  * transaction which is not serializable.
2913  *
2914  * NOTE: This is currently only called with transfer set to true, but that may
2915  * change. If we decide to clean up the locks from a table on commit of a
2916  * transaction which executed DROP TABLE, the false condition will be useful.
2917  */
2918 static void
2920 {
2921  HASH_SEQ_STATUS seqstat;
2922  PREDICATELOCKTARGET *oldtarget;
2923  PREDICATELOCKTARGET *heaptarget;
2924  Oid dbId;
2925  Oid relId;
2926  Oid heapId;
2927  int i;
2928  bool isIndex;
2929  bool found;
2930  uint32 heaptargettaghash;
2931 
2932  /*
2933  * Bail out quickly if there are no serializable transactions running.
2934  * It's safe to check this without taking locks because the caller is
2935  * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which
2936  * would matter here can be acquired while that is held.
2937  */
2939  return;
2940 
2941  if (!PredicateLockingNeededForRelation(relation))
2942  return;
2943 
2944  dbId = relation->rd_locator.dbOid;
2945  relId = relation->rd_id;
2946  if (relation->rd_index == NULL)
2947  {
2948  isIndex = false;
2949  heapId = relId;
2950  }
2951  else
2952  {
2953  isIndex = true;
2954  heapId = relation->rd_index->indrelid;
2955  }
2956  Assert(heapId != InvalidOid);
2957  Assert(transfer || !isIndex); /* index OID only makes sense with
2958  * transfer */
2959 
2960  /* Retrieve first time needed, then keep. */
2961  heaptargettaghash = 0;
2962  heaptarget = NULL;
2963 
2964  /* Acquire locks on all lock partitions */
2965  LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
2966  for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
2968  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
2969 
2970  /*
2971  * Remove the dummy entry to give us scratch space, so we know we'll be
2972  * able to create the new lock target.
2973  */
2974  if (transfer)
2975  RemoveScratchTarget(true);
2976 
2977  /* Scan through target map */
2979 
2980  while ((oldtarget = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
2981  {
2982  dlist_mutable_iter iter;
2983 
2984  /*
2985  * Check whether this is a target which needs attention.
2986  */
2987  if (GET_PREDICATELOCKTARGETTAG_RELATION(oldtarget->tag) != relId)
2988  continue; /* wrong relation id */
2989  if (GET_PREDICATELOCKTARGETTAG_DB(oldtarget->tag) != dbId)
2990  continue; /* wrong database id */
2991  if (transfer && !isIndex
2993  continue; /* already the right lock */
2994 
2995  /*
2996  * If we made it here, we have work to do. We make sure the heap
2997  * relation lock exists, then we walk the list of predicate locks for
2998  * the old target we found, moving all locks to the heap relation lock
2999  * -- unless they already hold that.
3000  */
3001 
3002  /*
3003  * First make sure we have the heap relation target. We only need to
3004  * do this once.
3005  */
3006  if (transfer && heaptarget == NULL)
3007  {
3008  PREDICATELOCKTARGETTAG heaptargettag;
3009 
3010  SET_PREDICATELOCKTARGETTAG_RELATION(heaptargettag, dbId, heapId);
3011  heaptargettaghash = PredicateLockTargetTagHashCode(&heaptargettag);
3013  &heaptargettag,
3014  heaptargettaghash,
3015  HASH_ENTER, &found);
3016  if (!found)
3017  dlist_init(&heaptarget->predicateLocks);
3018  }
3019 
3020  /*
3021  * Loop through all the locks on the old target, replacing them with
3022  * locks on the new target.
3023  */
3024  dlist_foreach_modify(iter, &oldtarget->predicateLocks)
3025  {
3026  PREDICATELOCK *oldpredlock =
3027  dlist_container(PREDICATELOCK, targetLink, iter.cur);
3028  PREDICATELOCK *newpredlock;
3029  SerCommitSeqNo oldCommitSeqNo;
3030  SERIALIZABLEXACT *oldXact;
3031 
3032  /*
3033  * Remove the old lock first. This avoids the chance of running
3034  * out of lock structure entries for the hash table.
3035  */
3036  oldCommitSeqNo = oldpredlock->commitSeqNo;
3037  oldXact = oldpredlock->tag.myXact;
3038 
3039  dlist_delete(&(oldpredlock->xactLink));
3040 
3041  /*
3042  * No need for retail delete from oldtarget list, we're removing
3043  * the whole target anyway.
3044  */
3046  &oldpredlock->tag,
3047  HASH_REMOVE, &found);
3048  Assert(found);
3049 
3050  if (transfer)
3051  {
3052  PREDICATELOCKTAG newpredlocktag;
3053 
3054  newpredlocktag.myTarget = heaptarget;
3055  newpredlocktag.myXact = oldXact;
3056  newpredlock = (PREDICATELOCK *)
3058  &newpredlocktag,
3060  heaptargettaghash),
3061  HASH_ENTER,
3062  &found);
3063  if (!found)
3064  {
3065  dlist_push_tail(&(heaptarget->predicateLocks),
3066  &(newpredlock->targetLink));
3067  dlist_push_tail(&(newpredlocktag.myXact->predicateLocks),
3068  &(newpredlock->xactLink));
3069  newpredlock->commitSeqNo = oldCommitSeqNo;
3070  }
3071  else
3072  {
3073  if (newpredlock->commitSeqNo < oldCommitSeqNo)
3074  newpredlock->commitSeqNo = oldCommitSeqNo;
3075  }
3076 
3077  Assert(newpredlock->commitSeqNo != 0);
3078  Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
3079  || (newpredlock->tag.myXact == OldCommittedSxact));
3080  }
3081  }
3082 
3084  &found);
3085  Assert(found);
3086  }
3087 
3088  /* Put the scratch entry back */
3089  if (transfer)
3090  RestoreScratchTarget(true);
3091 
3092  /* Release locks in reverse order */
3093  LWLockRelease(SerializableXactHashLock);
3094  for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
3096  LWLockRelease(SerializablePredicateListLock);
3097 }
3098 
3099 /*
3100  * TransferPredicateLocksToHeapRelation
3101  * For all transactions, transfer all predicate locks for the given
3102  * relation to a single relation lock on the heap.
3103  */
3104 void
3106 {
3107  DropAllPredicateLocksFromTable(relation, true);
3108 }
3109 
3110 
3111 /*
3112  * PredicateLockPageSplit
3113  *
3114  * Copies any predicate locks for the old page to the new page.
3115  * Skip if this is a temporary table or toast table.
3116  *
3117  * NOTE: A page split (or overflow) affects all serializable transactions,
3118  * even if it occurs in the context of another transaction isolation level.
3119  *
3120  * NOTE: This currently leaves the local copy of the locks without
3121  * information on the new lock which is in shared memory. This could cause
3122  * problems if enough page splits occur on locked pages without the processes
3123  * which hold the locks getting in and noticing.
3124  */
3125 void
3127  BlockNumber newblkno)
3128 {
3129  PREDICATELOCKTARGETTAG oldtargettag;
3130  PREDICATELOCKTARGETTAG newtargettag;
3131  bool success;
3132 
3133  /*
3134  * Bail out quickly if there are no serializable transactions running.
3135  *
3136  * It's safe to do this check without taking any additional locks. Even if
3137  * a serializable transaction starts concurrently, we know it can't take
3138  * any SIREAD locks on the page being split because the caller is holding
3139  * the associated buffer page lock. Memory reordering isn't an issue; the
3140  * memory barrier in the LWLock acquisition guarantees that this read
3141  * occurs while the buffer page lock is held.
3142  */
3144  return;
3145 
3146  if (!PredicateLockingNeededForRelation(relation))
3147  return;
3148 
3149  Assert(oldblkno != newblkno);
3150  Assert(BlockNumberIsValid(oldblkno));
3151  Assert(BlockNumberIsValid(newblkno));
3152 
3153  SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag,
3154  relation->rd_locator.dbOid,
3155  relation->rd_id,
3156  oldblkno);
3157  SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag,
3158  relation->rd_locator.dbOid,
3159  relation->rd_id,
3160  newblkno);
3161 
3162  LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
3163 
3164  /*
3165  * Try copying the locks over to the new page's tag, creating it if
3166  * necessary.
3167  */
3169  newtargettag,
3170  false);
3171 
3172  if (!success)
3173  {
3174  /*
3175  * No more predicate lock entries are available. Failure isn't an
3176  * option here, so promote the page lock to a relation lock.
3177  */
3178 
3179  /* Get the parent relation lock's lock tag */
3180  success = GetParentPredicateLockTag(&oldtargettag,
3181  &newtargettag);
3182  Assert(success);
3183 
3184  /*
3185  * Move the locks to the parent. This shouldn't fail.
3186  *
3187  * Note that here we are removing locks held by other backends,
3188  * leading to a possible inconsistency in their local lock hash table.
3189  * This is OK because we're replacing it with a lock that covers the
3190  * old one.
3191  */
3193  newtargettag,
3194  true);
3195  Assert(success);
3196  }
3197 
3198  LWLockRelease(SerializablePredicateListLock);
3199 }
3200 
3201 /*
3202  * PredicateLockPageCombine
3203  *
3204  * Combines predicate locks for two existing pages.
3205  * Skip if this is a temporary table or toast table.
3206  *
3207  * NOTE: A page combine affects all serializable transactions, even if it
3208  * occurs in the context of another transaction isolation level.
3209  */
3210 void
3212  BlockNumber newblkno)
3213 {
3214  /*
3215  * Page combines differ from page splits in that we ought to be able to
3216  * remove the locks on the old page after transferring them to the new
3217  * page, instead of duplicating them. However, because we can't edit other
3218  * backends' local lock tables, removing the old lock would leave them
3219  * with an entry in their LocalPredicateLockHash for a lock they're not
3220  * holding, which isn't acceptable. So we wind up having to do the same
3221  * work as a page split, acquiring a lock on the new page and keeping the
3222  * old page locked too. That can lead to some false positives, but should
3223  * be rare in practice.
3224  */
3225  PredicateLockPageSplit(relation, oldblkno, newblkno);
3226 }
3227 
3228 /*
3229  * Walk the list of in-progress serializable transactions and find the new
3230  * xmin.
3231  */
3232 static void
3234 {
3235  dlist_iter iter;
3236 
3237  Assert(LWLockHeldByMe(SerializableXactHashLock));
3238 
3241 
3243  {
3244  SERIALIZABLEXACT *sxact =
3245  dlist_container(SERIALIZABLEXACT, xactLink, iter.cur);
3246 
3247  if (!SxactIsRolledBack(sxact)
3248  && !SxactIsCommitted(sxact)
3249  && sxact != OldCommittedSxact)
3250  {
3251  Assert(sxact->xmin != InvalidTransactionId);
3253  || TransactionIdPrecedes(sxact->xmin,
3255  {
3256  PredXact->SxactGlobalXmin = sxact->xmin;
3258  }
3259  else if (TransactionIdEquals(sxact->xmin,
3262  }
3263  }
3264 
3266 }
3267 
3268 /*
3269  * ReleasePredicateLocks
3270  *
3271  * Releases predicate locks based on completion of the current transaction,
3272  * whether committed or rolled back. It can also be called for a read only
3273  * transaction when it becomes impossible for the transaction to become
3274  * part of a dangerous structure.
3275  *
3276  * We do nothing unless this is a serializable transaction.
3277  *
3278  * This method must ensure that shared memory hash tables are cleaned
3279  * up in some relatively timely fashion.
3280  *
3281  * If this transaction is committing and is holding any predicate locks,
3282  * it must be added to a list of completed serializable transactions still
3283  * holding locks.
3284  *
3285  * If isReadOnlySafe is true, then predicate locks are being released before
3286  * the end of the transaction because MySerializableXact has been determined
3287  * to be RO_SAFE. In non-parallel mode we can release it completely, but it
3288  * in parallel mode we partially release the SERIALIZABLEXACT and keep it
3289  * around until the end of the transaction, allowing each backend to clear its
3290  * MySerializableXact variable and benefit from the optimization in its own
3291  * time.
3292  */
3293 void
3294 ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
3295 {
3296  bool partiallyReleasing = false;
3297  bool needToClear;
3298  SERIALIZABLEXACT *roXact;
3299  dlist_mutable_iter iter;
3300 
3301  /*
3302  * We can't trust XactReadOnly here, because a transaction which started
3303  * as READ WRITE can show as READ ONLY later, e.g., within
3304  * subtransactions. We want to flag a transaction as READ ONLY if it
3305  * commits without writing so that de facto READ ONLY transactions get the
3306  * benefit of some RO optimizations, so we will use this local variable to
3307  * get some cleanup logic right which is based on whether the transaction
3308  * was declared READ ONLY at the top level.
3309  */
3310  bool topLevelIsDeclaredReadOnly;
3311 
3312  /* We can't be both committing and releasing early due to RO_SAFE. */
3313  Assert(!(isCommit && isReadOnlySafe));
3314 
3315  /* Are we at the end of a transaction, that is, a commit or abort? */
3316  if (!isReadOnlySafe)
3317  {
3318  /*
3319  * Parallel workers mustn't release predicate locks at the end of
3320  * their transaction. The leader will do that at the end of its
3321  * transaction.
3322  */
3323  if (IsParallelWorker())
3324  {
3326  return;
3327  }
3328 
3329  /*
3330  * By the time the leader in a parallel query reaches end of
3331  * transaction, it has waited for all workers to exit.
3332  */
3334 
3335  /*
3336  * If the leader in a parallel query earlier stashed a partially
3337  * released SERIALIZABLEXACT for final clean-up at end of transaction
3338  * (because workers might still have been accessing it), then it's
3339  * time to restore it.
3340  */
3342  {
3347  }
3348  }
3349 
3351  {
3352  Assert(LocalPredicateLockHash == NULL);
3353  return;
3354  }
3355 
3356  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
3357 
3358  /*
3359  * If the transaction is committing, but it has been partially released
3360  * already, then treat this as a roll back. It was marked as rolled back.
3361  */
3363  isCommit = false;
3364 
3365  /*
3366  * If we're called in the middle of a transaction because we discovered
3367  * that the SXACT_FLAG_RO_SAFE flag was set, then we'll partially release
3368  * it (that is, release the predicate locks and conflicts, but not the
3369  * SERIALIZABLEXACT itself) if we're the first backend to have noticed.
3370  */
3371  if (isReadOnlySafe && IsInParallelMode())
3372  {
3373  /*
3374  * The leader needs to stash a pointer to it, so that it can
3375  * completely release it at end-of-transaction.
3376  */
3377  if (!IsParallelWorker())
3379 
3380  /*
3381  * The first backend to reach this condition will partially release
3382  * the SERIALIZABLEXACT. All others will just clear their
3383  * backend-local state so that they stop doing SSI checks for the rest
3384  * of the transaction.
3385  */
3387  {
3388  LWLockRelease(SerializableXactHashLock);
3390  return;
3391  }
3392  else
3393  {
3395  partiallyReleasing = true;
3396  /* ... and proceed to perform the partial release below. */
3397  }
3398  }
3399  Assert(!isCommit || SxactIsPrepared(MySerializableXact));
3400  Assert(!isCommit || !SxactIsDoomed(MySerializableXact));
3404 
3405  /* may not be serializable during COMMIT/ROLLBACK PREPARED */
3407 
3408  /* We'd better not already be on the cleanup list. */
3410 
3411  topLevelIsDeclaredReadOnly = SxactIsReadOnly(MySerializableXact);
3412 
3413  /*
3414  * We don't hold XidGenLock lock here, assuming that TransactionId is
3415  * atomic!
3416  *
3417  * If this value is changing, we don't care that much whether we get the
3418  * old or new value -- it is just used to determine how far
3419  * SxactGlobalXmin must advance before this transaction can be fully
3420  * cleaned up. The worst that could happen is we wait for one more
3421  * transaction to complete before freeing some RAM; correctness of visible
3422  * behavior is not affected.
3423  */
3425 
3426  /*
3427  * If it's not a commit it's either a rollback or a read-only transaction
3428  * flagged SXACT_FLAG_RO_SAFE, and we can clear our locks immediately.
3429  */
3430  if (isCommit)
3431  {
3434  /* Recognize implicit read-only transaction (commit without write). */
3435  if (!MyXactDidWrite)
3437  }
3438  else
3439  {
3440  /*
3441  * The DOOMED flag indicates that we intend to roll back this
3442  * transaction and so it should not cause serialization failures for
3443  * other transactions that conflict with it. Note that this flag might
3444  * already be set, if another backend marked this transaction for
3445  * abort.
3446  *
3447  * The ROLLED_BACK flag further indicates that ReleasePredicateLocks
3448  * has been called, and so the SerializableXact is eligible for
3449  * cleanup. This means it should not be considered when calculating
3450  * SxactGlobalXmin.
3451  */
3454 
3455  /*
3456  * If the transaction was previously prepared, but is now failing due
3457  * to a ROLLBACK PREPARED or (hopefully very rare) error after the
3458  * prepare, clear the prepared flag. This simplifies conflict
3459  * checking.
3460  */
3462  }
3463 
3464  if (!topLevelIsDeclaredReadOnly)
3465  {
3467  if (--(PredXact->WritableSxactCount) == 0)
3468  {
3469  /*
3470  * Release predicate locks and rw-conflicts in for all committed
3471  * transactions. There are no longer any transactions which might
3472  * conflict with the locks and no chance for new transactions to
3473  * overlap. Similarly, existing conflicts in can't cause pivots,
3474  * and any conflicts in which could have completed a dangerous
3475  * structure would already have caused a rollback, so any
3476  * remaining ones must be benign.
3477  */
3479  }
3480  }
3481  else
3482  {
3483  /*
3484  * Read-only transactions: clear the list of transactions that might
3485  * make us unsafe. Note that we use 'inLink' for the iteration as
3486  * opposed to 'outLink' for the r/w xacts.
3487  */
3489  {
3490  RWConflict possibleUnsafeConflict =
3491  dlist_container(RWConflictData, inLink, iter.cur);
3492 
3493  Assert(!SxactIsReadOnly(possibleUnsafeConflict->sxactOut));
3494  Assert(MySerializableXact == possibleUnsafeConflict->sxactIn);
3495 
3496  ReleaseRWConflict(possibleUnsafeConflict);
3497  }
3498  }
3499 
3500  /* Check for conflict out to old committed transactions. */
3501  if (isCommit
3504  {
3505  /*
3506  * we don't know which old committed transaction we conflicted with,
3507  * so be conservative and use FirstNormalSerCommitSeqNo here
3508  */
3512  }
3513 
3514  /*
3515  * Release all outConflicts to committed transactions. If we're rolling
3516  * back clear them all. Set SXACT_FLAG_CONFLICT_OUT if any point to
3517  * previously committed transactions.
3518  */
3520  {
3521  RWConflict conflict =
3522  dlist_container(RWConflictData, outLink, iter.cur);
3523 
3524  if (isCommit
3526  && SxactIsCommitted(conflict->sxactIn))
3527  {
3532  }
3533 
3534  if (!isCommit
3535  || SxactIsCommitted(conflict->sxactIn)
3537  ReleaseRWConflict(conflict);
3538  }
3539 
3540  /*
3541  * Release all inConflicts from committed and read-only transactions. If
3542  * we're rolling back, clear them all.
3543  */
3545  {
3546  RWConflict conflict =
3547  dlist_container(RWConflictData, inLink, iter.cur);
3548 
3549  if (!isCommit
3550  || SxactIsCommitted(conflict->sxactOut)
3551  || SxactIsReadOnly(conflict->sxactOut))
3552  ReleaseRWConflict(conflict);
3553  }
3554 
3555  if (!topLevelIsDeclaredReadOnly)
3556  {
3557  /*
3558  * Remove ourselves from the list of possible conflicts for concurrent
3559  * READ ONLY transactions, flagging them as unsafe if we have a
3560  * conflict out. If any are waiting DEFERRABLE transactions, wake them
3561  * up if they are known safe or known unsafe.
3562  */
3564  {
3565  RWConflict possibleUnsafeConflict =
3566  dlist_container(RWConflictData, outLink, iter.cur);
3567 
3568  roXact = possibleUnsafeConflict->sxactIn;
3569  Assert(MySerializableXact == possibleUnsafeConflict->sxactOut);
3570  Assert(SxactIsReadOnly(roXact));
3571 
3572  /* Mark conflicted if necessary. */
3573  if (isCommit
3574  && MyXactDidWrite
3577  <= roXact->SeqNo.lastCommitBeforeSnapshot))
3578  {
3579  /*
3580  * This releases possibleUnsafeConflict (as well as all other
3581  * possible conflicts for roXact)
3582  */
3583  FlagSxactUnsafe(roXact);
3584  }
3585  else
3586  {
3587  ReleaseRWConflict(possibleUnsafeConflict);
3588 
3589  /*
3590  * If we were the last possible conflict, flag it safe. The
3591  * transaction can now safely release its predicate locks (but
3592  * that transaction's backend has to do that itself).
3593  */
3595  roXact->flags |= SXACT_FLAG_RO_SAFE;
3596  }
3597 
3598  /*
3599  * Wake up the process for a waiting DEFERRABLE transaction if we
3600  * now know it's either safe or conflicted.
3601  */
3602  if (SxactIsDeferrableWaiting(roXact) &&
3603  (SxactIsROUnsafe(roXact) || SxactIsROSafe(roXact)))
3604  ProcSendSignal(roXact->pgprocno);
3605  }
3606  }
3607 
3608  /*
3609  * Check whether it's time to clean up old transactions. This can only be
3610  * done when the last serializable transaction with the oldest xmin among
3611  * serializable transactions completes. We then find the "new oldest"
3612  * xmin and purge any transactions which finished before this transaction
3613  * was launched.
3614  *
3615  * For parallel queries in read-only transactions, it might run twice. We
3616  * only release the reference on the first call.
3617  */
3618  needToClear = false;
3619  if ((partiallyReleasing ||
3623  {
3625  if (--(PredXact->SxactGlobalXminCount) == 0)
3626  {
3628  needToClear = true;
3629  }
3630  }
3631 
3632  LWLockRelease(SerializableXactHashLock);
3633 
3634  LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
3635 
3636  /* Add this to the list of transactions to check for later cleanup. */
3637  if (isCommit)
3640 
3641  /*
3642  * If we're releasing a RO_SAFE transaction in parallel mode, we'll only
3643  * partially release it. That's necessary because other backends may have
3644  * a reference to it. The leader will release the SERIALIZABLEXACT itself
3645  * at the end of the transaction after workers have stopped running.
3646  */
3647  if (!isCommit)
3649  isReadOnlySafe && IsInParallelMode(),
3650  false);
3651 
3652  LWLockRelease(SerializableFinishedListLock);
3653 
3654  if (needToClear)
3656 
3658 }
3659 
3660 static void
3662 {
3664  MyXactDidWrite = false;
3665 
3666  /* Delete per-transaction lock table */
3667  if (LocalPredicateLockHash != NULL)
3668  {
3670  LocalPredicateLockHash = NULL;
3671  }
3672 }
3673 
3674 /*
3675  * Clear old predicate locks, belonging to committed transactions that are no
3676  * longer interesting to any in-progress transaction.
3677  */
3678 static void
3680 {
3681  dlist_mutable_iter iter;
3682 
3683  /*
3684  * Loop through finished transactions. They are in commit order, so we can
3685  * stop as soon as we find one that's still interesting.
3686  */
3687  LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
3688  LWLockAcquire(SerializableXactHashLock, LW_SHARED);
3690  {
3691  SERIALIZABLEXACT *finishedSxact =
3692  dlist_container(SERIALIZABLEXACT, finishedLink, iter.cur);
3693 
3697  {
3698  /*
3699  * This transaction committed before any in-progress transaction
3700  * took its snapshot. It's no longer interesting.
3701  */
3702  LWLockRelease(SerializableXactHashLock);
3703  dlist_delete_thoroughly(&finishedSxact->finishedLink);
3704  ReleaseOneSerializableXact(finishedSxact, false, false);
3705  LWLockAcquire(SerializableXactHashLock, LW_SHARED);
3706  }
3707  else if (finishedSxact->commitSeqNo > PredXact->HavePartialClearedThrough
3708  && finishedSxact->commitSeqNo <= PredXact->CanPartialClearThrough)
3709  {
3710  /*
3711  * Any active transactions that took their snapshot before this
3712  * transaction committed are read-only, so we can clear part of
3713  * its state.
3714  */
3715  LWLockRelease(SerializableXactHashLock);
3716 
3717  if (SxactIsReadOnly(finishedSxact))
3718  {
3719  /* A read-only transaction can be removed entirely */
3720  dlist_delete_thoroughly(&(finishedSxact->finishedLink));
3721  ReleaseOneSerializableXact(finishedSxact, false, false);
3722  }
3723  else
3724  {
3725  /*
3726  * A read-write transaction can only be partially cleared. We
3727  * need to keep the SERIALIZABLEXACT but can release the
3728  * SIREAD locks and conflicts in.
3729  */
3730  ReleaseOneSerializableXact(finishedSxact, true, false);
3731  }
3732 
3734  LWLockAcquire(SerializableXactHashLock, LW_SHARED);
3735  }
3736  else
3737  {
3738  /* Still interesting. */
3739  break;
3740  }
3741  }
3742  LWLockRelease(SerializableXactHashLock);
3743 
3744  /*
3745  * Loop through predicate locks on dummy transaction for summarized data.
3746  */
3747  LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
3749  {
3750  PREDICATELOCK *predlock =
3751  dlist_container(PREDICATELOCK, xactLink, iter.cur);
3752  bool canDoPartialCleanup;
3753 
3754  LWLockAcquire(SerializableXactHashLock, LW_SHARED);
3755  Assert(predlock->commitSeqNo != 0);
3757  canDoPartialCleanup = (predlock->commitSeqNo <= PredXact->CanPartialClearThrough);
3758  LWLockRelease(SerializableXactHashLock);
3759 
3760  /*
3761  * If this lock originally belonged to an old enough transaction, we
3762  * can release it.
3763  */
3764  if (canDoPartialCleanup)
3765  {
3766  PREDICATELOCKTAG tag;
3767  PREDICATELOCKTARGET *target;
3768  PREDICATELOCKTARGETTAG targettag;
3769  uint32 targettaghash;
3770  LWLock *partitionLock;
3771 
3772  tag = predlock->tag;
3773  target = tag.myTarget;
3774  targettag = target->tag;
3775  targettaghash = PredicateLockTargetTagHashCode(&targettag);
3776  partitionLock = PredicateLockHashPartitionLock(targettaghash);
3777 
3778  LWLockAcquire(partitionLock, LW_EXCLUSIVE);
3779 
3780  dlist_delete(&(predlock->targetLink));
3781  dlist_delete(&(predlock->xactLink));
3782 
3785  targettaghash),
3786  HASH_REMOVE, NULL);
3787  RemoveTargetIfNoLongerUsed(target, targettaghash);
3788 
3789  LWLockRelease(partitionLock);
3790  }
3791  }
3792 
3793  LWLockRelease(SerializablePredicateListLock);
3794  LWLockRelease(SerializableFinishedListLock);
3795 }
3796 
3797 /*
3798  * This is the normal way to delete anything from any of the predicate
3799  * locking hash tables. Given a transaction which we know can be deleted:
3800  * delete all predicate locks held by that transaction and any predicate
3801  * lock targets which are now unreferenced by a lock; delete all conflicts
3802  * for the transaction; delete all xid values for the transaction; then
3803  * delete the transaction.
3804  *
3805  * When the partial flag is set, we can release all predicate locks and
3806  * in-conflict information -- we've established that there are no longer
3807  * any overlapping read write transactions for which this transaction could
3808  * matter -- but keep the transaction entry itself and any outConflicts.
3809  *
3810  * When the summarize flag is set, we've run short of room for sxact data
3811  * and must summarize to the SLRU. Predicate locks are transferred to a
3812  * dummy "old" transaction, with duplicate locks on a single target
3813  * collapsing to a single lock with the "latest" commitSeqNo from among
3814  * the conflicting locks..
3815  */
3816 static void
3818  bool summarize)
3819 {
3820  SERIALIZABLEXIDTAG sxidtag;
3821  dlist_mutable_iter iter;
3822 
3823  Assert(sxact != NULL);
3824  Assert(SxactIsRolledBack(sxact) || SxactIsCommitted(sxact));
3825  Assert(partial || !SxactIsOnFinishedList(sxact));
3826  Assert(LWLockHeldByMe(SerializableFinishedListLock));
3827 
3828  /*
3829  * First release all the predicate locks held by this xact (or transfer
3830  * them to OldCommittedSxact if summarize is true)
3831  */
3832  LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
3833  if (IsInParallelMode())
3835  dlist_foreach_modify(iter, &sxact->predicateLocks)
3836  {
3837  PREDICATELOCK *predlock =
3838  dlist_container(PREDICATELOCK, xactLink, iter.cur);
3839  PREDICATELOCKTAG tag;
3840  PREDICATELOCKTARGET *target;
3841  PREDICATELOCKTARGETTAG targettag;
3842  uint32 targettaghash;
3843  LWLock *partitionLock;
3844 
3845  tag = predlock->tag;
3846  target = tag.myTarget;
3847  targettag = target->tag;
3848  targettaghash = PredicateLockTargetTagHashCode(&targettag);
3849  partitionLock = PredicateLockHashPartitionLock(targettaghash);
3850 
3851  LWLockAcquire(partitionLock, LW_EXCLUSIVE);
3852 
3853  dlist_delete(&predlock->targetLink);
3854 
3857  targettaghash),
3858  HASH_REMOVE, NULL);
3859  if (summarize)
3860  {
3861  bool found;
3862 
3863  /* Fold into dummy transaction list. */
3864  tag.myXact = OldCommittedSxact;
3867  targettaghash),
3868  HASH_ENTER_NULL, &found);
3869  if (!predlock)
3870  ereport(ERROR,
3871  (errcode(ERRCODE_OUT_OF_MEMORY),
3872  errmsg("out of shared memory"),
3873  errhint("You might need to increase %s.", "max_pred_locks_per_transaction")));
3874  if (found)
3875  {
3876  Assert(predlock->commitSeqNo != 0);
3878  if (predlock->commitSeqNo < sxact->commitSeqNo)
3879  predlock->commitSeqNo = sxact->commitSeqNo;
3880  }
3881  else
3882  {
3884  &predlock->targetLink);
3886  &predlock->xactLink);
3887  predlock->commitSeqNo = sxact->commitSeqNo;
3888  }
3889  }
3890  else
3891  RemoveTargetIfNoLongerUsed(target, targettaghash);
3892 
3893  LWLockRelease(partitionLock);
3894  }
3895 
3896  /*
3897  * Rather than retail removal, just re-init the head after we've run
3898  * through the list.
3899  */
3900  dlist_init(&sxact->predicateLocks);
3901 
3902  if (IsInParallelMode())
3904  LWLockRelease(SerializablePredicateListLock);
3905 
3906  sxidtag.xid = sxact->topXid;
3907  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
3908 
3909  /* Release all outConflicts (unless 'partial' is true) */
3910  if (!partial)
3911  {
3912  dlist_foreach_modify(iter, &sxact->outConflicts)
3913  {
3914  RWConflict conflict =
3915  dlist_container(RWConflictData, outLink, iter.cur);
3916 
3917  if (summarize)
3919  ReleaseRWConflict(conflict);
3920  }
3921  }
3922 
3923  /* Release all inConflicts. */
3924  dlist_foreach_modify(iter, &sxact->inConflicts)
3925  {
3926  RWConflict conflict =
3927  dlist_container(RWConflictData, inLink, iter.cur);
3928 
3929  if (summarize)
3931  ReleaseRWConflict(conflict);
3932  }
3933 
3934  /* Finally, get rid of the xid and the record of the transaction itself. */
3935  if (!partial)
3936  {
3937  if (sxidtag.xid != InvalidTransactionId)
3938  hash_search(SerializableXidHash, &sxidtag, HASH_REMOVE, NULL);
3939  ReleasePredXact(sxact);
3940  }
3941 
3942  LWLockRelease(SerializableXactHashLock);
3943 }
3944 
3945 /*
3946  * Tests whether the given top level transaction is concurrent with
3947  * (overlaps) our current transaction.
3948  *
3949  * We need to identify the top level transaction for SSI, anyway, so pass
3950  * that to this function to save the overhead of checking the snapshot's
3951  * subxip array.
3952  */
3953 static bool
3955 {
3956  Snapshot snap;
3957 
3960 
3961  snap = GetTransactionSnapshot();
3962 
3963  if (TransactionIdPrecedes(xid, snap->xmin))
3964  return false;
3965 
3966  if (TransactionIdFollowsOrEquals(xid, snap->xmax))
3967  return true;
3968 
3969  return pg_lfind32(xid, snap->xip, snap->xcnt);
3970 }
3971 
3972 bool
3974 {
3975  if (!SerializationNeededForRead(relation, snapshot))
3976  return false;
3977 
3978  /* Check if someone else has already decided that we need to die */
3980  {
3981  ereport(ERROR,
3983  errmsg("could not serialize access due to read/write dependencies among transactions"),
3984  errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."),
3985  errhint("The transaction might succeed if retried.")));
3986  }
3987 
3988  return true;
3989 }
3990 
3991 /*
3992  * CheckForSerializableConflictOut
3993  * A table AM is reading a tuple that has been modified. If it determines
3994  * that the tuple version it is reading is not visible to us, it should
3995  * pass in the top level xid of the transaction that created it.
3996  * Otherwise, if it determines that it is visible to us but it has been
3997  * deleted or there is a newer version available due to an update, it
3998  * should pass in the top level xid of the modifying transaction.
3999  *
4000  * This function will check for overlap with our own transaction. If the given
4001  * xid is also serializable and the transactions overlap (i.e., they cannot see
4002  * each other's writes), then we have a conflict out.
4003  */
4004 void
4006 {
4007  SERIALIZABLEXIDTAG sxidtag;
4008  SERIALIZABLEXID *sxid;
4009  SERIALIZABLEXACT *sxact;
4010 
4011  if (!SerializationNeededForRead(relation, snapshot))
4012  return;
4013 
4014  /* Check if someone else has already decided that we need to die */
4016  {
4017  ereport(ERROR,
4019  errmsg("could not serialize access due to read/write dependencies among transactions"),
4020  errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."),
4021  errhint("The transaction might succeed if retried.")));
4022  }
4024 
4026  return;
4027 
4028  /*
4029  * Find sxact or summarized info for the top level xid.
4030  */
4031  sxidtag.xid = xid;
4032  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4033  sxid = (SERIALIZABLEXID *)
4034  hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
4035  if (!sxid)
4036  {
4037  /*
4038  * Transaction not found in "normal" SSI structures. Check whether it
4039  * got pushed out to SLRU storage for "old committed" transactions.
4040  */
4041  SerCommitSeqNo conflictCommitSeqNo;
4042 
4043  conflictCommitSeqNo = SerialGetMinConflictCommitSeqNo(xid);
4044  if (conflictCommitSeqNo != 0)
4045  {
4046  if (conflictCommitSeqNo != InvalidSerCommitSeqNo
4048  || conflictCommitSeqNo
4050  ereport(ERROR,
4052  errmsg("could not serialize access due to read/write dependencies among transactions"),
4053  errdetail_internal("Reason code: Canceled on conflict out to old pivot %u.", xid),
4054  errhint("The transaction might succeed if retried.")));
4055 
4058  ereport(ERROR,
4060  errmsg("could not serialize access due to read/write dependencies among transactions"),
4061  errdetail_internal("Reason code: Canceled on identification as a pivot, with conflict out to old committed transaction %u.", xid),
4062  errhint("The transaction might succeed if retried.")));
4063 
4065  }
4066 
4067  /* It's not serializable or otherwise not important. */
4068  LWLockRelease(SerializableXactHashLock);
4069  return;
4070  }
4071  sxact = sxid->myXact;
4072  Assert(TransactionIdEquals(sxact->topXid, xid));
4073  if (sxact == MySerializableXact || SxactIsDoomed(sxact))
4074  {
4075  /* Can't conflict with ourself or a transaction that will roll back. */
4076  LWLockRelease(SerializableXactHashLock);
4077  return;
4078  }
4079 
4080  /*
4081  * We have a conflict out to a transaction which has a conflict out to a
4082  * summarized transaction. That summarized transaction must have
4083  * committed first, and we can't tell when it committed in relation to our
4084  * snapshot acquisition, so something needs to be canceled.
4085  */
4086  if (SxactHasSummaryConflictOut(sxact))
4087  {
4088  if (!SxactIsPrepared(sxact))
4089  {
4090  sxact->flags |= SXACT_FLAG_DOOMED;
4091  LWLockRelease(SerializableXactHashLock);
4092  return;
4093  }
4094  else
4095  {
4096  LWLockRelease(SerializableXactHashLock);
4097  ereport(ERROR,
4099  errmsg("could not serialize access due to read/write dependencies among transactions"),
4100  errdetail_internal("Reason code: Canceled on conflict out to old pivot."),
4101  errhint("The transaction might succeed if retried.")));
4102  }
4103  }
4104 
4105  /*
4106  * If this is a read-only transaction and the writing transaction has
4107  * committed, and it doesn't have a rw-conflict to a transaction which
4108  * committed before it, no conflict.
4109  */
4111  && SxactIsCommitted(sxact)
4112  && !SxactHasSummaryConflictOut(sxact)
4113  && (!SxactHasConflictOut(sxact)
4115  {
4116  /* Read-only transaction will appear to run first. No conflict. */
4117  LWLockRelease(SerializableXactHashLock);
4118  return;
4119  }
4120 
4121  if (!XidIsConcurrent(xid))
4122  {
4123  /* This write was already in our snapshot; no conflict. */
4124  LWLockRelease(SerializableXactHashLock);
4125  return;
4126  }
4127 
4129  {
4130  /* We don't want duplicate conflict records in the list. */
4131  LWLockRelease(SerializableXactHashLock);
4132  return;
4133  }
4134 
4135  /*
4136  * Flag the conflict. But first, if this conflict creates a dangerous
4137  * structure, ereport an error.
4138  */
4140  LWLockRelease(SerializableXactHashLock);
4141 }
4142 
4143 /*
4144  * Check a particular target for rw-dependency conflict in. A subroutine of
4145  * CheckForSerializableConflictIn().
4146  */
4147 static void
4149 {
4150  uint32 targettaghash;
4151  LWLock *partitionLock;
4152  PREDICATELOCKTARGET *target;
4153  PREDICATELOCK *mypredlock = NULL;
4154  PREDICATELOCKTAG mypredlocktag;
4155  dlist_mutable_iter iter;
4156 
4158 
4159  /*
4160  * The same hash and LW lock apply to the lock target and the lock itself.
4161  */
4162  targettaghash = PredicateLockTargetTagHashCode(targettag);
4163  partitionLock = PredicateLockHashPartitionLock(targettaghash);
4164  LWLockAcquire(partitionLock, LW_SHARED);
4165  target = (PREDICATELOCKTARGET *)
4167  targettag, targettaghash,
4168  HASH_FIND, NULL);
4169  if (!target)
4170  {
4171  /* Nothing has this target locked; we're done here. */
4172  LWLockRelease(partitionLock);
4173  return;
4174  }
4175 
4176  /*
4177  * Each lock for an overlapping transaction represents a conflict: a
4178  * rw-dependency in to this transaction.
4179  */
4180  LWLockAcquire(SerializableXactHashLock, LW_SHARED);
4181 
4182  dlist_foreach_modify(iter, &target->predicateLocks)
4183  {
4184  PREDICATELOCK *predlock =
4185  dlist_container(PREDICATELOCK, targetLink, iter.cur);
4186  SERIALIZABLEXACT *sxact = predlock->tag.myXact;
4187 
4188  if (sxact == MySerializableXact)
4189  {
4190  /*
4191  * If we're getting a write lock on a tuple, we don't need a
4192  * predicate (SIREAD) lock on the same tuple. We can safely remove
4193  * our SIREAD lock, but we'll defer doing so until after the loop
4194  * because that requires upgrading to an exclusive partition lock.
4195  *
4196  * We can't use this optimization within a subtransaction because
4197  * the subtransaction could roll back, and we would be left
4198  * without any lock at the top level.
4199  */
4200  if (!IsSubTransaction()
4201  && GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag))
4202  {
4203  mypredlock = predlock;
4204  mypredlocktag = predlock->tag;
4205  }
4206  }
4207  else if (!SxactIsDoomed(sxact)
4208  && (!SxactIsCommitted(sxact)
4210  sxact->finishedBefore))
4212  {
4213  LWLockRelease(SerializableXactHashLock);
4214  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4215 
4216  /*
4217  * Re-check after getting exclusive lock because the other
4218  * transaction may have flagged a conflict.
4219  */
4220  if (!SxactIsDoomed(sxact)
4221  && (!SxactIsCommitted(sxact)
4223  sxact->finishedBefore))
4225  {
4227  }
4228 
4229  LWLockRelease(SerializableXactHashLock);
4230  LWLockAcquire(SerializableXactHashLock, LW_SHARED);
4231  }
4232  }
4233  LWLockRelease(SerializableXactHashLock);
4234  LWLockRelease(partitionLock);
4235 
4236  /*
4237  * If we found one of our own SIREAD locks to remove, remove it now.
4238  *
4239  * At this point our transaction already has a RowExclusiveLock on the
4240  * relation, so we are OK to drop the predicate lock on the tuple, if
4241  * found, without fearing that another write against the tuple will occur
4242  * before the MVCC information makes it to the buffer.
4243  */
4244  if (mypredlock != NULL)
4245  {
4246  uint32 predlockhashcode;
4247  PREDICATELOCK *rmpredlock;
4248 
4249  LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
4250  if (IsInParallelMode())
4252  LWLockAcquire(partitionLock, LW_EXCLUSIVE);
4253  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4254 
4255  /*
4256  * Remove the predicate lock from shared memory, if it wasn't removed
4257  * while the locks were released. One way that could happen is from
4258  * autovacuum cleaning up an index.
4259  */
4260  predlockhashcode = PredicateLockHashCodeFromTargetHashCode
4261  (&mypredlocktag, targettaghash);
4262  rmpredlock = (PREDICATELOCK *)
4264  &mypredlocktag,
4265  predlockhashcode,
4266  HASH_FIND, NULL);
4267  if (rmpredlock != NULL)
4268  {
4269  Assert(rmpredlock == mypredlock);
4270 
4271  dlist_delete(&(mypredlock->targetLink));
4272  dlist_delete(&(mypredlock->xactLink));
4273 
4274  rmpredlock = (PREDICATELOCK *)
4276  &mypredlocktag,
4277  predlockhashcode,
4278  HASH_REMOVE, NULL);
4279  Assert(rmpredlock == mypredlock);
4280 
4281  RemoveTargetIfNoLongerUsed(target, targettaghash);
4282  }
4283 
4284  LWLockRelease(SerializableXactHashLock);
4285  LWLockRelease(partitionLock);
4286  if (IsInParallelMode())
4288  LWLockRelease(SerializablePredicateListLock);
4289 
4290  if (rmpredlock != NULL)
4291  {
4292  /*
4293  * Remove entry in local lock table if it exists. It's OK if it
4294  * doesn't exist; that means the lock was transferred to a new
4295  * target by a different backend.
4296  */
4298  targettag, targettaghash,
4299  HASH_REMOVE, NULL);
4300 
4301  DecrementParentLocks(targettag);
4302  }
4303  }
4304 }
4305 
4306 /*
4307  * CheckForSerializableConflictIn
4308  * We are writing the given tuple. If that indicates a rw-conflict
4309  * in from another serializable transaction, take appropriate action.
4310  *
4311  * Skip checking for any granularity for which a parameter is missing.
4312  *
4313  * A tuple update or delete is in conflict if we have a predicate lock
4314  * against the relation or page in which the tuple exists, or against the
4315  * tuple itself.
4316  */
4317 void
4319 {
4320  PREDICATELOCKTARGETTAG targettag;
4321 
4322  if (!SerializationNeededForWrite(relation))
4323  return;
4324 
4325  /* Check if someone else has already decided that we need to die */
4327  ereport(ERROR,
4329  errmsg("could not serialize access due to read/write dependencies among transactions"),
4330  errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict in checking."),
4331  errhint("The transaction might succeed if retried.")));
4332 
4333  /*
4334  * We're doing a write which might cause rw-conflicts now or later.
4335  * Memorize that fact.
4336  */
4337  MyXactDidWrite = true;
4338 
4339  /*
4340  * It is important that we check for locks from the finest granularity to
4341  * the coarsest granularity, so that granularity promotion doesn't cause
4342  * us to miss a lock. The new (coarser) lock will be acquired before the
4343  * old (finer) locks are released.
4344  *
4345  * It is not possible to take and hold a lock across the checks for all
4346  * granularities because each target could be in a separate partition.
4347  */
4348  if (tid != NULL)
4349  {
4351  relation->rd_locator.dbOid,
4352  relation->rd_id,
4355  CheckTargetForConflictsIn(&targettag);
4356  }
4357 
4358  if (blkno != InvalidBlockNumber)
4359  {
4361  relation->rd_locator.dbOid,
4362  relation->rd_id,
4363  blkno);
4364  CheckTargetForConflictsIn(&targettag);
4365  }
4366 
4368  relation->rd_locator.dbOid,
4369  relation->rd_id);
4370  CheckTargetForConflictsIn(&targettag);
4371 }
4372 
4373 /*
4374  * CheckTableForSerializableConflictIn
4375  * The entire table is going through a DDL-style logical mass delete
4376  * like TRUNCATE or DROP TABLE. If that causes a rw-conflict in from
4377  * another serializable transaction, take appropriate action.
4378  *
4379  * While these operations do not operate entirely within the bounds of
4380  * snapshot isolation, they can occur inside a serializable transaction, and
4381  * will logically occur after any reads which saw rows which were destroyed
4382  * by these operations, so we do what we can to serialize properly under
4383  * SSI.
4384  *
4385  * The relation passed in must be a heap relation. Any predicate lock of any
4386  * granularity on the heap will cause a rw-conflict in to this transaction.
4387  * Predicate locks on indexes do not matter because they only exist to guard
4388  * against conflicting inserts into the index, and this is a mass *delete*.
4389  * When a table is truncated or dropped, the index will also be truncated
4390  * or dropped, and we'll deal with locks on the index when that happens.
4391  *
4392  * Dropping or truncating a table also needs to drop any existing predicate
4393  * locks on heap tuples or pages, because they're about to go away. This
4394  * should be done before altering the predicate locks because the transaction
4395  * could be rolled back because of a conflict, in which case the lock changes
4396  * are not needed. (At the moment, we don't actually bother to drop the
4397  * existing locks on a dropped or truncated table at the moment. That might
4398  * lead to some false positives, but it doesn't seem worth the trouble.)
4399  */
4400 void
4402 {
4403  HASH_SEQ_STATUS seqstat;
4404  PREDICATELOCKTARGET *target;
4405  Oid dbId;
4406  Oid heapId;
4407  int i;
4408 
4409  /*
4410  * Bail out quickly if there are no serializable transactions running.
4411  * It's safe to check this without taking locks because the caller is
4412  * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which
4413  * would matter here can be acquired while that is held.
4414  */
4416  return;
4417 
4418  if (!SerializationNeededForWrite(relation))
4419  return;
4420 
4421  /*
4422  * We're doing a write which might cause rw-conflicts now or later.
4423  * Memorize that fact.
4424  */
4425  MyXactDidWrite = true;
4426 
4427  Assert(relation->rd_index == NULL); /* not an index relation */
4428 
4429  dbId = relation->rd_locator.dbOid;
4430  heapId = relation->rd_id;
4431 
4432  LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
4433  for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
4435  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4436 
4437  /* Scan through target list */
4439 
4440  while ((target = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
4441  {
4442  dlist_mutable_iter iter;
4443 
4444  /*
4445  * Check whether this is a target which needs attention.
4446  */
4447  if (GET_PREDICATELOCKTARGETTAG_RELATION(target->tag) != heapId)
4448  continue; /* wrong relation id */
4449  if (GET_PREDICATELOCKTARGETTAG_DB(target->tag) != dbId)
4450  continue; /* wrong database id */
4451 
4452  /*
4453  * Loop through locks for this target and flag conflicts.
4454  */
4455  dlist_foreach_modify(iter, &target->predicateLocks)
4456  {
4457  PREDICATELOCK *predlock =
4458  dlist_container(PREDICATELOCK, targetLink, iter.cur);
4459 
4460  if (predlock->tag.myXact != MySerializableXact
4462  {
4464  }
4465  }
4466  }
4467 
4468  /* Release locks in reverse order */
4469  LWLockRelease(SerializableXactHashLock);
4470  for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
4472  LWLockRelease(SerializablePredicateListLock);
4473 }
4474 
4475 
4476 /*
4477  * Flag a rw-dependency between two serializable transactions.
4478  *
4479  * The caller is responsible for ensuring that we have a LW lock on
4480  * the transaction hash table.
4481  */
4482 static void
4484 {
4485  Assert(reader != writer);
4486 
4487  /* First, see if this conflict causes failure. */
4489 
4490  /* Actually do the conflict flagging. */
4491  if (reader == OldCommittedSxact)
4493  else if (writer == OldCommittedSxact)
4495  else
4496  SetRWConflict(reader, writer);
4497 }
4498 
4499 /*----------------------------------------------------------------------------
4500  * We are about to add a RW-edge to the dependency graph - check that we don't
4501  * introduce a dangerous structure by doing so, and abort one of the
4502  * transactions if so.
4503  *
4504  * A serialization failure can only occur if there is a dangerous structure
4505  * in the dependency graph:
4506  *
4507  * Tin ------> Tpivot ------> Tout
4508  * rw rw
4509  *
4510  * Furthermore, Tout must commit first.
4511  *
4512  * One more optimization is that if Tin is declared READ ONLY (or commits
4513  * without writing), we can only have a problem if Tout committed before Tin
4514  * acquired its snapshot.
4515  *----------------------------------------------------------------------------
4516  */
4517 static void
4519  SERIALIZABLEXACT *writer)
4520 {
4521  bool failure;
4522 
4523  Assert(LWLockHeldByMe(SerializableXactHashLock));
4524 
4525  failure = false;
4526 
4527  /*------------------------------------------------------------------------
4528  * Check for already-committed writer with rw-conflict out flagged
4529  * (conflict-flag on W means that T2 committed before W):
4530  *
4531  * R ------> W ------> T2
4532  * rw rw
4533  *
4534  * That is a dangerous structure, so we must abort. (Since the writer
4535  * has already committed, we must be the reader)
4536  *------------------------------------------------------------------------
4537  */
4538  if (SxactIsCommitted(writer)
4539  && (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer)))
4540  failure = true;
4541 
4542  /*------------------------------------------------------------------------
4543  * Check whether the writer has become a pivot with an out-conflict
4544  * committed transaction (T2), and T2 committed first:
4545  *
4546  * R ------> W ------> T2
4547  * rw rw
4548  *
4549  * Because T2 must've committed first, there is no anomaly if:
4550  * - the reader committed before T2
4551  * - the writer committed before T2
4552  * - the reader is a READ ONLY transaction and the reader was concurrent
4553  * with T2 (= reader acquired its snapshot before T2 committed)
4554  *
4555  * We also handle the case that T2 is prepared but not yet committed
4556  * here. In that case T2 has already checked for conflicts, so if it
4557  * commits first, making the above conflict real, it's too late for it
4558  * to abort.
4559  *------------------------------------------------------------------------
4560  */
4561  if (!failure && SxactHasSummaryConflictOut(writer))
4562  failure = true;
4563  else if (!failure)
4564  {
4565  dlist_iter iter;
4566 
4567  dlist_foreach(iter, &writer->outConflicts)
4568  {
4569  RWConflict conflict =
4570  dlist_container(RWConflictData, outLink, iter.cur);
4571  SERIALIZABLEXACT *t2 = conflict->sxactIn;
4572 
4573  if (SxactIsPrepared(t2)
4574  && (!SxactIsCommitted(reader)
4575  || t2->prepareSeqNo <= reader->commitSeqNo)
4576  && (!SxactIsCommitted(writer)
4577  || t2->prepareSeqNo <= writer->commitSeqNo)
4578  && (!SxactIsReadOnly(reader)
4579  || t2->prepareSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot))
4580  {
4581  failure = true;
4582  break;
4583  }
4584  }
4585  }
4586 
4587  /*------------------------------------------------------------------------
4588  * Check whether the reader has become a pivot with a writer
4589  * that's committed (or prepared):
4590  *
4591  * T0 ------> R ------> W
4592  * rw rw
4593  *
4594  * Because W must've committed first for an anomaly to occur, there is no
4595  * anomaly if:
4596  * - T0 committed before the writer
4597  * - T0 is READ ONLY, and overlaps the writer
4598  *------------------------------------------------------------------------
4599  */
4600  if (!failure && SxactIsPrepared(writer) && !SxactIsReadOnly(reader))
4601  {
4602  if (SxactHasSummaryConflictIn(reader))
4603  {
4604  failure = true;
4605  }
4606  else
4607  {
4608  dlist_iter iter;
4609 
4610  /*
4611  * The unconstify is needed as we have no const version of
4612  * dlist_foreach().
4613  */
4614  dlist_foreach(iter, &unconstify(SERIALIZABLEXACT *, reader)->inConflicts)
4615  {
4616  const RWConflict conflict =
4617  dlist_container(RWConflictData, inLink, iter.cur);
4618  const SERIALIZABLEXACT *t0 = conflict->sxactOut;
4619 
4620  if (!SxactIsDoomed(t0)
4621  && (!SxactIsCommitted(t0)
4622  || t0->commitSeqNo >= writer->prepareSeqNo)
4623  && (!SxactIsReadOnly(t0)
4624  || t0->SeqNo.lastCommitBeforeSnapshot >= writer->prepareSeqNo))
4625  {
4626  failure = true;
4627  break;
4628  }
4629  }
4630  }
4631  }
4632 
4633  if (failure)
4634  {
4635  /*
4636  * We have to kill a transaction to avoid a possible anomaly from
4637  * occurring. If the writer is us, we can just ereport() to cause a
4638  * transaction abort. Otherwise we flag the writer for termination,
4639  * causing it to abort when it tries to commit. However, if the writer
4640  * is a prepared transaction, already prepared, we can't abort it
4641  * anymore, so we have to kill the reader instead.
4642  */
4643  if (MySerializableXact == writer)
4644  {
4645  LWLockRelease(SerializableXactHashLock);
4646  ereport(ERROR,
4648  errmsg("could not serialize access due to read/write dependencies among transactions"),
4649  errdetail_internal("Reason code: Canceled on identification as a pivot, during write."),
4650  errhint("The transaction might succeed if retried.")));
4651  }
4652  else if (SxactIsPrepared(writer))
4653  {
4654  LWLockRelease(SerializableXactHashLock);
4655 
4656  /* if we're not the writer, we have to be the reader */
4657  Assert(MySerializableXact == reader);
4658  ereport(ERROR,
4660  errmsg("could not serialize access due to read/write dependencies among transactions"),
4661  errdetail_internal("Reason code: Canceled on conflict out to pivot %u, during read.", writer->topXid),
4662  errhint("The transaction might succeed if retried.")));
4663  }
4664  writer->flags |= SXACT_FLAG_DOOMED;
4665  }
4666 }
4667 
4668 /*
4669  * PreCommit_CheckForSerializationFailure
4670  * Check for dangerous structures in a serializable transaction
4671  * at commit.
4672  *
4673  * We're checking for a dangerous structure as each conflict is recorded.
4674  * The only way we could have a problem at commit is if this is the "out"
4675  * side of a pivot, and neither the "in" side nor the pivot has yet
4676  * committed.
4677  *
4678  * If a dangerous structure is found, the pivot (the near conflict) is
4679  * marked for death, because rolling back another transaction might mean
4680  * that we fail without ever making progress. This transaction is
4681  * committing writes, so letting it commit ensures progress. If we
4682  * canceled the far conflict, it might immediately fail again on retry.
4683  */
4684 void
4686 {
4687  dlist_iter near_iter;
4688 
4690  return;
4691 
4693 
4694  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4695 
4696  /*
4697  * Check if someone else has already decided that we need to die. Since
4698  * we set our own DOOMED flag when partially releasing, ignore in that
4699  * case.
4700  */
4703  {
4704  LWLockRelease(SerializableXactHashLock);
4705  ereport(ERROR,
4707  errmsg("could not serialize access due to read/write dependencies among transactions"),
4708  errdetail_internal("Reason code: Canceled on identification as a pivot, during commit attempt."),
4709  errhint("The transaction might succeed if retried.")));
4710  }
4711 
4713  {
4714  RWConflict nearConflict =
4715  dlist_container(RWConflictData, inLink, near_iter.cur);
4716 
4717  if (!SxactIsCommitted(nearConflict->sxactOut)
4718  && !SxactIsDoomed(nearConflict->sxactOut))
4719  {
4720  dlist_iter far_iter;
4721 
4722  dlist_foreach(far_iter, &nearConflict->sxactOut->inConflicts)
4723  {
4724  RWConflict farConflict =
4725  dlist_container(RWConflictData, inLink, far_iter.cur);
4726 
4727  if (farConflict->sxactOut == MySerializableXact
4728  || (!SxactIsCommitted(farConflict->sxactOut)
4729  && !SxactIsReadOnly(farConflict->sxactOut)
4730  && !SxactIsDoomed(farConflict->sxactOut)))
4731  {
4732  /*
4733  * Normally, we kill the pivot transaction to make sure we
4734  * make progress if the failing transaction is retried.
4735  * However, we can't kill it if it's already prepared, so
4736  * in that case we commit suicide instead.
4737  */
4738  if (SxactIsPrepared(nearConflict->sxactOut))
4739  {
4740  LWLockRelease(SerializableXactHashLock);
4741  ereport(ERROR,
4743  errmsg("could not serialize access due to read/write dependencies among transactions"),
4744  errdetail_internal("Reason code: Canceled on commit attempt with conflict in from prepared pivot."),
4745  errhint("The transaction might succeed if retried.")));
4746  }
4747  nearConflict->sxactOut->flags |= SXACT_FLAG_DOOMED;
4748  break;
4749  }
4750  }
4751  }
4752  }
4753 
4756 
4757  LWLockRelease(SerializableXactHashLock);
4758 }
4759 
4760 /*------------------------------------------------------------------------*/
4761 
4762 /*
4763  * Two-phase commit support
4764  */
4765 
4766 /*
4767  * AtPrepare_Locks
4768  * Do the preparatory work for a PREPARE: make 2PC state file
4769  * records for all predicate locks currently held.
4770  */
4771 void
4773 {
4774  SERIALIZABLEXACT *sxact;
4775  TwoPhasePredicateRecord record;
4776  TwoPhasePredicateXactRecord *xactRecord;
4777  TwoPhasePredicateLockRecord *lockRecord;
4778  dlist_iter iter;
4779 
4780  sxact = MySerializableXact;
4781  xactRecord = &(record.data.xactRecord);
4782  lockRecord = &(record.data.lockRecord);
4783 
4785  return;
4786 
4787  /* Generate an xact record for our SERIALIZABLEXACT */
4789  xactRecord->xmin = MySerializableXact->xmin;
4790  xactRecord->flags = MySerializableXact->flags;
4791 
4792  /*
4793  * Note that we don't include the list of conflicts in our out in the
4794  * statefile, because new conflicts can be added even after the
4795  * transaction prepares. We'll just make a conservative assumption during
4796  * recovery instead.
4797  */
4798 
4800  &record, sizeof(record));
4801 
4802  /*
4803  * Generate a lock record for each lock.
4804  *
4805  * To do this, we need to walk the predicate lock list in our sxact rather
4806  * than using the local predicate lock table because the latter is not
4807  * guaranteed to be accurate.
4808  */
4809  LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
4810 
4811  /*
4812  * No need to take sxact->perXactPredicateListLock in parallel mode
4813  * because there cannot be any parallel workers running while we are
4814  * preparing a transaction.
4815  */
4817 
4818  dlist_foreach(iter, &sxact->predicateLocks)
4819  {
4820  PREDICATELOCK *predlock =
4821  dlist_container(PREDICATELOCK, xactLink, iter.cur);
4822 
4824  lockRecord->target = predlock->tag.myTarget->tag;
4825 
4827  &record, sizeof(record));
4828  }
4829 
4830  LWLockRelease(SerializablePredicateListLock);
4831 }
4832 
4833 /*
4834  * PostPrepare_Locks
4835  * Clean up after successful PREPARE. Unlike the non-predicate
4836  * lock manager, we do not need to transfer locks to a dummy
4837  * PGPROC because our SERIALIZABLEXACT will stay around
4838  * anyway. We only need to clean up our local state.
4839  */
4840 void
4842 {
4844  return;
4845 
4847 
4848  MySerializableXact->pid = 0;
4850 
4852  LocalPredicateLockHash = NULL;
4853 
4855  MyXactDidWrite = false;
4856 }
4857 
4858 /*
4859  * PredicateLockTwoPhaseFinish
4860  * Release a prepared transaction's predicate locks once it
4861  * commits or aborts.
4862  */
4863 void
4865 {
4866  SERIALIZABLEXID *sxid;
4867  SERIALIZABLEXIDTAG sxidtag;
4868 
4869  sxidtag.xid = xid;
4870 
4871  LWLockAcquire(SerializableXactHashLock, LW_SHARED);
4872  sxid = (SERIALIZABLEXID *)
4873  hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
4874  LWLockRelease(SerializableXactHashLock);
4875 
4876  /* xid will not be found if it wasn't a serializable transaction */
4877  if (sxid == NULL)
4878  return;
4879 
4880  /* Release its locks */
4881  MySerializableXact = sxid->myXact;
4882  MyXactDidWrite = true; /* conservatively assume that we wrote
4883  * something */
4884  ReleasePredicateLocks(isCommit, false);
4885 }
4886 
4887 /*
4888  * Re-acquire a predicate lock belonging to a transaction that was prepared.
4889  */
4890 void
4892  void *recdata, uint32 len)
4893 {
4894  TwoPhasePredicateRecord *record;
4895 
4896  Assert(len == sizeof(TwoPhasePredicateRecord));
4897 
4898  record = (TwoPhasePredicateRecord *) recdata;
4899 
4900  Assert((record->type == TWOPHASEPREDICATERECORD_XACT) ||
4901  (record->type == TWOPHASEPREDICATERECORD_LOCK));
4902 
4903  if (record->type == TWOPHASEPREDICATERECORD_XACT)
4904  {
4905  /* Per-transaction record. Set up a SERIALIZABLEXACT. */
4906  TwoPhasePredicateXactRecord *xactRecord;
4907  SERIALIZABLEXACT *sxact;
4908  SERIALIZABLEXID *sxid;
4909  SERIALIZABLEXIDTAG sxidtag;
4910  bool found;
4911 
4912  xactRecord = (TwoPhasePredicateXactRecord *) &record->data.xactRecord;
4913 
4914  LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4915  sxact = CreatePredXact();
4916  if (!sxact)
4917  ereport(ERROR,
4918  (errcode(ERRCODE_OUT_OF_MEMORY),
4919  errmsg("out of shared memory")));
4920 
4921  /* vxid for a prepared xact is INVALID_PROC_NUMBER/xid; no pid */
4924  sxact->pid = 0;
4925  sxact->pgprocno = INVALID_PROC_NUMBER;
4926 
4927  /* a prepared xact hasn't committed yet */
4931 
4933 
4934  /*
4935  * Don't need to track this; no transactions running at the time the
4936  * recovered xact started are still active, except possibly other
4937  * prepared xacts and we don't care whether those are RO_SAFE or not.
4938  */
4940 
4941  dlist_init(&(sxact->predicateLocks));
4942  dlist_node_init(&sxact->finishedLink);
4943 
4944  sxact->topXid = xid;
4945  sxact->xmin = xactRecord->xmin;
4946  sxact->flags = xactRecord->flags;
4947  Assert(SxactIsPrepared(sxact));
4948  if (!SxactIsReadOnly(sxact))
4949  {
4953  }
4954 
4955  /*
4956  * We don't know whether the transaction had any conflicts or not, so
4957  * we'll conservatively assume that it had both a conflict in and a
4958  * conflict out, and represent that with the summary conflict flags.
4959  */
4960  dlist_init(&(sxact->outConflicts));
4961  dlist_init(&(sxact->inConflicts));
4964 
4965  /* Register the transaction's xid */
4966  sxidtag.xid = xid;
4968  &sxidtag,
4969  HASH_ENTER, &found);
4970  Assert(sxid != NULL);
4971  Assert(!found);
4972  sxid->myXact = (SERIALIZABLEXACT *) sxact;
4973 
4974  /*
4975  * Update global xmin. Note that this is a special case compared to
4976  * registering a normal transaction, because the global xmin might go
4977  * backwards. That's OK, because until recovery is over we're not
4978  * going to complete any transactions or create any non-prepared
4979  * transactions, so there's no danger of throwing away.
4980  */
4983  {
4984  PredXact->SxactGlobalXmin = sxact->xmin;
4986  SerialSetActiveSerXmin(sxact->xmin);
4987  }
4988  else if (TransactionIdEquals(sxact->xmin, PredXact->SxactGlobalXmin))
4989  {
4992  }
4993 
4994  LWLockRelease(SerializableXactHashLock);
4995  }
4996  else if (record->type == TWOPHASEPREDICATERECORD_LOCK)
4997  {
4998  /* Lock record. Recreate the PREDICATELOCK */
4999  TwoPhasePredicateLockRecord *lockRecord;
5000  SERIALIZABLEXID *sxid;
5001  SERIALIZABLEXACT *sxact;
5002  SERIALIZABLEXIDTAG sxidtag;
5003  uint32 targettaghash;
5004 
5005  lockRecord = (TwoPhasePredicateLockRecord *) &record->data.lockRecord;
5006  targettaghash = PredicateLockTargetTagHashCode(&lockRecord->target);
5007 
5008  LWLockAcquire(SerializableXactHashLock, LW_SHARED);
5009  sxidtag.xid = xid;
5010  sxid = (SERIALIZABLEXID *)
5011  hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
5012  LWLockRelease(SerializableXactHashLock);
5013 
5014  Assert(sxid != NULL);
5015  sxact = sxid->myXact;
5016  Assert(sxact != InvalidSerializableXact);
5017 
5018  CreatePredicateLock(&lockRecord->target, targettaghash, sxact);
5019  }
5020 }
5021 
5022 /*
5023  * Prepare to share the current SERIALIZABLEXACT with parallel workers.
5024  * Return a handle object that can be used by AttachSerializableXact() in a
5025  * parallel worker.
5026  */
5029 {
5030  return MySerializableXact;
5031 }
5032 
5033 /*
5034  * Allow parallel workers to import the leader's SERIALIZABLEXACT.
5035  */
5036 void
5038 {
5039 
5041 
5042  MySerializableXact = (SERIALIZABLEXACT *) handle;
5045 }
bool ParallelContextActive(void)
Definition: parallel.c:1006
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
unsigned short uint16
Definition: c.h:492
#define unconstify(underlying_type, expr)
Definition: c.h:1232
unsigned int uint32
Definition: c.h:493
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:169
uint32 LocalTransactionId
Definition: c.h:641
uint32 TransactionId
Definition: c.h:639
size_t Size
Definition: c.h:592
void hash_destroy(HTAB *hashp)
Definition: dynahash.c:863
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:953
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:350
long hash_get_num_entries(HTAB *hashp)
Definition: dynahash.c:1377
Size hash_estimate_size(long num_entries, Size entrysize)
Definition: dynahash.c:781
void * hash_search_with_hash_value(HTAB *hashp, const void *keyPtr, uint32 hashvalue, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:966
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1431
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1421
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1160
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1233
int errdetail(const char *fmt,...)
Definition: elog.c:1206
int errhint(const char *fmt,...)
Definition: elog.c:1320
int errcode(int sqlerrcode)
Definition: elog.c:860
int errmsg(const char *fmt,...)
Definition: elog.c:1073
#define DEBUG2
Definition: elog.h:29
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
#define ereport(elevel,...)
Definition: elog.h:149
int MyProcPid
Definition: globals.c:45
ProcNumber MyProcNumber
Definition: globals.c:86
bool IsUnderPostmaster
Definition: globals.c:116
int MaxBackends
Definition: globals.c:143
int serializable_buffers
Definition: globals.c:166
#define newval
GucSource
Definition: guc.h:108
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_REMOVE
Definition: hsearch.h:115
@ HASH_ENTER
Definition: hsearch.h:114
@ HASH_ENTER_NULL
Definition: hsearch.h:116
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_FUNCTION
Definition: hsearch.h:98
#define HASH_BLOBS
Definition: hsearch.h:97
#define HASH_FIXED_SIZE
Definition: hsearch.h:105
#define HASH_PARTITION
Definition: hsearch.h:92
#define dlist_foreach(iter, lhead)
Definition: ilist.h:623
static void dlist_init(dlist_head *head)
Definition: ilist.h:314
#define dlist_head_element(type, membername, lhead)
Definition: ilist.h:603
static void dlist_delete_thoroughly(dlist_node *node)
Definition: ilist.h:416
static void dlist_delete(dlist_node *node)
Definition: ilist.h:405
static dlist_node * dlist_pop_head_node(dlist_head *head)
Definition: ilist.h:450
#define dlist_foreach_modify(iter, lhead)
Definition: ilist.h:640
static bool dlist_is_empty(const dlist_head *head)
Definition: ilist.h:336
static void dlist_push_tail(dlist_head *head, dlist_node *node)
Definition: ilist.h:364
static void dlist_node_init(dlist_node *node)
Definition: ilist.h:325
#define dlist_container(type, membername, ptr)
Definition: ilist.h:593
#define IsParallelWorker()
Definition: parallel.h:61
FILE * output
long val
Definition: informix.c:664
static bool success
Definition: initdb.c:184
int i
Definition: isn.c:73
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
Assert(fmt[strlen(fmt) - 1] !='\n')
exit(1)
#define GET_VXID_FROM_PGPROC(vxid_dst, proc)
Definition: lock.h:77
#define SetInvalidVirtualTransactionId(vxid)
Definition: lock.h:74
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1900
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1175
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1944
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1788
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:710
@ LWTRANCHE_SERIAL_SLRU
Definition: lwlock.h:216
@ LWTRANCHE_SERIAL_BUFFER
Definition: lwlock.h:187
@ LWTRANCHE_PER_XACT_PREDICATE_LIST
Definition: lwlock.h:204
@ LW_SHARED
Definition: lwlock.h:117
@ LW_EXCLUSIVE
Definition: lwlock.h:116
#define NUM_PREDICATELOCK_PARTITIONS
Definition: lwlock.h:103
void * palloc(Size size)
Definition: mcxt.c:1201
#define InvalidPid
Definition: miscadmin.h:32
const void size_t len
const void * data
static bool pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
Definition: pg_lfind.h:90
static rewind_source * source
Definition: pg_rewind.c:89
#define ERRCODE_T_R_SERIALIZATION_FAILURE
Definition: pgbench.c:76
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
void CheckPointPredicate(void)
Definition: predicate.c:1033
void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3126
static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag)
Definition: predicate.c:2373
static HTAB * PredicateLockHash
Definition: predicate.c:400
static void SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact, SERIALIZABLEXACT *activeXact)
Definition: predicate.c:668
#define PredicateLockTargetTagHashCode(predicatelocktargettag)
Definition: predicate.c:305
static void SetNewSxactGlobalXmin(void)
Definition: predicate.c:3233
void PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit)
Definition: predicate.c:4864
PredicateLockData * GetPredicateLockStatusData(void)
Definition: predicate.c:1427
#define SerialPage(xid)
Definition: predicate.c:345
void InitPredicateLocks(void)
Definition: predicate.c:1137
static void ReleasePredXact(SERIALIZABLEXACT *sxact)
Definition: predicate.c:598
void SetSerializableTransactionSnapshot(Snapshot snapshot, VirtualTransactionId *sourcevxid, int sourcepid)
Definition: predicate.c:1704
static bool RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer)
Definition: predicate.c:612
static bool PredicateLockingNeededForRelation(Relation relation)
Definition: predicate.c:500
static bool SerializationNeededForRead(Relation relation, Snapshot snapshot)
Definition: predicate.c:518
static Snapshot GetSafeSnapshot(Snapshot origSnapshot)
Definition: predicate.c:1540
#define SxactIsCommitted(sxact)
Definition: predicate.c:279
static SerialControl serialControl
Definition: predicate.c:356
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2581
#define SxactIsROUnsafe(sxact)
Definition: predicate.c:294
static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot, VirtualTransactionId *sourcevxid, int sourcepid)
Definition: predicate.c:1746
static LWLock * ScratchPartitionLock
Definition: predicate.c:410
static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag)
Definition: predicate.c:2499
#define SxactIsDeferrableWaiting(sxact)
Definition: predicate.c:292
static void ReleasePredicateLocksLocal(void)
Definition: predicate.c:3661
static HTAB * LocalPredicateLockHash
Definition: predicate.c:416
int max_predicate_locks_per_page
Definition: predicate.c:375
struct SerialControlData * SerialControl
Definition: predicate.c:354
static PredXactList PredXact
Definition: predicate.c:386
static void SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
Definition: predicate.c:645
int GetSafeSnapshotBlockingPids(int blocked_pid, int *output, int output_size)
Definition: predicate.c:1610
static uint32 ScratchTargetTagHash
Definition: predicate.c:409
static void RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash)
Definition: predicate.c:2165
static uint32 predicatelock_hash(const void *key, Size keysize)
Definition: predicate.c:1401
void CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot)
Definition: predicate.c:4005
#define SxactIsReadOnly(sxact)
Definition: predicate.c:283
#define SerialNextPage(page)
Definition: predicate.c:339
static void DropAllPredicateLocksFromTable(Relation relation, bool transfer)
Definition: predicate.c:2919
bool PageIsPredicateLocked(Relation relation, BlockNumber blkno)
Definition: predicate.c:1990
static SlruCtlData SerialSlruCtlData
Definition: predicate.c:326
static void CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag, uint32 targettaghash, SERIALIZABLEXACT *sxact)
Definition: predicate.c:2435
static void SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo)
Definition: predicate.c:860
static void ClearOldPredicateLocks(void)
Definition: predicate.c:3679
#define SxactHasSummaryConflictIn(sxact)
Definition: predicate.c:284
static SERIALIZABLEXACT * CreatePredXact(void)
Definition: predicate.c:584
static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag, PREDICATELOCKTARGETTAG *parent)
Definition: predicate.c:2054
#define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash)
Definition: predicate.c:318
static void RestoreScratchTarget(bool lockheld)
Definition: predicate.c:2143
#define SerialValue(slotno, xid)
Definition: predicate.c:341
static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag)
Definition: predicate.c:2196
static void DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash)
Definition: predicate.c:2651
static SERIALIZABLEXACT * OldCommittedSxact
Definition: predicate.c:364
#define SxactHasConflictOut(sxact)
Definition: predicate.c:291
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4318
static bool MyXactDidWrite
Definition: predicate.c:424
static int MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag)
Definition: predicate.c:2271
static void FlagSxactUnsafe(SERIALIZABLEXACT *sxact)
Definition: predicate.c:701
static void SerialInit(void)
Definition: predicate.c:808
void CheckTableForSerializableConflictIn(Relation relation)
Definition: predicate.c:4401
#define SxactIsPrepared(sxact)
Definition: predicate.c:280
void PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot, TransactionId tuple_xid)
Definition: predicate.c:2603
void AttachSerializableXact(SerializableXactHandle handle)
Definition: predicate.c:5037
struct SerialControlData SerialControlData
SerializableXactHandle ShareSerializableXact(void)
Definition: predicate.c:5028
static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag)
Definition: predicate.c:2027
static void RemoveScratchTarget(bool lockheld)
Definition: predicate.c:2122
#define SxactIsOnFinishedList(sxact)
Definition: predicate.c:269
#define SxactIsPartiallyReleased(sxact)
Definition: predicate.c:295
static void SerialSetActiveSerXmin(TransactionId xid)
Definition: predicate.c:982
static dlist_head * FinishedSerializableTransactions
Definition: predicate.c:401
static bool SerializationNeededForWrite(Relation relation)
Definition: predicate.c:562
static HTAB * SerializableXidHash
Definition: predicate.c:398
static bool CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag)
Definition: predicate.c:2308
void PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)
Definition: predicate.c:3211
static bool SerialPagePrecedesLogically(int64 page1, int64 page2)
Definition: predicate.c:733
static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
Definition: predicate.c:4148
int max_predicate_locks_per_relation
Definition: predicate.c:374
#define SxactIsROSafe(sxact)
Definition: predicate.c:293
void PreCommit_CheckForSerializationFailure(void)
Definition: predicate.c:4685
void ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
Definition: predicate.c:3294
static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
Definition: predicate.c:4483
static const PREDICATELOCKTARGETTAG ScratchTargetTag
Definition: predicate.c:408
#define PredicateLockHashPartitionLockByIndex(i)
Definition: predicate.c:263
static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
Definition: predicate.c:4518
static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag)
Definition: predicate.c:2093
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2558
static SERIALIZABLEXACT * MySerializableXact
Definition: predicate.c:423
void predicatelock_twophase_recover(TransactionId xid, uint16 info, void *recdata, uint32 len)
Definition: predicate.c:4891
Size PredicateLockShmemSize(void)
Definition: predicate.c:1339
#define SxactIsDoomed(sxact)
Definition: predicate.c:282
#define NPREDICATELOCKTARGETENTS()
Definition: predicate.c:266
static SerCommitSeqNo SerialGetMinConflictCommitSeqNo(TransactionId xid)
Definition: predicate.c:941
static void SummarizeOldestCommittedSxact(void)
Definition: predicate.c:1485
bool check_serial_buffers(int *newval, void **extra, GucSource source)
Definition: predicate.c:849
#define TargetTagIsCoveredBy(covered_target, covering_target)
Definition: predicate.c:235
static RWConflictPoolHeader RWConflictPool
Definition: predicate.c:392
static void ReleaseRWConflict(RWConflict conflict)
Definition: predicate.c:693
static bool TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag, PREDICATELOCKTARGETTAG newtargettag, bool removeOld)
Definition: predicate.c:2712
void AtPrepare_PredicateLocks(void)
Definition: predicate.c:4772
void RegisterPredicateLockingXid(TransactionId xid)
Definition: predicate.c:1941
#define PredicateLockHashPartitionLock(hashcode)
Definition: predicate.c:260
#define SERIAL_ENTRIESPERPAGE
Definition: predicate.c:332
static bool XidIsConcurrent(TransactionId xid)
Definition: predicate.c:3954
static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial, bool summarize)
Definition: predicate.c:3817
static HTAB * PredicateLockTargetHash
Definition: predicate.c:399
bool CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot)
Definition: predicate.c:3973
#define SxactIsRolledBack(sxact)
Definition: predicate.c:281
static SERIALIZABLEXACT * SavedSerializableXact
Definition: predicate.c:433
#define SxactHasSummaryConflictOut(sxact)
Definition: predicate.c:285
void TransferPredicateLocksToHeapRelation(Relation relation)
Definition: predicate.c:3105
void PostPrepare_PredicateLocks(TransactionId xid)
Definition: predicate.c:4841
static void CreateLocalPredicateLockHash(void)
Definition: predicate.c:1922
#define SerialSlruCtl
Definition: predicate.c:328
int max_predicate_locks_per_xact
Definition: predicate.c:373
Snapshot GetSerializableTransactionSnapshot(Snapshot snapshot)
Definition: predicate.c:1664
void * SerializableXactHandle
Definition: predicate.h:33
#define RWConflictDataSize
#define SXACT_FLAG_DEFERRABLE_WAITING
#define SXACT_FLAG_SUMMARY_CONFLICT_IN
@ TWOPHASEPREDICATERECORD_XACT
@ TWOPHASEPREDICATERECORD_LOCK
#define FirstNormalSerCommitSeqNo
#define InvalidSerCommitSeqNo
@ PREDLOCKTAG_RELATION
@ PREDLOCKTAG_PAGE
@ PREDLOCKTAG_TUPLE
struct PREDICATELOCKTAG PREDICATELOCKTAG
#define SXACT_FLAG_CONFLICT_OUT
#define PredXactListDataSize
#define SXACT_FLAG_READ_ONLY
#define SXACT_FLAG_DOOMED
struct LOCALPREDICATELOCK LOCALPREDICATELOCK
#define GET_PREDICATELOCKTARGETTAG_DB(locktag)
#define GET_PREDICATELOCKTARGETTAG_RELATION(locktag)
#define RWConflictPoolHeaderDataSize
struct SERIALIZABLEXIDTAG SERIALIZABLEXIDTAG
#define InvalidSerializableXact
struct PREDICATELOCKTARGET PREDICATELOCKTARGET
#define SET_PREDICATELOCKTARGETTAG_PAGE(locktag, dboid, reloid, blocknum)
#define RecoverySerCommitSeqNo
struct PREDICATELOCKTARGETTAG PREDICATELOCKTARGETTAG
struct SERIALIZABLEXID SERIALIZABLEXID
#define GET_PREDICATELOCKTARGETTAG_TYPE(locktag)
#define SET_PREDICATELOCKTARGETTAG_RELATION(locktag, dboid, reloid)
uint64 SerCommitSeqNo
#define SXACT_FLAG_ROLLED_BACK
#define SXACT_FLAG_COMMITTED
#define SXACT_FLAG_RO_UNSAFE
#define SXACT_FLAG_PREPARED
#define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag, dboid, reloid, blocknum, offnum)
#define SXACT_FLAG_PARTIALLY_RELEASED
#define GET_PREDICATELOCKTARGETTAG_PAGE(locktag)
#define SXACT_FLAG_RO_SAFE
struct PREDICATELOCK PREDICATELOCK
#define SXACT_FLAG_SUMMARY_CONFLICT_OUT
#define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag)
Snapshot GetSnapshotData(Snapshot snapshot)
Definition: procarray.c:2166
bool ProcArrayInstallImportedXmin(TransactionId xmin, VirtualTransactionId *sourcevxid)
Definition: procarray.c:2525
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:636
bool ShmemAddrIsValid(const void *addr)
Definition: shmem.c:275
void * ShmemAlloc(Size size)
Definition: shmem.c:153
Size add_size(Size s1, Size s2)
Definition: shmem.c:494
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:388
Size mul_size(Size s1, Size s2)
Definition: shmem.c:511
HTAB * ShmemInitHash(const char *name, long init_size, long max_size, HASHCTL *infoP, int hash_flags)
Definition: shmem.c:333
static pg_noinline void Size size
Definition: slab.c:607
void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, const char *subdir, int buffer_tranche_id, int bank_tranche_id, SyncRequestHandler sync_handler, bool long_segment_names)
Definition: slru.c:238
int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid)
Definition: slru.c:592
void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
Definition: slru.c:1305
int SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok, TransactionId xid)
Definition: slru.c:488
int SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
Definition: slru.c:361
void SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
Definition: slru.c:1391
Size SimpleLruShmemSize(int nslots, int nlsns)
Definition: slru.c:184
bool check_slru_buffers(const char *name, int *newval)
Definition: slru.c:341
static LWLock * SimpleLruGetBankLock(SlruCtl ctl, int64 pageno)
Definition: slru.h:179
#define SlruPagePrecedesUnitTests(ctl, per_page)
Definition: slru.h:203
#define SLRU_PAGES_PER_SEGMENT
Definition: slru.h:39
Snapshot GetTransactionSnapshot(void)
Definition: snapmgr.c:223
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:62
void ProcSendSignal(ProcNumber procNumber)
Definition: proc.c:1864
PGPROC * MyProc
Definition: proc.c:68
void ProcWaitForSignal(uint32 wait_event_info)
Definition: