PostgreSQL Source Code  git master
predicate_internals.h
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * predicate_internals.h
4  * POSTGRES internal predicate locking definitions.
5  *
6  *
7  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * src/include/storage/predicate_internals.h
11  *
12  *-------------------------------------------------------------------------
13  */
14 #ifndef PREDICATE_INTERNALS_H
15 #define PREDICATE_INTERNALS_H
16 
17 #include "storage/lock.h"
18 #include "storage/lwlock.h"
19 
20 /*
21  * Commit number.
22  */
23 typedef uint64 SerCommitSeqNo;
24 
25 /*
26  * Reserved commit sequence numbers:
27  * - 0 is reserved to indicate a non-existent SLRU entry; it cannot be
28  * used as a SerCommitSeqNo, even an invalid one
29  * - InvalidSerCommitSeqNo is used to indicate a transaction that
30  * hasn't committed yet, so use a number greater than all valid
31  * ones to make comparison do the expected thing
32  * - RecoverySerCommitSeqNo is used to refer to transactions that
33  * happened before a crash/recovery, since we restart the sequence
34  * at that point. It's earlier than all normal sequence numbers,
35  * and is only used by recovered prepared transactions
36  */
37 #define InvalidSerCommitSeqNo ((SerCommitSeqNo) PG_UINT64_MAX)
38 #define RecoverySerCommitSeqNo ((SerCommitSeqNo) 1)
39 #define FirstNormalSerCommitSeqNo ((SerCommitSeqNo) 2)
40 
41 /*
42  * The SERIALIZABLEXACT struct contains information needed for each
43  * serializable database transaction to support SSI techniques.
44  *
45  * A home-grown list is maintained in shared memory to manage these.
46  * An entry is used when the serializable transaction acquires a snapshot.
47  * Unless the transaction is rolled back, this entry must generally remain
48  * until all concurrent transactions have completed. (There are special
49  * optimizations for READ ONLY transactions which often allow them to be
50  * cleaned up earlier.) A transaction which is rolled back is cleaned up
51  * as soon as possible.
52  *
53  * Eligibility for cleanup of committed transactions is generally determined
54  * by comparing the transaction's finishedBefore field to
55  * SxactGlobalXmin.
56  */
57 typedef struct SERIALIZABLEXACT
58 {
59  VirtualTransactionId vxid; /* The executing process always has one of
60  * these. */
61 
62  /*
63  * We use two numbers to track the order that transactions commit. Before
64  * commit, a transaction is marked as prepared, and prepareSeqNo is set.
65  * Shortly after commit, it's marked as committed, and commitSeqNo is set.
66  * This doesn't give a strict commit order, but these two values together
67  * are good enough for us, as we can always err on the safe side and
68  * assume that there's a conflict, if we can't be sure of the exact
69  * ordering of two commits.
70  *
71  * Note that a transaction is marked as prepared for a short period during
72  * commit processing, even if two-phase commit is not used. But with
73  * two-phase commit, a transaction can stay in prepared state for some
74  * time.
75  */
78 
79  /* these values are not both interesting at the same time */
80  union
81  {
82  SerCommitSeqNo earliestOutConflictCommit; /* when committed with
83  * conflict out */
84  SerCommitSeqNo lastCommitBeforeSnapshot; /* when not committed or
85  * no conflict out */
86  } SeqNo;
87  SHM_QUEUE outConflicts; /* list of write transactions whose data we
88  * couldn't read. */
89  SHM_QUEUE inConflicts; /* list of read transactions which couldn't
90  * see our write. */
91  SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */
92  SHM_QUEUE finishedLink; /* list link in
93  * FinishedSerializableTransactions */
94 
95  LWLock predicateLockListLock; /* protects predicateLocks in parallel
96  * mode */
97 
98  /*
99  * for r/o transactions: list of concurrent r/w transactions that we could
100  * potentially have conflicts with, and vice versa for r/w transactions
101  */
103 
104  TransactionId topXid; /* top level xid for the transaction, if one
105  * exists; else invalid */
106  TransactionId finishedBefore; /* invalid means still running; else the
107  * struct expires when no serializable
108  * xids are before this. */
109  TransactionId xmin; /* the transaction's snapshot xmin */
110  uint32 flags; /* OR'd combination of values defined below */
111  int pid; /* pid of associated process */
113 
114 #define SXACT_FLAG_COMMITTED 0x00000001 /* already committed */
115 #define SXACT_FLAG_PREPARED 0x00000002 /* about to commit */
116 #define SXACT_FLAG_ROLLED_BACK 0x00000004 /* already rolled back */
117 #define SXACT_FLAG_DOOMED 0x00000008 /* will roll back */
118 /*
119  * The following flag actually means that the flagged transaction has a
120  * conflict out *to a transaction which committed ahead of it*. It's hard
121  * to get that into a name of a reasonable length.
122  */
123 #define SXACT_FLAG_CONFLICT_OUT 0x00000010
124 #define SXACT_FLAG_READ_ONLY 0x00000020
125 #define SXACT_FLAG_DEFERRABLE_WAITING 0x00000040
126 #define SXACT_FLAG_RO_SAFE 0x00000080
127 #define SXACT_FLAG_RO_UNSAFE 0x00000100
128 #define SXACT_FLAG_SUMMARY_CONFLICT_IN 0x00000200
129 #define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400
130 /*
131  * The following flag means the transaction has been partially released
132  * already, but is being preserved because parallel workers might have a
133  * reference to it. It'll be recycled by the leader at end-of-transaction.
134  */
135 #define SXACT_FLAG_PARTIALLY_RELEASED 0x00000800
136 
137 /*
138  * The following types are used to provide an ad hoc list for holding
139  * SERIALIZABLEXACT objects. An HTAB is overkill, since there is no need to
140  * access these by key -- there are direct pointers to these objects where
141  * needed. If a shared memory list is created, these types can probably be
142  * eliminated in favor of using the general solution.
143  */
145 {
149 
151 
152 #define PredXactListElementDataSize \
153  ((Size)MAXALIGN(sizeof(PredXactListElementData)))
154 
155 typedef struct PredXactListData
156 {
159 
160  /*
161  * These global variables are maintained when registering and cleaning up
162  * serializable transactions. They must be global across all backends,
163  * but are not needed outside the predicate.c source file. Protected by
164  * SerializableXactHashLock.
165  */
166  TransactionId SxactGlobalXmin; /* global xmin for active serializable
167  * transactions */
168  int SxactGlobalXminCount; /* how many active serializable
169  * transactions have this xmin */
170  int WritableSxactCount; /* how many non-read-only serializable
171  * transactions are active */
172  SerCommitSeqNo LastSxactCommitSeqNo; /* a strictly monotonically
173  * increasing number for commits
174  * of serializable transactions */
175  /* Protected by SerializableXactHashLock. */
176  SerCommitSeqNo CanPartialClearThrough; /* can clear predicate locks and
177  * inConflicts for committed
178  * transactions through this seq
179  * no */
180  /* Protected by SerializableFinishedListLock. */
181  SerCommitSeqNo HavePartialClearedThrough; /* have cleared through this
182  * seq no */
183  SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */
184 
185  PredXactListElement element;
187 
189 
190 #define PredXactListDataSize \
191  ((Size)MAXALIGN(sizeof(PredXactListData)))
192 
193 
194 /*
195  * The following types are used to provide lists of rw-conflicts between
196  * pairs of transactions. Since exactly the same information is needed,
197  * they are also used to record possible unsafe transaction relationships
198  * for purposes of identifying safe snapshots for read-only transactions.
199  *
200  * When a RWConflictData is not in use to record either type of relationship
201  * between a pair of transactions, it is kept on an "available" list. The
202  * outLink field is used for maintaining that list.
203  */
204 typedef struct RWConflictData
205 {
206  SHM_QUEUE outLink; /* link for list of conflicts out from a sxact */
207  SHM_QUEUE inLink; /* link for list of conflicts in to a sxact */
211 
212 typedef struct RWConflictData *RWConflict;
213 
214 #define RWConflictDataSize \
215  ((Size)MAXALIGN(sizeof(RWConflictData)))
216 
218 {
220  RWConflict element;
222 
224 
225 #define RWConflictPoolHeaderDataSize \
226  ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData)))
227 
228 
229 /*
230  * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
231  * transaction or any of its subtransactions.
232  */
233 typedef struct SERIALIZABLEXIDTAG
234 {
237 
238 /*
239  * The SERIALIZABLEXID struct provides a link from a TransactionId for a
240  * serializable transaction to the related SERIALIZABLEXACT record, even if
241  * the transaction has completed and its connection has been closed.
242  *
243  * These are created as new top level transaction IDs are first assigned to
244  * transactions which are participating in predicate locking. This may
245  * never happen for a particular transaction if it doesn't write anything.
246  * They are removed with their related serializable transaction objects.
247  *
248  * The SubTransGetTopmostTransaction method is used where necessary to get
249  * from an XID which might be from a subtransaction to the top level XID.
250  */
251 typedef struct SERIALIZABLEXID
252 {
253  /* hash key */
255 
256  /* data */
257  SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */
259 
260 
261 /*
262  * The PREDICATELOCKTARGETTAG struct identifies a database object which can
263  * be the target of predicate locks.
264  *
265  * Note that the hash function being used doesn't properly respect tag
266  * length -- if the length of the structure isn't a multiple of four bytes it
267  * will go to a four byte boundary past the end of the tag. If you change
268  * this struct, make sure any slack space is initialized, so that any random
269  * bytes in the middle or at the end are not included in the hash.
270  *
271  * TODO SSI: If we always use the same fields for the same type of value, we
272  * should rename these. Holding off until it's clear there are no exceptions.
273  * Since indexes are relations with blocks and tuples, it's looking likely that
274  * the rename will be possible. If not, we may need to divide the last field
275  * and use part of it for a target type, so that we know how to interpret the
276  * data..
277  */
279 {
280  uint32 locktag_field1; /* a 32-bit ID field */
281  uint32 locktag_field2; /* a 32-bit ID field */
282  uint32 locktag_field3; /* a 32-bit ID field */
283  uint32 locktag_field4; /* a 32-bit ID field */
285 
286 /*
287  * The PREDICATELOCKTARGET struct represents a database object on which there
288  * are predicate locks.
289  *
290  * A hash list of these objects is maintained in shared memory. An entry is
291  * added when a predicate lock is requested on an object which doesn't
292  * already have one. An entry is removed when the last lock is removed from
293  * its list.
294  */
295 typedef struct PREDICATELOCKTARGET
296 {
297  /* hash key */
298  PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
299 
300  /* data */
301  SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with
302  * predicate lock target */
304 
305 
306 /*
307  * The PREDICATELOCKTAG struct identifies an individual predicate lock.
308  *
309  * It is the combination of predicate lock target (which is a lockable
310  * object) and a serializable transaction which has acquired a lock on that
311  * target.
312  */
313 typedef struct PREDICATELOCKTAG
314 {
318 
319 /*
320  * The PREDICATELOCK struct represents an individual lock.
321  *
322  * An entry can be created here when the related database object is read, or
323  * by promotion of multiple finer-grained targets. All entries related to a
324  * serializable transaction are removed when that serializable transaction is
325  * cleaned up. Entries can also be removed when they are combined into a
326  * single coarser-grained lock entry.
327  */
328 typedef struct PREDICATELOCK
329 {
330  /* hash key */
331  PREDICATELOCKTAG tag; /* unique identifier of lock */
332 
333  /* data */
334  SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of
335  * predicate locks */
336  SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of
337  * predicate locks */
338  SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */
339 } PREDICATELOCK;
340 
341 
342 /*
343  * The LOCALPREDICATELOCK struct represents a local copy of data which is
344  * also present in the PREDICATELOCK table, organized for fast access without
345  * needing to acquire a LWLock. It is strictly for optimization.
346  *
347  * Each serializable transaction creates its own local hash table to hold a
348  * collection of these. This information is used to determine when a number
349  * of fine-grained locks should be promoted to a single coarser-grained lock.
350  * The information is maintained more-or-less in parallel to the
351  * PREDICATELOCK data, but because this data is not protected by locks and is
352  * only used in an optimization heuristic, it is allowed to drift in a few
353  * corner cases where maintaining exact data would be expensive.
354  *
355  * The hash table is created when the serializable transaction acquires its
356  * snapshot, and its memory is released upon completion of the transaction.
357  */
358 typedef struct LOCALPREDICATELOCK
359 {
360  /* hash key */
361  PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
362 
363  /* data */
364  bool held; /* is lock held, or just its children? */
365  int childLocks; /* number of child locks currently held */
367 
368 
369 /*
370  * The types of predicate locks which can be acquired.
371  */
373 {
377  /* TODO SSI: Other types may be needed for index locking */
379 
380 
381 /*
382  * This structure is used to quickly capture a copy of all predicate
383  * locks. This is currently used only by the pg_lock_status function,
384  * which in turn is used by the pg_locks view.
385  */
386 typedef struct PredicateLockData
387 {
392 
393 
394 /*
395  * These macros define how we map logical IDs of lockable objects into the
396  * physical fields of PREDICATELOCKTARGETTAG. Use these to set up values,
397  * rather than accessing the fields directly. Note multiple eval of target!
398  */
399 #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
400  ((locktag).locktag_field1 = (dboid), \
401  (locktag).locktag_field2 = (reloid), \
402  (locktag).locktag_field3 = InvalidBlockNumber, \
403  (locktag).locktag_field4 = InvalidOffsetNumber)
404 
405 #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
406  ((locktag).locktag_field1 = (dboid), \
407  (locktag).locktag_field2 = (reloid), \
408  (locktag).locktag_field3 = (blocknum), \
409  (locktag).locktag_field4 = InvalidOffsetNumber)
410 
411 #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
412  ((locktag).locktag_field1 = (dboid), \
413  (locktag).locktag_field2 = (reloid), \
414  (locktag).locktag_field3 = (blocknum), \
415  (locktag).locktag_field4 = (offnum))
416 
417 #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
418  ((Oid) (locktag).locktag_field1)
419 #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
420  ((Oid) (locktag).locktag_field2)
421 #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
422  ((BlockNumber) (locktag).locktag_field3)
423 #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
424  ((OffsetNumber) (locktag).locktag_field4)
425 #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \
426  (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
427  (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \
428  PREDLOCKTAG_RELATION))
429 
430 /*
431  * Two-phase commit statefile records. There are two types: for each
432  * transaction, we generate one per-transaction record and a variable
433  * number of per-predicate-lock records.
434  */
436 {
440 
441 /*
442  * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not
443  * much is needed because most of it not meaningful for a recovered
444  * prepared transaction.
445  *
446  * In particular, we do not record the in and out conflict lists for a
447  * prepared transaction because the associated SERIALIZABLEXACTs will
448  * not be available after recovery. Instead, we simply record the
449  * existence of each type of conflict by setting the transaction's
450  * summary conflict in/out flag.
451  */
453 {
457 
458 /* Per-lock state */
460 {
462  uint32 filler; /* to avoid length change in back-patched fix */
464 
466 {
468  union
469  {
472  } data;
474 
475 /*
476  * Define a macro to use for an "empty" SERIALIZABLEXACT reference.
477  */
478 #define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL)
479 
480 
481 /*
482  * Function definitions for functions needing awareness of predicate
483  * locking internals.
484  */
486 extern int GetSafeSnapshotBlockingPids(int blocked_pid,
487  int *output, int output_size);
488 
489 #endif /* PREDICATE_INTERNALS_H */
TransactionId finishedBefore
struct RWConflictPoolHeaderData * RWConflictPoolHeader
struct TwoPhasePredicateRecord TwoPhasePredicateRecord
Definition: lwlock.h:32
struct SERIALIZABLEXID SERIALIZABLEXID
PredicateLockTargetType
uint32 TransactionId
Definition: c.h:508
static void output(uint64 loop_count)
PredicateLockData * GetPredicateLockStatusData(void)
Definition: predicate.c:1376
TransactionId SxactGlobalXmin
struct SERIALIZABLEXIDTAG SERIALIZABLEXIDTAG
struct PREDICATELOCKTARGET PREDICATELOCKTARGET
struct RWConflictData * RWConflict
struct TwoPhasePredicateLockRecord TwoPhasePredicateLockRecord
SERIALIZABLEXACT * xacts
SERIALIZABLEXACT * myXact
TwoPhasePredicateRecordType type
PREDICATELOCKTARGETTAG target
struct RWConflictPoolHeaderData RWConflictPoolHeaderData
SERIALIZABLEXACT * sxactIn
SHM_QUEUE possibleUnsafeConflicts
union SERIALIZABLEXACT::@109 SeqNo
struct PREDICATELOCK PREDICATELOCK
struct PREDICATELOCKTAG PREDICATELOCKTAG
TwoPhasePredicateXactRecord xactRecord
int GetSafeSnapshotBlockingPids(int blocked_pid, int *output, int output_size)
Definition: predicate.c:1561
struct LOCALPREDICATELOCK LOCALPREDICATELOCK
PREDICATELOCKTARGETTAG tag
VirtualTransactionId vxid
TwoPhasePredicateRecordType
unsigned int uint32
Definition: c.h:359
SerCommitSeqNo lastCommitBeforeSnapshot
PREDICATELOCKTARGETTAG * locktags
SerCommitSeqNo commitSeqNo
SerCommitSeqNo HavePartialClearedThrough
PREDICATELOCKTAG tag
SerCommitSeqNo CanPartialClearThrough
SerCommitSeqNo earliestOutConflictCommit
PREDICATELOCKTARGETTAG tag
SerCommitSeqNo commitSeqNo
uint64 SerCommitSeqNo
struct PredicateLockData PredicateLockData
struct PREDICATELOCKTARGETTAG PREDICATELOCKTARGETTAG
SerCommitSeqNo prepareSeqNo
SERIALIZABLEXIDTAG tag
SerCommitSeqNo LastSxactCommitSeqNo
struct TwoPhasePredicateXactRecord TwoPhasePredicateXactRecord
SERIALIZABLEXACT * OldCommittedSxact
struct PredXactListData PredXactListData
struct RWConflictData RWConflictData
SERIALIZABLEXACT * myXact
struct PredXactListElementData * PredXactListElement
struct PredXactListElementData PredXactListElementData
TwoPhasePredicateLockRecord lockRecord
PredXactListElement element
struct SERIALIZABLEXACT SERIALIZABLEXACT
PREDICATELOCKTARGET * myTarget
struct PredXactListData * PredXactList
SERIALIZABLEXACT * sxactOut