PostgreSQL Source Code  git master
partdesc.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * partdesc.c
4  * Support routines for manipulating partition descriptors
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/partitioning/partdesc.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "access/genam.h"
18 #include "access/htup_details.h"
19 #include "access/table.h"
20 #include "catalog/partition.h"
21 #include "catalog/pg_inherits.h"
23 #include "partitioning/partdesc.h"
24 #include "utils/builtins.h"
25 #include "utils/fmgroids.h"
26 #include "utils/hsearch.h"
27 #include "utils/inval.h"
28 #include "utils/lsyscache.h"
29 #include "utils/memutils.h"
30 #include "utils/partcache.h"
31 #include "utils/rel.h"
32 #include "utils/snapmgr.h"
33 #include "utils/syscache.h"
34 
35 typedef struct PartitionDirectoryData
36 {
41 
43 {
48 
50  bool omit_detached);
51 
52 
53 /*
54  * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
55  *
56  * We keep two partdescs in relcache: rd_partdesc includes all partitions
57  * (even those being concurrently marked detached), while rd_partdesc_nodetached
58  * omits (some of) those. We store the pg_inherits.xmin value for the latter,
59  * to determine whether it can be validly reused in each case, since that
60  * depends on the active snapshot.
61  *
62  * Note: we arrange for partition descriptors to not get freed until the
63  * relcache entry's refcount goes to zero (see hacks in RelationClose,
64  * RelationClearRelation, and RelationBuildPartitionDesc). Therefore, even
65  * though we hand back a direct pointer into the relcache entry, it's safe
66  * for callers to continue to use that pointer as long as (a) they hold the
67  * relation open, and (b) they hold a relation lock strong enough to ensure
68  * that the data doesn't become stale.
 *
 * Returns one of the cached descriptors when it is usable for this request;
 * otherwise the descriptor is built by RelationBuildPartitionDesc().
69  */
71 RelationGetPartitionDesc(Relation rel, bool omit_detached)
72 {
73  Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
74 
75  /*
76  * If relcache has a partition descriptor, use that. However, we can only
77  * do so when we are asked to include all partitions including detached,
78  * or when we know that there are no detached partitions.
79  *
80  * If there is no active snapshot, detached partitions aren't omitted
81  * either, so we can use the cached descriptor too in that case.
82  */
83  if (likely(rel->rd_partdesc &&
84  (!rel->rd_partdesc->detached_exist || !omit_detached ||
85  !ActiveSnapshotSet())))
86  return rel->rd_partdesc;
87 
88  /*
89  * If we're asked to omit detached partitions, we may be able to use a
90  * cached descriptor too. We determine that based on the pg_inherits.xmin
91  * that was saved alongside that descriptor: that xmin was not in progress
92  * for the active snapshot the descriptor was built under; if it is also
93  * not in progress for the current active snapshot, then we can use the
94  * cached descriptor. Otherwise build one from scratch.
95  */
96  if (omit_detached &&
99  {
100  Snapshot activesnap;
101 
103  activesnap = GetActiveSnapshot();
104 
105  if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin, activesnap))
106  return rel->rd_partdesc_nodetached;
107  }
108 
109  return RelationBuildPartitionDesc(rel, omit_detached);
110 }
111 
112 /*
113  * RelationBuildPartitionDesc
114  * Form rel's partition descriptor, and store in relcache entry
115  *
116  * Partition descriptor is a complex structure; to avoid complicated logic to
117  * free individual elements whenever the relcache entry is flushed, we give it
118  * its own memory context, a child of CacheMemoryContext, which can easily be
119  * deleted on its own. To avoid leaking memory in that context in case of an
120  * error partway through this function, the context is initially created as a
121  * child of CurTransactionContext and only re-parented to CacheMemoryContext
122  * at the end, when no further errors are possible. Also, we don't make this
123  * context the current context except in very brief code sections, out of fear
124  * that some of our callees allocate memory on their own which would be leaked
125  * permanently.
126  *
127  * As a special case, partition descriptors that are requested to omit
128  * partitions being detached (and which contain such partitions) are kept
129  * apart from the regular descriptor: they are stored in the relcache entry
130  * as rd_partdesc_nodetached (context rd_pddcxt), along with the
131  * pg_inherits.xmin of the omitted partition (rd_partdesc_nodetached_xmin).
 *
 * Returns the newly built descriptor, which has also been stored into rel.
132  */
133 static PartitionDesc
134 RelationBuildPartitionDesc(Relation rel, bool omit_detached)
135 {
136  PartitionDesc partdesc;
137  PartitionBoundInfo boundinfo = NULL;
138  List *inhoids;
139  PartitionBoundSpec **boundspecs = NULL;
140  Oid *oids = NULL;
141  bool *is_leaf = NULL;
142  bool detached_exist;
143  bool is_omit;
144  TransactionId detached_xmin;
145  ListCell *cell;
146  int i,
147  nparts;
148  bool retried = false;
150  MemoryContext new_pdcxt;
151  MemoryContext oldcxt;
152  int *mapping;
153 
154 retry:
155 
156  /*
157  * Get partition oids from pg_inherits. This uses a single snapshot to
158  * fetch the list of children, so while more children may be getting added
159  * or removed concurrently, whatever this function returns will be
160  * accurate as of some well-defined point in time.
161  */
162  detached_exist = false;
163  detached_xmin = InvalidTransactionId;
165  omit_detached, NoLock,
166  &detached_exist,
167  &detached_xmin);
168 
169  nparts = list_length(inhoids);
170 
171  /* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
172  if (nparts > 0)
173  {
174  oids = (Oid *) palloc(nparts * sizeof(Oid));
175  is_leaf = (bool *) palloc(nparts * sizeof(bool));
176  boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
177  }
178 
179  /* Collect bound spec nodes for each partition. */
180  i = 0;
181  foreach(cell, inhoids)
182  {
183  Oid inhrelid = lfirst_oid(cell);
184  HeapTuple tuple;
185  PartitionBoundSpec *boundspec = NULL;
186 
187  /* Try fetching the tuple from the catcache, for speed. */
188  tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(inhrelid));
189  if (HeapTupleIsValid(tuple))
190  {
191  Datum datum;
192  bool isnull;
193 
194  datum = SysCacheGetAttr(RELOID, tuple,
195  Anum_pg_class_relpartbound,
196  &isnull);
197  if (!isnull)
198  boundspec = stringToNode(TextDatumGetCString(datum));
199  ReleaseSysCache(tuple);
200  }
201 
202  /*
203  * Two problems are possible here. First, a concurrent ATTACH
204  * PARTITION might be in the process of adding a new partition, but
205  * the syscache doesn't have it, or its copy of it does not yet have
206  * its relpartbound set. We cannot just AcceptInvalidationMessages(),
207  * because the other process might have already removed itself from
208  * the ProcArray but not yet added its invalidation messages to the
209  * shared queue. We solve this problem by reading pg_class directly
210  * for the desired tuple.
211  *
212  * The other problem is that DETACH CONCURRENTLY is in the process of
213  * removing a partition, which happens in two steps: first it marks it
214  * as "detach pending", commits, then unsets relpartbound. If
215  * find_inheritance_children_extended included that partition but
216  * below we see that DETACH CONCURRENTLY has reset relpartbound for
217  * it, we'd see an inconsistent view. (The inconsistency is seen
218  * because table_open below reads invalidation messages.) We protect
219  * against this by retrying find_inheritance_children_extended().
220  */
221  if (boundspec == NULL)
222  {
223  Relation pg_class;
224  SysScanDesc scan;
225  ScanKeyData key[1];
226  Datum datum;
227  bool isnull;
228 
 /*
  * Note: the scan key array "key" declared in this block shadows the
  * function-level "key" that is passed to partition_bounds_create()
  * and partition_bounds_copy() further down.
  */
229  pg_class = table_open(RelationRelationId, AccessShareLock);
230  ScanKeyInit(&key[0],
231  Anum_pg_class_oid,
232  BTEqualStrategyNumber, F_OIDEQ,
233  ObjectIdGetDatum(inhrelid));
234  scan = systable_beginscan(pg_class, ClassOidIndexId, true,
235  NULL, 1, key);
236  tuple = systable_getnext(scan);
 /*
  * NOTE(review): the tuple returned by systable_getnext() is handed
  * to heap_getattr() without a HeapTupleIsValid() check. If the scan
  * can come back empty (e.g. the child relation was dropped
  * concurrently -- TODO confirm), this would dereference an invalid
  * tuple rather than reach the "missing relpartbound" error below.
  */
237  datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
238  RelationGetDescr(pg_class), &isnull);
239  if (!isnull)
240  boundspec = stringToNode(TextDatumGetCString(datum));
241  systable_endscan(scan);
242  table_close(pg_class, AccessShareLock);
243 
244  /*
245  * If we still don't get a relpartbound value, then it must be
246  * because of DETACH CONCURRENTLY. Restart from the top, as
247  * explained above. We only do this once, for two reasons: first,
248  * only one DETACH CONCURRENTLY session could affect us at a time,
249  * since each of them would have to wait for the snapshot under
250  * which this is running; and second, to avoid possible infinite
251  * loops in case of catalog corruption.
252  *
253  * Note that the current memory context is short-lived enough, so
254  * we needn't worry about memory leaks here.
255  */
256  if (!boundspec && !retried)
257  {
259  retried = true;
260  goto retry;
261  }
262  }
263 
264  /* Sanity checks. */
265  if (!boundspec)
266  elog(ERROR, "missing relpartbound for relation %u", inhrelid);
267  if (!IsA(boundspec, PartitionBoundSpec))
268  elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
269 
270  /*
271  * If the PartitionBoundSpec says this is the default partition, its
272  * OID should match pg_partitioned_table.partdefid; if not, the
273  * catalog is corrupt.
274  */
275  if (boundspec->is_default)
276  {
277  Oid partdefid;
278 
280  if (partdefid != inhrelid)
281  elog(ERROR, "expected partdefid %u, but got %u",
282  inhrelid, partdefid);
283  }
284 
285  /* Save results. */
286  oids[i] = inhrelid;
287  is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
288  boundspecs[i] = boundspec;
289  ++i;
290  }
291 
292  /*
293  * Create PartitionBoundInfo and mapping, working in the caller's context.
294  * This could fail, but we haven't done any damage if so.
295  */
296  if (nparts > 0)
297  boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
298 
299  /*
300  * Now build the actual relcache partition descriptor, copying all the
301  * data into a new, small context. As per above comment, we don't make
302  * this a long-lived context until it's finished.
303  */
305  "partition descriptor",
309 
310  partdesc = (PartitionDescData *)
311  MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
312  partdesc->nparts = nparts;
313  partdesc->detached_exist = detached_exist;
314  /* If there are no partitions, the rest of the partdesc can stay zero */
315  if (nparts > 0)
316  {
317  oldcxt = MemoryContextSwitchTo(new_pdcxt);
318  partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
319 
320  /* Initialize caching fields for speeding up ExecFindPartition */
321  partdesc->last_found_datum_index = -1;
322  partdesc->last_found_part_index = -1;
323  partdesc->last_found_count = 0;
324 
325  partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
326  partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
327 
328  /*
329  * Assign OIDs from the original array into mapped indexes of the
330  * result array. The order of OIDs in the former is defined by the
331  * catalog scan that retrieved them, whereas that in the latter is
332  * defined by canonicalized representation of the partition bounds.
333  * Also save leaf-ness of each partition.
334  */
335  for (i = 0; i < nparts; i++)
336  {
337  int index = mapping[i];
338 
339  partdesc->oids[index] = oids[i];
340  partdesc->is_leaf[index] = is_leaf[i];
341  }
342  MemoryContextSwitchTo(oldcxt);
343  }
344 
345  /*
346  * Are we working with the partdesc that omits the detached partition, or
347  * the one that includes it?
348  *
349  * Note that if a partition was found by the catalog's scan to have been
350  * detached, but the pg_inherits tuple saying so was not visible to the
351  * active snapshot (find_inheritance_children_extended will not have set
352  * detached_xmin in that case), we consider there to be no "omittable"
353  * detached partitions.
354  */
355  is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
356  TransactionIdIsValid(detached_xmin);
357 
358  /*
359  * We have a fully valid partdesc. Reparent it so that it has the right
360  * lifespan.
361  */
363 
364  /*
365  * Store it into relcache.
366  *
367  * But first, a kluge: if there's an old context for this type of
368  * descriptor, it contains an old partition descriptor that may still be
369  * referenced somewhere. Preserve it, while not leaking it, by
370  * reattaching it as a child context of the new one. Eventually it will
371  * get dropped by either RelationClose or RelationClearRelation. (We keep
372  * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
373  * detached-partitions in rd_pddcxt.)
374  */
375  if (is_omit)
376  {
377  if (rel->rd_pddcxt != NULL)
378  MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
379  rel->rd_pddcxt = new_pdcxt;
380  rel->rd_partdesc_nodetached = partdesc;
381 
382  /*
383  * For partdescs built excluding detached partitions, which we save
384  * separately, we also record the pg_inherits.xmin of the detached
385  * partition that was omitted; this informs a future potential user of
386  * such a cached partdesc to only use it after cross-checking that the
387  * xmin is indeed visible to the snapshot it is going to be working
388  * with.
389  */
390  Assert(TransactionIdIsValid(detached_xmin));
391  rel->rd_partdesc_nodetached_xmin = detached_xmin;
392  }
393  else
394  {
395  if (rel->rd_pdcxt != NULL)
396  MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
397  rel->rd_pdcxt = new_pdcxt;
398  rel->rd_partdesc = partdesc;
399  }
400 
401  return partdesc;
402 }
403 
404 /*
405  * CreatePartitionDirectory
406  * Create a new partition directory object.
 *
 * The directory struct is allocated in mcxt, and mcxt is also handed to the
 * hash table as its memory context (ctl.hcxt). omit_detached is remembered
 * in the directory and later passed to RelationGetPartitionDesc() by
 * PartitionDirectoryLookup().
407  */
409 CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
410 {
411  MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
412  PartitionDirectory pdir;
413  HASHCTL ctl;
414 
415  pdir = palloc(sizeof(PartitionDirectoryData));
416  pdir->pdir_mcxt = mcxt;
417 
 /* Hash table keyed by an Oid, holding PartitionDirectoryEntry items. */
418  ctl.keysize = sizeof(Oid);
419  ctl.entrysize = sizeof(PartitionDirectoryEntry);
420  ctl.hcxt = mcxt;
421 
422  pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
424  pdir->omit_detached = omit_detached;
425 
 /* Restore the caller's memory context before returning. */
426  MemoryContextSwitchTo(oldcontext);
427  return pdir;
428 }
429 
430 /*
431  * PartitionDirectoryLookup
432  * Look up the partition descriptor for a relation in the directory.
433  *
434  * The purpose of this function is to ensure that we get the same
435  * PartitionDesc for each relation every time we look it up. In the
436  * face of concurrent DDL, different PartitionDescs may be constructed with
437  * different views of the catalog state, but any single particular OID
438  * will always get the same PartitionDesc for as long as the same
439  * PartitionDirectory is used.
 *
 * On the first lookup of a relation, the descriptor obtained from
 * RelationGetPartitionDesc() is saved in a new hash entry keyed by the
 * relation's OID; subsequent lookups return that saved pointer.
440  */
443 {
445  Oid relid = RelationGetRelid(rel);
446  bool found;
447 
 /* HASH_ENTER: "found" reports whether an entry for relid already existed. */
448  pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
449  if (!found)
450  {
451  /*
452  * We must keep a reference count on the relation so that the
453  * PartitionDesc to which we are pointing can't get destroyed.
454  */
456  pde->rel = rel;
457  pde->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
458  Assert(pde->pd != NULL);
459  }
460  return pde->pd;
461 }
462 
463 /*
464  * DestroyPartitionDirectory
465  * Destroy a partition directory.
466  *
467  * Release the reference counts we're holding.
468  */
469 void
471 {
472  HASH_SEQ_STATUS status;
474 
 /*
  * Visit every entry in the directory's hash table. The loop body is not
  * visible in this listing; presumably it drops the relation reference
  * mentioned in PartitionDirectoryLookup -- TODO confirm.
  */
475  hash_seq_init(&status, pdir->pdir_hash);
476  while ((pde = hash_seq_search(&status)) != NULL)
478 }
479 
480 /*
481  * get_default_oid_from_partdesc
482  *
483  * Given a partition descriptor, return the OID of the default partition, if
484  * one exists; else, return InvalidOid.
 *
 * A NULL partdesc, or one whose boundinfo is not set, also yields
 * InvalidOid.
485  */
486 Oid
488 {
489  if (partdesc && partdesc->boundinfo &&
 /* default_index is used as an index into the descriptor's oids[] array */
491  return partdesc->oids[partdesc->boundinfo->default_index];
492 
493  return InvalidOid;
494 }
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define likely(x)
Definition: c.h:310
#define Assert(condition)
Definition: c.h:858
uint32 TransactionId
Definition: c.h:652
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:955
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1395
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1385
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
void systable_endscan(SysScanDesc sysscan)
Definition: genam.c:596
HeapTuple systable_getnext(SysScanDesc sysscan)
Definition: genam.c:503
SysScanDesc systable_beginscan(Relation heapRelation, Oid indexId, bool indexOK, Snapshot snapshot, int nkeys, ScanKey key)
Definition: genam.c:384
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
static Datum heap_getattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: htup_details.h:792
void AcceptInvalidationMessages(void)
Definition: inval.c:806
int i
Definition: isn.c:73
#define NoLock
Definition: lockdefs.h:34
#define AccessShareLock
Definition: lockdefs.h:36
char get_rel_relkind(Oid relid)
Definition: lsyscache.c:2003
void MemoryContextSetParent(MemoryContext context, MemoryContext new_parent)
Definition: mcxt.c:637
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1215
MemoryContext CurTransactionContext
Definition: mcxt.c:155
MemoryContext CacheMemoryContext
Definition: mcxt.c:152
void * palloc(Size size)
Definition: mcxt.c:1317
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_SMALL_SIZES
Definition: memutils.h:170
#define MemoryContextCopyAndSetIdentifier(cxt, id)
Definition: memutils.h:101
#define IsA(nodeptr, _type_)
Definition: nodes.h:158
PartitionBoundInfo partition_bounds_create(PartitionBoundSpec **boundspecs, int nparts, PartitionKey key, int **mapping)
Definition: partbounds.c:299
PartitionBoundInfo partition_bounds_copy(PartitionBoundInfo src, PartitionKey key)
Definition: partbounds.c:1002
#define partition_bound_has_default(bi)
Definition: partbounds.h:99
PartitionKey RelationGetPartitionKey(Relation rel)
Definition: partcache.c:51
PartitionDirectory CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
Definition: partdesc.c:409
void DestroyPartitionDirectory(PartitionDirectory pdir)
Definition: partdesc.c:470
PartitionDesc PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
Definition: partdesc.c:442
PartitionDesc RelationGetPartitionDesc(Relation rel, bool omit_detached)
Definition: partdesc.c:71
static PartitionDesc RelationBuildPartitionDesc(Relation rel, bool omit_detached)
Definition: partdesc.c:134
Oid get_default_oid_from_partdesc(PartitionDesc partdesc)
Definition: partdesc.c:487
struct PartitionDirectoryEntry PartitionDirectoryEntry
struct PartitionDirectoryData PartitionDirectoryData
Oid get_default_partition_oid(Oid parentId)
Definition: partition.c:315
List * find_inheritance_children_extended(Oid parentrelId, bool omit_detached, LOCKMODE lockmode, bool *detached_exist, TransactionId *detached_xmin)
Definition: pg_inherits.c:82
static int list_length(const List *l)
Definition: pg_list.h:152
#define lfirst_oid(lc)
Definition: pg_list.h:174
uintptr_t Datum
Definition: postgres.h:64
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
MemoryContextSwitchTo(old_ctx)
tree ctl
Definition: radixtree.h:1853
void * stringToNode(const char *str)
Definition: read.c:90
#define RelationGetRelid(relation)
Definition: rel.h:505
#define RelationGetDescr(relation)
Definition: rel.h:531
#define RelationGetRelationName(relation)
Definition: rel.h:539
void RelationDecrementReferenceCount(Relation rel)
Definition: relcache.c:2172
void RelationIncrementReferenceCount(Relation rel)
Definition: relcache.c:2159
void ScanKeyInit(ScanKey entry, AttrNumber attributeNumber, StrategyNumber strategy, RegProcedure procedure, Datum argument)
Definition: scankey.c:76
bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
Definition: snapmgr.c:1856
bool ActiveSnapshotSet(void)
Definition: snapmgr.c:782
Snapshot GetActiveSnapshot(void)
Definition: snapmgr.c:770
#define BTEqualStrategyNumber
Definition: stratnum.h:31
Definition: dynahash.c:220
Definition: pg_list.h:54
int last_found_datum_index
Definition: partdesc.h:46
PartitionBoundInfo boundinfo
Definition: partdesc.h:38
int last_found_count
Definition: partdesc.h:63
bool detached_exist
Definition: partdesc.h:32
bool * is_leaf
Definition: partdesc.h:35
int last_found_part_index
Definition: partdesc.h:52
MemoryContext pdir_mcxt
Definition: partdesc.c:37
PartitionDesc pd
Definition: partdesc.c:46
MemoryContext rd_pdcxt
Definition: rel.h:131
TransactionId rd_partdesc_nodetached_xmin
Definition: rel.h:144
PartitionDesc rd_partdesc
Definition: rel.h:130
PartitionDesc rd_partdesc_nodetached
Definition: rel.h:134
MemoryContext rd_pddcxt
Definition: rel.h:135
Form_pg_class rd_rel
Definition: rel.h:111
Definition: type.h:95
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:266
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:218
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:479
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:40
#define InvalidTransactionId
Definition: transam.h:31
#define TransactionIdIsValid(xid)
Definition: transam.h:41