PostgreSQL Source Code  git master
partdesc.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * partdesc.c
4  * Support routines for manipulating partition descriptors
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/partitioning/partdesc.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "access/genam.h"
18 #include "access/htup_details.h"
19 #include "access/table.h"
20 #include "catalog/partition.h"
21 #include "catalog/pg_inherits.h"
23 #include "partitioning/partdesc.h"
24 #include "utils/builtins.h"
25 #include "utils/fmgroids.h"
26 #include "utils/hsearch.h"
27 #include "utils/inval.h"
28 #include "utils/lsyscache.h"
29 #include "utils/memutils.h"
30 #include "utils/partcache.h"
31 #include "utils/rel.h"
32 #include "utils/snapmgr.h"
33 #include "utils/syscache.h"
34 
35 typedef struct PartitionDirectoryData
36 {
41 
43 {
48 
50  bool omit_detached);
51 
52 
53 /*
54  * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
55  *
56  * We keep two partdescs in relcache: rd_partdesc includes all partitions
57  * (even those being concurrently marked detached), while rd_partdesc_nodetached
58  * omits (some of) those. We store the pg_inherits.xmin value for the latter,
59  * to determine whether it can be validly reused in each case, since that
60  * depends on the active snapshot.
61  *
62  * Note: we arrange for partition descriptors to not get freed until the
63  * relcache entry's refcount goes to zero (see hacks in RelationClose,
64  * RelationClearRelation, and RelationBuildPartitionDesc). Therefore, even
65  * though we hand back a direct pointer into the relcache entry, it's safe
66  * for callers to continue to use that pointer as long as (a) they hold the
67  * relation open, and (b) they hold a relation lock strong enough to ensure
68  * that the data doesn't become stale.
    *
    * rel must be a partitioned table; this is only checked by an assertion.
    * If neither cached descriptor can be used for this request, a new one is
    * built with RelationBuildPartitionDesc and returned.
69  */
71 RelationGetPartitionDesc(Relation rel, bool omit_detached)
72 {
73  Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
74 
75  /*
76  * If relcache has a partition descriptor, use that. However, we can only
77  * do so when we are asked to include all partitions including detached,
78  * or when we know that there are no detached partitions.
79  *
80  * If there is no active snapshot, detached partitions aren't omitted
81  * either, so we can use the cached descriptor too in that case.
82  */
83  if (likely(rel->rd_partdesc &&
84  (!rel->rd_partdesc->detached_exist || !omit_detached ||
85  !ActiveSnapshotSet())))
86  return rel->rd_partdesc;
87 
88  /*
89  * If we're asked to omit detached partitions, we may be able to use a
90  * cached descriptor too. We determine that based on the pg_inherits.xmin
91  * that was saved alongside that descriptor: if the xmin that was not in
92  * progress for the snapshot the descriptor was built under is also not in
93  * progress for the current active snapshot, then we can use it. Otherwise
94  * build one from scratch.
95  */
96  if (omit_detached &&
99  {
100  Snapshot activesnap;
101 
103  activesnap = GetActiveSnapshot();
104 
105  if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin, activesnap))
106  return rel->rd_partdesc_nodetached;
107  }
108 
109  return RelationBuildPartitionDesc(rel, omit_detached);
110 }
111 
112 /*
113  * RelationBuildPartitionDesc
114  * Form rel's partition descriptor, and store in relcache entry
115  *
116  * Partition descriptor is a complex structure; to avoid complicated logic to
117  * free individual elements whenever the relcache entry is flushed, we give it
118  * its own memory context, a child of CacheMemoryContext, which can easily be
119  * deleted on its own. To avoid leaking memory in that context in case of an
120  * error partway through this function, the context is initially created as a
121  * child of CurTransactionContext and only re-parented to CacheMemoryContext
122  * at the end, when no further errors are possible. Also, we don't make this
123  * context the current context except in very brief code sections, out of fear
124  * that some of our callees allocate memory on their own which would be leaked
125  * permanently.
126  *
127  * As a special case, partition descriptors that are requested to omit
128  * partitions being detached (and which contain such partitions) are transient
129  * and are not associated with the relcache entry. Such descriptors only last
130  * through the requesting Portal, so we use the corresponding memory context
131  * for them.
     *
     * NOTE(review): the code at the end of this function stores the
     * omit-detached descriptor in rel->rd_partdesc_nodetached and its context
     * in rel->rd_pddcxt, which seems at odds with "not associated with the
     * relcache entry" above -- confirm whether that paragraph is stale.
     *
     * Returns the newly built descriptor.
132  */
133 static PartitionDesc
134 RelationBuildPartitionDesc(Relation rel, bool omit_detached)
135 {
136  PartitionDesc partdesc;
137  PartitionBoundInfo boundinfo = NULL;
138  List *inhoids;
139  PartitionBoundSpec **boundspecs = NULL;
140  Oid *oids = NULL;
141  bool *is_leaf = NULL;
142  bool detached_exist;
143  bool is_omit;
144  TransactionId detached_xmin;
145  ListCell *cell;
146  int i,
147  nparts;
148  bool retried = false;
150  MemoryContext new_pdcxt;
151  MemoryContext oldcxt;
152  int *mapping;
153 
154 retry:
155 
156  /*
157  * Get partition oids from pg_inherits. This uses a single snapshot to
158  * fetch the list of children, so while more children may be getting added
159  * or removed concurrently, whatever this function returns will be
160  * accurate as of some well-defined point in time.
161  */
162  detached_exist = false;
163  detached_xmin = InvalidTransactionId;
165  omit_detached, NoLock,
166  &detached_exist,
167  &detached_xmin);
168 
169  nparts = list_length(inhoids);
170 
171  /* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
172  if (nparts > 0)
173  {
174  oids = (Oid *) palloc(nparts * sizeof(Oid));
175  is_leaf = (bool *) palloc(nparts * sizeof(bool));
176  boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
177  }
178 
179  /* Collect bound spec nodes for each partition. */
180  i = 0;
181  foreach(cell, inhoids)
182  {
183  Oid inhrelid = lfirst_oid(cell);
184  HeapTuple tuple;
185  PartitionBoundSpec *boundspec = NULL;
186 
187  /* Try fetching the tuple from the catcache, for speed. */
188  tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(inhrelid));
189  if (HeapTupleIsValid(tuple))
190  {
191  Datum datum;
192  bool isnull;
193 
194  datum = SysCacheGetAttr(RELOID, tuple,
195  Anum_pg_class_relpartbound,
196  &isnull);
197  if (!isnull)
198  boundspec = stringToNode(TextDatumGetCString(datum));
199  ReleaseSysCache(tuple);
200  }
201 
202  /*
203  * Two problems are possible here. First, a concurrent ATTACH
204  * PARTITION might be in the process of adding a new partition, but
205  * the syscache doesn't have it, or its copy of it does not yet have
206  * its relpartbound set. We cannot just AcceptInvalidationMessages(),
207  * because the other process might have already removed itself from
208  * the ProcArray but not yet added its invalidation messages to the
209  * shared queue. We solve this problem by reading pg_class directly
210  * for the desired tuple.
211  *
212  * If the partition recently detached is also dropped, we get no tuple
213  * from the scan. In that case, we also retry, and next time through
214  * here, we don't see that partition anymore.
215  *
216  * The other problem is that DETACH CONCURRENTLY is in the process of
217  * removing a partition, which happens in two steps: first it marks it
218  * as "detach pending", commits, then unsets relpartbound. If
219  * find_inheritance_children_extended included that partition but
220  * below we see that DETACH CONCURRENTLY has reset relpartbound for
221  * it, we'd see an inconsistent view. (The inconsistency is seen
222  * because table_open below reads invalidation messages.) We protect
223  * against this by retrying find_inheritance_children_extended().
224  */
225  if (boundspec == NULL)
226  {
227  Relation pg_class;
228  SysScanDesc scan;
229  ScanKeyData key[1];
230 
231  pg_class = table_open(RelationRelationId, AccessShareLock);
232  ScanKeyInit(&key[0],
233  Anum_pg_class_oid,
234  BTEqualStrategyNumber, F_OIDEQ,
235  ObjectIdGetDatum(inhrelid));
236  scan = systable_beginscan(pg_class, ClassOidIndexId, true,
237  NULL, 1, key);
238 
239  /*
240  * We could get one tuple from the scan (the normal case), or zero
241  * tuples if the table has been dropped meanwhile.
242  */
243  tuple = systable_getnext(scan);
244  if (HeapTupleIsValid(tuple))
245  {
246  Datum datum;
247  bool isnull;
248 
249  datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
250  RelationGetDescr(pg_class), &isnull);
251  if (!isnull)
252  boundspec = stringToNode(TextDatumGetCString(datum));
253  }
254  systable_endscan(scan);
255  table_close(pg_class, AccessShareLock);
256 
257  /*
258  * If we still don't get a relpartbound value (either because
259  * boundspec is null or because there was no tuple), then it must
260  * be because of DETACH CONCURRENTLY. Restart from the top, as
261  * explained above. We only do this once, for two reasons: first,
262  * only one DETACH CONCURRENTLY session could affect us at a time,
263  * since each of them would have to wait for the snapshot under
264  * which this is running; and second, to avoid possible infinite
265  * loops in case of catalog corruption.
266  *
267  * Note that the current memory context is short-lived enough, so
268  * we needn't worry about memory leaks here.
269  */
270  if (!boundspec && !retried)
271  {
273  retried = true;
274  goto retry;
275  }
276  }
277 
278  /* Sanity checks. */
279  if (!boundspec)
280  elog(ERROR, "missing relpartbound for relation %u", inhrelid);
281  if (!IsA(boundspec, PartitionBoundSpec))
282  elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
283 
284  /*
285  * If the PartitionBoundSpec says this is the default partition, its
286  * OID should match pg_partitioned_table.partdefid; if not, the
287  * catalog is corrupt.
288  */
289  if (boundspec->is_default)
290  {
291  Oid partdefid;
292 
294  if (partdefid != inhrelid)
295  elog(ERROR, "expected partdefid %u, but got %u",
296  inhrelid, partdefid);
297  }
298 
299  /* Save results. */
300  oids[i] = inhrelid;
301  is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
302  boundspecs[i] = boundspec;
303  ++i;
304  }
305 
306  /*
307  * Create PartitionBoundInfo and mapping, working in the caller's context.
308  * This could fail, but we haven't done any damage if so.
309  */
310  if (nparts > 0)
311  boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
312 
313  /*
314  * Now build the actual relcache partition descriptor, copying all the
315  * data into a new, small context. As per above comment, we don't make
316  * this a long-lived context until it's finished.
317  */
319  "partition descriptor",
323 
324  partdesc = (PartitionDescData *)
325  MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
326  partdesc->nparts = nparts;
327  partdesc->detached_exist = detached_exist;
328  /* If there are no partitions, the rest of the partdesc can stay zero */
329  if (nparts > 0)
330  {
331  oldcxt = MemoryContextSwitchTo(new_pdcxt);
332  partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
333 
334  /* Initialize caching fields for speeding up ExecFindPartition */
335  partdesc->last_found_datum_index = -1;
336  partdesc->last_found_part_index = -1;
337  partdesc->last_found_count = 0;
338 
339  partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
340  partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
341 
342  /*
343  * Assign OIDs from the original array into mapped indexes of the
344  * result array. The order of OIDs in the former is defined by the
345  * catalog scan that retrieved them, whereas that in the latter is
346  * defined by canonicalized representation of the partition bounds.
347  * Also save leaf-ness of each partition.
348  */
349  for (i = 0; i < nparts; i++)
350  {
351  int index = mapping[i];
352 
353  partdesc->oids[index] = oids[i];
354  partdesc->is_leaf[index] = is_leaf[i];
355  }
356  MemoryContextSwitchTo(oldcxt);
357  }
358 
359  /*
360  * Are we working with the partdesc that omits the detached partition, or
361  * the one that includes it?
362  *
363  * Note that if a partition was found by the catalog's scan to have been
364  * detached, but the pg_inherits tuple saying so was not visible to the
365  * active snapshot (find_inheritance_children_extended will not have set
366  * detached_xmin in that case), we consider there to be no "omittable"
367  * detached partitions.
368  */
369  is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
370  TransactionIdIsValid(detached_xmin);
371 
372  /*
373  * We have a fully valid partdesc. Reparent it so that it has the right
374  * lifespan.
375  */
377 
378  /*
379  * Store it into relcache.
380  *
381  * But first, a kluge: if there's an old context for this type of
382  * descriptor, it contains an old partition descriptor that may still be
383  * referenced somewhere. Preserve it, while not leaking it, by
384  * reattaching it as a child context of the new one. Eventually it will
385  * get dropped by either RelationClose or RelationClearRelation. (We keep
386  * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
387  * detached-partitions in rd_pddcxt.)
388  */
389  if (is_omit)
390  {
391  if (rel->rd_pddcxt != NULL)
392  MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
393  rel->rd_pddcxt = new_pdcxt;
394  rel->rd_partdesc_nodetached = partdesc;
395 
396  /*
397  * For partdescs built excluding detached partitions, which we save
398  * separately, we also record the pg_inherits.xmin of the detached
399  * partition that was omitted; this informs a future potential user of
400  * such a cached partdesc to only use it after cross-checking that the
401  * xmin is indeed visible to the snapshot it is going to be working
402  * with.
403  */
404  Assert(TransactionIdIsValid(detached_xmin));
405  rel->rd_partdesc_nodetached_xmin = detached_xmin;
406  }
407  else
408  {
409  if (rel->rd_pdcxt != NULL)
410  MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
411  rel->rd_pdcxt = new_pdcxt;
412  rel->rd_partdesc = partdesc;
413  }
414 
415  return partdesc;
416 }
417 
418 /*
419  * CreatePartitionDirectory
420  * Create a new partition directory object.
     *
     * The directory and its hash table are allocated in mcxt, which is also
     * remembered in the directory as pdir_mcxt. omit_detached is saved in the
     * directory; PartitionDirectoryLookup passes it to
     * RelationGetPartitionDesc. The caller's current memory context is
     * restored before returning.
421  */
423 CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
424 {
425  MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
426  PartitionDirectory pdir;
427  HASHCTL ctl;
428 
429  pdir = palloc(sizeof(PartitionDirectoryData));
430  pdir->pdir_mcxt = mcxt;
431 
432  ctl.keysize = sizeof(Oid);
433  ctl.entrysize = sizeof(PartitionDirectoryEntry);
434  ctl.hcxt = mcxt;
435 
436  pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
438  pdir->omit_detached = omit_detached;
439 
440  MemoryContextSwitchTo(oldcontext);
441  return pdir;
442 }
443 
444 /*
445  * PartitionDirectoryLookup
446  * Look up the partition descriptor for a relation in the directory.
447  *
448  * The purpose of this function is to ensure that we get the same
449  * PartitionDesc for each relation every time we look it up. In the
450  * face of concurrent DDL, different PartitionDescs may be constructed with
451  * different views of the catalog state, but any single particular OID
452  * will always get the same PartitionDesc for as long as the same
453  * PartitionDirectory is used.
     *
     * Entries are keyed by the relation's OID. The first lookup for an OID
     * fetches the descriptor with RelationGetPartitionDesc, using the
     * directory's omit_detached setting, and remembers it in the hash entry;
     * later lookups return the remembered pointer.
454  */
457 {
459  Oid relid = RelationGetRelid(rel);
460  bool found;
461 
462  pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
463  if (!found)
464  {
465  /*
466  * We must keep a reference count on the relation so that the
467  * PartitionDesc to which we are pointing can't get destroyed.
468  */
470  pde->rel = rel;
471  pde->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
472  Assert(pde->pd != NULL);
473  }
474  return pde->pd;
475 }
476 
477 /*
478  * DestroyPartitionDirectory
479  * Destroy a partition directory.
480  *
481  * Release the reference counts we're holding.
     *
     * Walks every entry of pdir->pdir_hash with a sequential hash scan.
     * NOTE(review): the loop body is not visible in this listing; confirm
     * that it drops the relation reference for each entry.
482  */
483 void
485 {
486  HASH_SEQ_STATUS status;
488 
489  hash_seq_init(&status, pdir->pdir_hash);
490  while ((pde = hash_seq_search(&status)) != NULL)
492 }
493 
494 /*
495  * get_default_oid_from_partdesc
496  *
497  * Given a partition descriptor, return the OID of the default partition, if
498  * one exists; else, return InvalidOid.
     *
     * InvalidOid is also returned when partdesc is NULL or has no boundinfo.
499  */
500 Oid
502 {
503  if (partdesc && partdesc->boundinfo &&
505  return partdesc->oids[partdesc->boundinfo->default_index];
506 
507  return InvalidOid;
508 }
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define likely(x)
Definition: c.h:329
#define Assert(condition)
Definition: c.h:812
uint32 TransactionId
Definition: c.h:606
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:955
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1420
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1385
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
void systable_endscan(SysScanDesc sysscan)
Definition: genam.c:606
HeapTuple systable_getnext(SysScanDesc sysscan)
Definition: genam.c:513
SysScanDesc systable_beginscan(Relation heapRelation, Oid indexId, bool indexOK, Snapshot snapshot, int nkeys, ScanKey key)
Definition: genam.c:387
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
static Datum heap_getattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: htup_details.h:792
void AcceptInvalidationMessages(void)
Definition: inval.c:863
int i
Definition: isn.c:72
#define NoLock
Definition: lockdefs.h:34
#define AccessShareLock
Definition: lockdefs.h:36
char get_rel_relkind(Oid relid)
Definition: lsyscache.c:2003
void MemoryContextSetParent(MemoryContext context, MemoryContext new_parent)
Definition: mcxt.c:637
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1215
MemoryContext CurTransactionContext
Definition: mcxt.c:155
MemoryContext CacheMemoryContext
Definition: mcxt.c:152
void * palloc(Size size)
Definition: mcxt.c:1317
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_SMALL_SIZES
Definition: memutils.h:170
#define MemoryContextCopyAndSetIdentifier(cxt, id)
Definition: memutils.h:101
#define IsA(nodeptr, _type_)
Definition: nodes.h:158
PartitionBoundInfo partition_bounds_create(PartitionBoundSpec **boundspecs, int nparts, PartitionKey key, int **mapping)
Definition: partbounds.c:299
PartitionBoundInfo partition_bounds_copy(PartitionBoundInfo src, PartitionKey key)
Definition: partbounds.c:1002
#define partition_bound_has_default(bi)
Definition: partbounds.h:99
PartitionKey RelationGetPartitionKey(Relation rel)
Definition: partcache.c:51
PartitionDirectory CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
Definition: partdesc.c:423
void DestroyPartitionDirectory(PartitionDirectory pdir)
Definition: partdesc.c:484
PartitionDesc PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
Definition: partdesc.c:456
PartitionDesc RelationGetPartitionDesc(Relation rel, bool omit_detached)
Definition: partdesc.c:71
static PartitionDesc RelationBuildPartitionDesc(Relation rel, bool omit_detached)
Definition: partdesc.c:134
Oid get_default_oid_from_partdesc(PartitionDesc partdesc)
Definition: partdesc.c:501
struct PartitionDirectoryEntry PartitionDirectoryEntry
struct PartitionDirectoryData PartitionDirectoryData
Oid get_default_partition_oid(Oid parentId)
Definition: partition.c:315
List * find_inheritance_children_extended(Oid parentrelId, bool omit_detached, LOCKMODE lockmode, bool *detached_exist, TransactionId *detached_xmin)
Definition: pg_inherits.c:82
static int list_length(const List *l)
Definition: pg_list.h:152
#define lfirst_oid(lc)
Definition: pg_list.h:174
uintptr_t Datum
Definition: postgres.h:64
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
MemoryContextSwitchTo(old_ctx)
tree ctl
Definition: radixtree.h:1855
void * stringToNode(const char *str)
Definition: read.c:90
#define RelationGetRelid(relation)
Definition: rel.h:505
#define RelationGetDescr(relation)
Definition: rel.h:531
#define RelationGetRelationName(relation)
Definition: rel.h:539
void RelationDecrementReferenceCount(Relation rel)
Definition: relcache.c:2163
void RelationIncrementReferenceCount(Relation rel)
Definition: relcache.c:2150
void ScanKeyInit(ScanKey entry, AttrNumber attributeNumber, StrategyNumber strategy, RegProcedure procedure, Datum argument)
Definition: scankey.c:76
bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
Definition: snapmgr.c:1804
bool ActiveSnapshotSet(void)
Definition: snapmgr.c:744
Snapshot GetActiveSnapshot(void)
Definition: snapmgr.c:732
#define BTEqualStrategyNumber
Definition: stratnum.h:31
Definition: dynahash.c:220
Definition: pg_list.h:54
int last_found_datum_index
Definition: partdesc.h:46
PartitionBoundInfo boundinfo
Definition: partdesc.h:38
int last_found_count
Definition: partdesc.h:63
bool detached_exist
Definition: partdesc.h:32
bool * is_leaf
Definition: partdesc.h:35
int last_found_part_index
Definition: partdesc.h:52
MemoryContext pdir_mcxt
Definition: partdesc.c:37
PartitionDesc pd
Definition: partdesc.c:46
MemoryContext rd_pdcxt
Definition: rel.h:131
TransactionId rd_partdesc_nodetached_xmin
Definition: rel.h:144
PartitionDesc rd_partdesc
Definition: rel.h:130
PartitionDesc rd_partdesc_nodetached
Definition: rel.h:134
MemoryContext rd_pddcxt
Definition: rel.h:135
Form_pg_class rd_rel
Definition: rel.h:111
Definition: type.h:96
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:269
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:221
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:600
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:40
#define InvalidTransactionId
Definition: transam.h:31
#define TransactionIdIsValid(xid)
Definition: transam.h:41