PostgreSQL Source Code  git master
partdesc.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * partdesc.c
4  * Support routines for manipulating partition descriptors
5  *
6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/partitioning/partdesc.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "access/genam.h"
18 #include "access/htup_details.h"
19 #include "access/table.h"
20 #include "catalog/partition.h"
21 #include "catalog/pg_inherits.h"
23 #include "partitioning/partdesc.h"
24 #include "storage/bufmgr.h"
25 #include "storage/sinval.h"
26 #include "utils/builtins.h"
27 #include "utils/fmgroids.h"
28 #include "utils/hsearch.h"
29 #include "utils/inval.h"
30 #include "utils/lsyscache.h"
31 #include "utils/memutils.h"
32 #include "utils/partcache.h"
33 #include "utils/rel.h"
34 #include "utils/syscache.h"
35 
36 typedef struct PartitionDirectoryData
37 {
42 
44 {
49 
51  bool omit_detached);
52 
53 
54 /*
55  * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
56  *
57  * We keep two partdescs in relcache: rd_partdesc includes all partitions
58  * (even those being concurrently marked detached), while rd_partdesc_nodetached
59  * omits (some of) those. We store the pg_inherits.xmin value for the latter,
60  * to determine whether it can be validly reused in each case, since that
61  * depends on the active snapshot.
62  *
63  * Note: we arrange for partition descriptors to not get freed until the
64  * relcache entry's refcount goes to zero (see hacks in RelationClose,
65  * RelationClearRelation, and RelationBuildPartitionDesc). Therefore, even
66  * though we hand back a direct pointer into the relcache entry, it's safe
67  * for callers to continue to use that pointer as long as (a) they hold the
68  * relation open, and (b) they hold a relation lock strong enough to ensure
69  * that the data doesn't become stale.
70  */
72 RelationGetPartitionDesc(Relation rel, bool omit_detached)
73 {
74  Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
75 
76  /*
77  * If relcache has a partition descriptor, use that. However, we can only
78  * do so when we are asked to include all partitions including detached;
79  * and also when we know that there are no detached partitions.
80  *
81  * If there is no active snapshot, detached partitions aren't omitted
82  * either, so we can use the cached descriptor too in that case.
83  */
84  if (likely(rel->rd_partdesc &&
85  (!rel->rd_partdesc->detached_exist || !omit_detached ||
86  !ActiveSnapshotSet())))
87  return rel->rd_partdesc;
88 
89  /*
90  * If we're asked to omit detached partitions, we may be able to use a
91  * cached descriptor too. We determine that based on the pg_inherits.xmin
92  * that was saved alongside that descriptor: if the xmin that was not in
93  * progress for that active snapshot is also not in progress for the
94  * current active snapshot, then we can use it. Otherwise build one from
95  * scratch.
96  */
97  if (omit_detached &&
100  {
101  Snapshot activesnap;
102 
104  activesnap = GetActiveSnapshot();
105 
106  if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin, activesnap))
107  return rel->rd_partdesc_nodetached;
108  }
109 
110  return RelationBuildPartitionDesc(rel, omit_detached);
111 }
112 
113 /*
114  * RelationBuildPartitionDesc
115  * Form rel's partition descriptor, and store in relcache entry
116  *
117  * Partition descriptor is a complex structure; to avoid complicated logic to
118  * free individual elements whenever the relcache entry is flushed, we give it
119  * its own memory context, a child of CacheMemoryContext, which can easily be
120  * deleted on its own. To avoid leaking memory in that context in case of an
121  * error partway through this function, the context is initially created as a
122  * child of CurTransactionContext and only re-parented to CacheMemoryContext
123  * at the end, when no further errors are possible. Also, we don't make this
124  * context the current context except in very brief code sections, out of fear
125  * that some of our callees allocate memory on their own which would be leaked
126  * permanently.
127  *
128  * As a special case, partition descriptors that are requested to omit
129  * partitions being detached (and which contain such partitions) are transient
130  * and are not associated with the relcache entry. Such descriptors only last
131  * through the requesting Portal, so we use the corresponding memory context
132  * for them.
133  */
134 static PartitionDesc
135 RelationBuildPartitionDesc(Relation rel, bool omit_detached)
136 {
137  PartitionDesc partdesc;
138  PartitionBoundInfo boundinfo = NULL;
139  List *inhoids;
140  PartitionBoundSpec **boundspecs = NULL;
141  Oid *oids = NULL;
142  bool *is_leaf = NULL;
143  bool detached_exist;
144  bool is_omit;
145  TransactionId detached_xmin;
146  ListCell *cell;
147  int i,
148  nparts;
150  MemoryContext new_pdcxt;
151  MemoryContext oldcxt;
152  int *mapping;
153 
154  /*
155  * Get partition oids from pg_inherits. This uses a single snapshot to
156  * fetch the list of children, so while more children may be getting added
157  * concurrently, whatever this function returns will be accurate as of
158  * some well-defined point in time.
159  */
160  detached_exist = false;
161  detached_xmin = InvalidTransactionId;
163  omit_detached, NoLock,
164  &detached_exist,
165  &detached_xmin);
166 
167  nparts = list_length(inhoids);
168 
169  /* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
170  if (nparts > 0)
171  {
172  oids = (Oid *) palloc(nparts * sizeof(Oid));
173  is_leaf = (bool *) palloc(nparts * sizeof(bool));
174  boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
175  }
176 
177  /* Collect bound spec nodes for each partition. */
178  i = 0;
179  foreach(cell, inhoids)
180  {
181  Oid inhrelid = lfirst_oid(cell);
182  HeapTuple tuple;
183  PartitionBoundSpec *boundspec = NULL;
184 
185  /* Try fetching the tuple from the catcache, for speed. */
186  tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(inhrelid));
187  if (HeapTupleIsValid(tuple))
188  {
189  Datum datum;
190  bool isnull;
191 
192  datum = SysCacheGetAttr(RELOID, tuple,
193  Anum_pg_class_relpartbound,
194  &isnull);
195  if (!isnull)
196  boundspec = stringToNode(TextDatumGetCString(datum));
197  ReleaseSysCache(tuple);
198  }
199 
200  /*
201  * The system cache may be out of date; if so, we may find no pg_class
202  * tuple or an old one where relpartbound is NULL. In that case, try
203  * the table directly. We can't just AcceptInvalidationMessages() and
204  * retry the system cache lookup because it's possible that a
205  * concurrent ATTACH PARTITION operation has removed itself from the
206  * ProcArray but not yet added invalidation messages to the shared
207  * queue; InvalidateSystemCaches() would work, but seems excessive.
208  *
209  * Note that this algorithm assumes that PartitionBoundSpec we manage
210  * to fetch is the right one -- so this is only good enough for
211  * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
212  * some hypothetical operation that changes the partition bounds.
213  */
214  if (boundspec == NULL)
215  {
216  Relation pg_class;
217  SysScanDesc scan;
218  ScanKeyData key[1];
219  Datum datum;
220  bool isnull;
221 
222  pg_class = table_open(RelationRelationId, AccessShareLock);
223  ScanKeyInit(&key[0],
224  Anum_pg_class_oid,
225  BTEqualStrategyNumber, F_OIDEQ,
226  ObjectIdGetDatum(inhrelid));
227  scan = systable_beginscan(pg_class, ClassOidIndexId, true,
228  NULL, 1, key);
229  tuple = systable_getnext(scan);
230  datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
231  RelationGetDescr(pg_class), &isnull);
232  if (!isnull)
233  boundspec = stringToNode(TextDatumGetCString(datum));
234  systable_endscan(scan);
235  table_close(pg_class, AccessShareLock);
236  }
237 
238  /* Sanity checks. */
239  if (!boundspec)
240  elog(ERROR, "missing relpartbound for relation %u", inhrelid);
241  if (!IsA(boundspec, PartitionBoundSpec))
242  elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
243 
244  /*
245  * If the PartitionBoundSpec says this is the default partition, its
246  * OID should match pg_partitioned_table.partdefid; if not, the
247  * catalog is corrupt.
248  */
249  if (boundspec->is_default)
250  {
251  Oid partdefid;
252 
254  if (partdefid != inhrelid)
255  elog(ERROR, "expected partdefid %u, but got %u",
256  inhrelid, partdefid);
257  }
258 
259  /* Save results. */
260  oids[i] = inhrelid;
261  is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
262  boundspecs[i] = boundspec;
263  ++i;
264  }
265 
266  /*
267  * Create PartitionBoundInfo and mapping, working in the caller's context.
268  * This could fail, but we haven't done any damage if so.
269  */
270  if (nparts > 0)
271  boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
272 
273  /*
274  * Now build the actual relcache partition descriptor, copying all the
275  * data into a new, small context. As per above comment, we don't make
276  * this a long-lived context until it's finished.
277  */
279  "partition descriptor",
283 
284  partdesc = (PartitionDescData *)
285  MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
286  partdesc->nparts = nparts;
287  partdesc->detached_exist = detached_exist;
288  /* If there are no partitions, the rest of the partdesc can stay zero */
289  if (nparts > 0)
290  {
291  oldcxt = MemoryContextSwitchTo(new_pdcxt);
292  partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
293 
294  /* Initialize caching fields for speeding up ExecFindPartition */
295  partdesc->last_found_datum_index = -1;
296  partdesc->last_found_part_index = -1;
297  partdesc->last_found_count = 0;
298 
299  partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
300  partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
301 
302  /*
303  * Assign OIDs from the original array into mapped indexes of the
304  * result array. The order of OIDs in the former is defined by the
305  * catalog scan that retrieved them, whereas that in the latter is
306  * defined by canonicalized representation of the partition bounds.
307  * Also save leaf-ness of each partition.
308  */
309  for (i = 0; i < nparts; i++)
310  {
311  int index = mapping[i];
312 
313  partdesc->oids[index] = oids[i];
314  partdesc->is_leaf[index] = is_leaf[i];
315  }
316  MemoryContextSwitchTo(oldcxt);
317  }
318 
319  /*
320  * Are we working with the partdesc that omits the detached partition, or
321  * the one that includes it?
322  *
323  * Note that if a partition was found by the catalog's scan to have been
324  * detached, but the pg_inherit tuple saying so was not visible to the
325  * active snapshot (find_inheritance_children_extended will not have set
326  * detached_xmin in that case), we consider there to be no "omittable"
327  * detached partitions.
328  */
329  is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
330  TransactionIdIsValid(detached_xmin);
331 
332  /*
333  * We have a fully valid partdesc. Reparent it so that it has the right
334  * lifespan.
335  */
337 
338  /*
339  * Store it into relcache.
340  *
341  * But first, a kluge: if there's an old context for this type of
342  * descriptor, it contains an old partition descriptor that may still be
343  * referenced somewhere. Preserve it, while not leaking it, by
344  * reattaching it as a child context of the new one. Eventually it will
345  * get dropped by either RelationClose or RelationClearRelation. (We keep
346  * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
347  * detached-partitions in rd_pddcxt.)
348  */
349  if (is_omit)
350  {
351  if (rel->rd_pddcxt != NULL)
352  MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
353  rel->rd_pddcxt = new_pdcxt;
354  rel->rd_partdesc_nodetached = partdesc;
355 
356  /*
357  * For partdescs built excluding detached partitions, which we save
358  * separately, we also record the pg_inherits.xmin of the detached
359  * partition that was omitted; this informs a future potential user of
360  * such a cached partdesc to only use it after cross-checking that the
361  * xmin is indeed visible to the snapshot it is going to be working
362  * with.
363  */
364  Assert(TransactionIdIsValid(detached_xmin));
365  rel->rd_partdesc_nodetached_xmin = detached_xmin;
366  }
367  else
368  {
369  if (rel->rd_pdcxt != NULL)
370  MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
371  rel->rd_pdcxt = new_pdcxt;
372  rel->rd_partdesc = partdesc;
373  }
374 
375  return partdesc;
376 }
377 
378 /*
379  * CreatePartitionDirectory
380  * Create a new partition directory object.
381  */
383 CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
384 {
385  MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
386  PartitionDirectory pdir;
387  HASHCTL ctl;
388 
389  pdir = palloc(sizeof(PartitionDirectoryData));
390  pdir->pdir_mcxt = mcxt;
391 
392  ctl.keysize = sizeof(Oid);
393  ctl.entrysize = sizeof(PartitionDirectoryEntry);
394  ctl.hcxt = mcxt;
395 
396  pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
398  pdir->omit_detached = omit_detached;
399 
400  MemoryContextSwitchTo(oldcontext);
401  return pdir;
402 }
403 
404 /*
405  * PartitionDirectoryLookup
406  * Look up the partition descriptor for a relation in the directory.
407  *
408  * The purpose of this function is to ensure that we get the same
409  * PartitionDesc for each relation every time we look it up. In the
410  * face of concurrent DDL, different PartitionDescs may be constructed with
411  * different views of the catalog state, but any single particular OID
412  * will always get the same PartitionDesc for as long as the same
413  * PartitionDirectory is used.
414  */
417 {
419  Oid relid = RelationGetRelid(rel);
420  bool found;
421 
422  pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
423  if (!found)
424  {
425  /*
426  * We must keep a reference count on the relation so that the
427  * PartitionDesc to which we are pointing can't get destroyed.
428  */
430  pde->rel = rel;
431  pde->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
432  Assert(pde->pd != NULL);
433  }
434  return pde->pd;
435 }
436 
437 /*
438  * DestroyPartitionDirectory
439  * Destroy a partition directory.
440  *
441  * Release the reference counts we're holding.
442  */
443 void
445 {
446  HASH_SEQ_STATUS status;
448 
449  hash_seq_init(&status, pdir->pdir_hash);
450  while ((pde = hash_seq_search(&status)) != NULL)
452 }
453 
454 /*
455  * get_default_oid_from_partdesc
456  *
457  * Given a partition descriptor, return the OID of the default partition, if
458  * one exists; else, return InvalidOid.
459  */
460 Oid
462 {
463  if (partdesc && partdesc->boundinfo &&
465  return partdesc->oids[partdesc->boundinfo->default_index];
466 
467  return InvalidOid;
468 }
#define TextDatumGetCString(d)
Definition: builtins.h:95
#define likely(x)
Definition: c.h:299
uint32 TransactionId
Definition: c.h:641
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:953
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:350
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1431
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1421
#define ERROR
Definition: elog.h:39
void systable_endscan(SysScanDesc sysscan)
Definition: genam.c:599
HeapTuple systable_getnext(SysScanDesc sysscan)
Definition: genam.c:506
SysScanDesc systable_beginscan(Relation heapRelation, Oid indexId, bool indexOK, Snapshot snapshot, int nkeys, ScanKey key)
Definition: genam.c:387
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
static Datum heap_getattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: htup_details.h:792
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
#define NoLock
Definition: lockdefs.h:34
#define AccessShareLock
Definition: lockdefs.h:36
char get_rel_relkind(Oid relid)
Definition: lsyscache.c:2007
void MemoryContextSetParent(MemoryContext context, MemoryContext new_parent)
Definition: mcxt.c:546
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1064
MemoryContext CurTransactionContext
Definition: mcxt.c:147
MemoryContext CacheMemoryContext
Definition: mcxt.c:144
void * palloc(Size size)
Definition: mcxt.c:1226
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_SMALL_SIZES
Definition: memutils.h:163
#define MemoryContextCopyAndSetIdentifier(cxt, id)
Definition: memutils.h:101
#define IsA(nodeptr, _type_)
Definition: nodes.h:179
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:138
PartitionBoundInfo partition_bounds_create(PartitionBoundSpec **boundspecs, int nparts, PartitionKey key, int **mapping)
Definition: partbounds.c:300
PartitionBoundInfo partition_bounds_copy(PartitionBoundInfo src, PartitionKey key)
Definition: partbounds.c:1003
#define partition_bound_has_default(bi)
Definition: partbounds.h:99
PartitionKey RelationGetPartitionKey(Relation rel)
Definition: partcache.c:54
PartitionDirectory CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
Definition: partdesc.c:383
void DestroyPartitionDirectory(PartitionDirectory pdir)
Definition: partdesc.c:444
PartitionDesc PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
Definition: partdesc.c:416
PartitionDesc RelationGetPartitionDesc(Relation rel, bool omit_detached)
Definition: partdesc.c:72
static PartitionDesc RelationBuildPartitionDesc(Relation rel, bool omit_detached)
Definition: partdesc.c:135
Oid get_default_oid_from_partdesc(PartitionDesc partdesc)
Definition: partdesc.c:461
struct PartitionDirectoryEntry PartitionDirectoryEntry
struct PartitionDirectoryData PartitionDirectoryData
Oid get_default_partition_oid(Oid parentId)
Definition: partition.c:314
List * find_inheritance_children_extended(Oid parentrelId, bool omit_detached, LOCKMODE lockmode, bool *detached_exist, TransactionId *detached_xmin)
Definition: pg_inherits.c:83
static int list_length(const List *l)
Definition: pg_list.h:152
#define lfirst_oid(lc)
Definition: pg_list.h:174
uintptr_t Datum
Definition: postgres.h:64
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
void * stringToNode(const char *str)
Definition: read.c:90
#define RelationGetRelid(relation)
Definition: rel.h:504
#define RelationGetDescr(relation)
Definition: rel.h:530
#define RelationGetRelationName(relation)
Definition: rel.h:538
void RelationDecrementReferenceCount(Relation rel)
Definition: relcache.c:2140
void RelationIncrementReferenceCount(Relation rel)
Definition: relcache.c:2127
void ScanKeyInit(ScanKey entry, AttrNumber attributeNumber, StrategyNumber strategy, RegProcedure procedure, Datum argument)
Definition: scankey.c:76
bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
Definition: snapmgr.c:1831
bool ActiveSnapshotSet(void)
Definition: snapmgr.c:763
Snapshot GetActiveSnapshot(void)
Definition: snapmgr.c:751
#define BTEqualStrategyNumber
Definition: stratnum.h:31
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
MemoryContext hcxt
Definition: hsearch.h:86
Definition: dynahash.c:220
Definition: pg_list.h:54
int last_found_datum_index
Definition: partdesc.h:46
PartitionBoundInfo boundinfo
Definition: partdesc.h:38
int last_found_count
Definition: partdesc.h:63
bool detached_exist
Definition: partdesc.h:32
bool * is_leaf
Definition: partdesc.h:35
int last_found_part_index
Definition: partdesc.h:52
MemoryContext pdir_mcxt
Definition: partdesc.c:38
PartitionDesc pd
Definition: partdesc.c:47
MemoryContext rd_pdcxt
Definition: rel.h:131
TransactionId rd_partdesc_nodetached_xmin
Definition: rel.h:144
PartitionDesc rd_partdesc
Definition: rel.h:130
PartitionDesc rd_partdesc_nodetached
Definition: rel.h:134
MemoryContext rd_pddcxt
Definition: rel.h:135
Form_pg_class rd_rel
Definition: rel.h:111
Definition: type.h:95
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:868
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:820
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:1081
@ RELOID
Definition: syscache.h:89
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:40
#define InvalidTransactionId
Definition: transam.h:31
#define TransactionIdIsValid(xid)
Definition: transam.h:41