PostgreSQL Source Code git master
partdesc.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * partdesc.c
4 * Support routines for manipulating partition descriptors
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/partitioning/partdesc.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include "access/genam.h"
18#include "access/htup_details.h"
19#include "access/table.h"
20#include "catalog/partition.h"
21#include "catalog/pg_inherits.h"
24#include "utils/builtins.h"
25#include "utils/fmgroids.h"
26#include "utils/hsearch.h"
27#include "utils/inval.h"
28#include "utils/lsyscache.h"
29#include "utils/memutils.h"
30#include "utils/partcache.h"
31#include "utils/rel.h"
32#include "utils/snapmgr.h"
33#include "utils/syscache.h"
34
36{
41
43{
48
50 bool omit_detached);
51
52
53/*
54 * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
55 *
56 * We keep two partdescs in relcache: rd_partdesc includes all partitions
57 * (even those being concurrently marked detached), while rd_partdesc_nodetached
58 * omits (some of) those. We store the pg_inherits.xmin value for the latter,
59 * to determine whether it can be validly reused in each case, since that
60 * depends on the active snapshot.
61 *
62 * Note: we arrange for partition descriptors to not get freed until the
63 * relcache entry's refcount goes to zero (see hacks in RelationClose,
64 * RelationClearRelation, and RelationBuildPartitionDesc). Therefore, even
65 * though we hand back a direct pointer into the relcache entry, it's safe
66 * for callers to continue to use that pointer as long as (a) they hold the
67 * relation open, and (b) they hold a relation lock strong enough to ensure
68 * that the data doesn't become stale.
69 */
71RelationGetPartitionDesc(Relation rel, bool omit_detached)
72{
73 Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
74
75 /*
76 * If relcache has a partition descriptor, use that. However, we can only
77 * do so when we are asked to include all partitions, including detached
78 * ones, or when we know that there are no detached partitions.
79 *
80 * If there is no active snapshot, detached partitions aren't omitted
81 * either, so we can use the cached descriptor too in that case.
82 */
83 if (likely(rel->rd_partdesc &&
84 (!rel->rd_partdesc->detached_exist || !omit_detached ||
86 return rel->rd_partdesc;
87
88 /*
89 * If we're asked to omit detached partitions, we may be able to use a
90 * cached descriptor too. We determine that based on the pg_inherits.xmin
91 * that was saved alongside that descriptor: if that xmin, which was not
92 * in progress for the snapshot active when the descriptor was built, is
93 * also not in progress for the current active snapshot, then we can use
94 * it. Otherwise build one from scratch.
95 */
96 if (omit_detached &&
99 {
100 Snapshot activesnap;
101
103 activesnap = GetActiveSnapshot();
104
105 if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin, activesnap))
106 return rel->rd_partdesc_nodetached;
107 }
108
109 return RelationBuildPartitionDesc(rel, omit_detached);
110}
111
112/*
113 * RelationBuildPartitionDesc
114 * Form rel's partition descriptor, and store in relcache entry
115 *
116 * Partition descriptor is a complex structure; to avoid complicated logic to
117 * free individual elements whenever the relcache entry is flushed, we give it
118 * its own memory context, a child of CacheMemoryContext, which can easily be
119 * deleted on its own. To avoid leaking memory in that context in case of an
120 * error partway through this function, the context is initially created as a
121 * child of CurTransactionContext and only re-parented to CacheMemoryContext
122 * at the end, when no further errors are possible. Also, we don't make this
123 * context the current context except in very brief code sections, out of fear
124 * that some of our callees allocate memory on their own which would be leaked
125 * permanently.
126 *
127 * As a special case, partition descriptors that are requested to omit
128 * partitions being detached (and which contain such partitions) are kept
129 * apart from the regular descriptor: they are stored in the relcache entry
130 * as rd_partdesc_nodetached, in their own context rd_pddcxt, together with
131 * the pg_inherits.xmin of the omitted partition.
132 */
133static PartitionDesc
134RelationBuildPartitionDesc(Relation rel, bool omit_detached)
135{
136 PartitionDesc partdesc;
137 PartitionBoundInfo boundinfo = NULL;
138 List *inhoids;
139 PartitionBoundSpec **boundspecs = NULL;
140 Oid *oids = NULL;
141 bool *is_leaf = NULL;
142 bool detached_exist;
143 bool is_omit;
144 TransactionId detached_xmin;
145 ListCell *cell;
146 int i,
147 nparts;
148 bool retried = false;
150 MemoryContext new_pdcxt;
151 MemoryContext oldcxt;
152 int *mapping;
153
154retry:
155
156 /*
157 * Get partition oids from pg_inherits. This uses a single snapshot to
158 * fetch the list of children, so while more children may be getting added
159 * or removed concurrently, whatever this function returns will be
160 * accurate as of some well-defined point in time.
161 */
162 detached_exist = false;
163 detached_xmin = InvalidTransactionId;
165 omit_detached, NoLock,
166 &detached_exist,
167 &detached_xmin);
168
169 nparts = list_length(inhoids);
170
171 /* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
172 if (nparts > 0)
173 {
174 oids = (Oid *) palloc(nparts * sizeof(Oid));
175 is_leaf = (bool *) palloc(nparts * sizeof(bool));
176 boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
177 }
178
179 /* Collect bound spec nodes for each partition. */
180 i = 0;
181 foreach(cell, inhoids)
182 {
183 Oid inhrelid = lfirst_oid(cell);
184 HeapTuple tuple;
185 PartitionBoundSpec *boundspec = NULL;
186
187 /* Try fetching the tuple from the catcache, for speed. */
188 tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(inhrelid));
189 if (HeapTupleIsValid(tuple))
190 {
191 Datum datum;
192 bool isnull;
193
194 datum = SysCacheGetAttr(RELOID, tuple,
195 Anum_pg_class_relpartbound,
196 &isnull);
197 if (!isnull)
198 boundspec = stringToNode(TextDatumGetCString(datum));
199 ReleaseSysCache(tuple);
200 }
201
202 /*
203 * Two problems are possible here. First, a concurrent ATTACH
204 * PARTITION might be in the process of adding a new partition, but
205 * the syscache doesn't have it, or its copy of it does not yet have
206 * its relpartbound set. We cannot just AcceptInvalidationMessages(),
207 * because the other process might have already removed itself from
208 * the ProcArray but not yet added its invalidation messages to the
209 * shared queue. We solve this problem by reading pg_class directly
210 * for the desired tuple.
211 *
212 * If the partition recently detached is also dropped, we get no tuple
213 * from the scan. In that case, we also retry, and next time through
214 * here, we don't see that partition anymore.
215 *
216 * The other problem is that DETACH CONCURRENTLY is in the process of
217 * removing a partition, which happens in two steps: first it marks it
218 * as "detach pending", commits, then unsets relpartbound. If
219 * find_inheritance_children_extended included that partition, but
220 * below we see that DETACH CONCURRENTLY has reset relpartbound for
221 * it, we'd see an inconsistent view. (The inconsistency is seen
222 * because table_open below reads invalidation messages.) We protect
223 * against this by retrying find_inheritance_children_extended().
224 */
225 if (boundspec == NULL)
226 {
227 Relation pg_class;
228 SysScanDesc scan;
229 ScanKeyData key[1];
230
231 pg_class = table_open(RelationRelationId, AccessShareLock);
232 ScanKeyInit(&key[0],
233 Anum_pg_class_oid,
234 BTEqualStrategyNumber, F_OIDEQ,
235 ObjectIdGetDatum(inhrelid));
236 scan = systable_beginscan(pg_class, ClassOidIndexId, true,
237 NULL, 1, key);
238
239 /*
240 * We could get one tuple from the scan (the normal case), or zero
241 * tuples if the table has been dropped meanwhile.
242 */
243 tuple = systable_getnext(scan);
244 if (HeapTupleIsValid(tuple))
245 {
246 Datum datum;
247 bool isnull;
248
249 datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
250 RelationGetDescr(pg_class), &isnull);
251 if (!isnull)
252 boundspec = stringToNode(TextDatumGetCString(datum));
253 }
254 systable_endscan(scan);
255 table_close(pg_class, AccessShareLock);
256
257 /*
258 * If we still don't get a relpartbound value (either because
259 * boundspec is null or because there was no tuple), then it must
260 * be because of DETACH CONCURRENTLY. Restart from the top, as
261 * explained above. We only do this once, for two reasons: first,
262 * only one DETACH CONCURRENTLY session could affect us at a time,
263 * since each of them would have to wait for the snapshot under
264 * which this is running; and second, to avoid possible infinite
265 * loops in case of catalog corruption.
266 *
267 * Note that the current memory context is short-lived enough, so
268 * we needn't worry about memory leaks here.
269 */
270 if (!boundspec && !retried)
271 {
273 retried = true;
274 goto retry;
275 }
276 }
277
278 /* Sanity checks. */
279 if (!boundspec)
280 elog(ERROR, "missing relpartbound for relation %u", inhrelid);
281 if (!IsA(boundspec, PartitionBoundSpec))
282 elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
283
284 /*
285 * If the PartitionBoundSpec says this is the default partition, its
286 * OID should match pg_partitioned_table.partdefid; if not, the
287 * catalog is corrupt.
288 */
289 if (boundspec->is_default)
290 {
291 Oid partdefid;
292
294 if (partdefid != inhrelid)
295 elog(ERROR, "expected partdefid %u, but got %u",
296 inhrelid, partdefid);
297 }
298
299 /* Save results. */
300 oids[i] = inhrelid;
301 is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
302 boundspecs[i] = boundspec;
303 ++i;
304 }
305
306 /*
307 * Create PartitionBoundInfo and mapping, working in the caller's context.
308 * This could fail, but we haven't done any damage if so.
309 */
310 if (nparts > 0)
311 boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
312
313 /*
314 * Now build the actual relcache partition descriptor, copying all the
315 * data into a new, small context. As per above comment, we don't make
316 * this a long-lived context until it's finished.
317 */
319 "partition descriptor",
323
324 partdesc = (PartitionDescData *)
325 MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
326 partdesc->nparts = nparts;
327 partdesc->detached_exist = detached_exist;
328 /* If there are no partitions, the rest of the partdesc can stay zero */
329 if (nparts > 0)
330 {
331 oldcxt = MemoryContextSwitchTo(new_pdcxt);
332 partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
333
334 /* Initialize caching fields for speeding up ExecFindPartition */
335 partdesc->last_found_datum_index = -1;
336 partdesc->last_found_part_index = -1;
337 partdesc->last_found_count = 0;
338
339 partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
340 partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
341
342 /*
343 * Assign OIDs from the original array into mapped indexes of the
344 * result array. The order of OIDs in the former is defined by the
345 * catalog scan that retrieved them, whereas that in the latter is
346 * defined by canonicalized representation of the partition bounds.
347 * Also save leaf-ness of each partition.
348 */
349 for (i = 0; i < nparts; i++)
350 {
351 int index = mapping[i];
352
353 partdesc->oids[index] = oids[i];
354 partdesc->is_leaf[index] = is_leaf[i];
355 }
356 MemoryContextSwitchTo(oldcxt);
357 }
358
359 /*
360 * Are we working with the partdesc that omits the detached partition, or
361 * the one that includes it?
362 *
363 * Note that if a partition was found by the catalog's scan to have been
364 * detached, but the pg_inherits tuple saying so was not visible to the
365 * active snapshot (find_inheritance_children_extended will not have set
366 * detached_xmin in that case), we consider there to be no "omittable"
367 * detached partitions.
368 */
369 is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
370 TransactionIdIsValid(detached_xmin);
371
372 /*
373 * We have a fully valid partdesc. Reparent it so that it has the right
374 * lifespan.
375 */
377
378 /*
379 * Store it into relcache.
380 *
381 * But first, a kluge: if there's an old context for this type of
382 * descriptor, it contains an old partition descriptor that may still be
383 * referenced somewhere. Preserve it, while not leaking it, by
384 * reattaching it as a child context of the new one. Eventually it will
385 * get dropped by either RelationClose or RelationClearRelation. (We keep
386 * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
387 * detached-partitions in rd_pddcxt.)
388 */
389 if (is_omit)
390 {
391 if (rel->rd_pddcxt != NULL)
392 MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
393 rel->rd_pddcxt = new_pdcxt;
394 rel->rd_partdesc_nodetached = partdesc;
395
396 /*
397 * For partdescs built excluding detached partitions, which we save
398 * separately, we also record the pg_inherits.xmin of the detached
399 * partition that was omitted; this informs a future potential user of
400 * such a cached partdesc to only use it after cross-checking that the
401 * xmin is indeed visible to the snapshot it is going to be working
402 * with.
403 */
404 Assert(TransactionIdIsValid(detached_xmin));
405 rel->rd_partdesc_nodetached_xmin = detached_xmin;
406 }
407 else
408 {
409 if (rel->rd_pdcxt != NULL)
410 MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
411 rel->rd_pdcxt = new_pdcxt;
412 rel->rd_partdesc = partdesc;
413 }
414
415 return partdesc;
416}
417
418/*
419 * CreatePartitionDirectory
420 * Create a new partition directory object.
421 */
424{
425 MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
427 HASHCTL ctl;
428
429 pdir = palloc(sizeof(PartitionDirectoryData));
430 pdir->pdir_mcxt = mcxt;
431
432 ctl.keysize = sizeof(Oid);
433 ctl.entrysize = sizeof(PartitionDirectoryEntry);
434 ctl.hcxt = mcxt;
435
436 pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
438 pdir->omit_detached = omit_detached;
439
440 MemoryContextSwitchTo(oldcontext);
441 return pdir;
442}
443
444/*
445 * PartitionDirectoryLookup
446 * Look up the partition descriptor for a relation in the directory.
447 *
448 * The purpose of this function is to ensure that we get the same
449 * PartitionDesc for each relation every time we look it up. In the
450 * face of concurrent DDL, different PartitionDescs may be constructed with
451 * different views of the catalog state, but any single particular OID
452 * will always get the same PartitionDesc for as long as the same
453 * PartitionDirectory is used.
454 */
457{
459 Oid relid = RelationGetRelid(rel);
460 bool found;
461
462 pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
463 if (!found)
464 {
465 /*
466 * We must keep a reference count on the relation so that the
467 * PartitionDesc to which we are pointing can't get destroyed.
468 */
470 pde->rel = rel;
471 pde->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
472 Assert(pde->pd != NULL);
473 }
474 return pde->pd;
475}
476
477/*
478 * DestroyPartitionDirectory
479 * Destroy a partition directory.
480 *
481 * Release the reference counts we're holding.
482 */
483void
485{
486 HASH_SEQ_STATUS status;
488
489 hash_seq_init(&status, pdir->pdir_hash);
490 while ((pde = hash_seq_search(&status)) != NULL)
492}
493
494/*
495 * get_default_oid_from_partdesc
496 *
497 * Given a partition descriptor, return the OID of the default partition, if
498 * one exists; else, return InvalidOid.
499 */
500Oid
502{
503 if (partdesc && partdesc->boundinfo &&
505 return partdesc->oids[partdesc->boundinfo->default_index];
506
507 return InvalidOid;
508}
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define likely(x)
Definition: c.h:346
uint32 TransactionId
Definition: c.h:623
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:955
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1420
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1385
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
void systable_endscan(SysScanDesc sysscan)
Definition: genam.c:603
HeapTuple systable_getnext(SysScanDesc sysscan)
Definition: genam.c:514
SysScanDesc systable_beginscan(Relation heapRelation, Oid indexId, bool indexOK, Snapshot snapshot, int nkeys, ScanKey key)
Definition: genam.c:388
Assert(PointerIsAligned(start, uint64))
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
static Datum heap_getattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: htup_details.h:903
void AcceptInvalidationMessages(void)
Definition: inval.c:929
int i
Definition: isn.c:74
#define NoLock
Definition: lockdefs.h:34
#define AccessShareLock
Definition: lockdefs.h:36
char get_rel_relkind(Oid relid)
Definition: lsyscache.c:2086
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1215
void MemoryContextSetParent(MemoryContext context, MemoryContext new_parent)
Definition: mcxt.c:637
void * palloc(Size size)
Definition: mcxt.c:1317
MemoryContext CurTransactionContext
Definition: mcxt.c:155
MemoryContext CacheMemoryContext
Definition: mcxt.c:152
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_SMALL_SIZES
Definition: memutils.h:170
#define MemoryContextCopyAndSetIdentifier(cxt, id)
Definition: memutils.h:101
#define IsA(nodeptr, _type_)
Definition: nodes.h:160
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
PartitionBoundInfo partition_bounds_create(PartitionBoundSpec **boundspecs, int nparts, PartitionKey key, int **mapping)
Definition: partbounds.c:299
PartitionBoundInfo partition_bounds_copy(PartitionBoundInfo src, PartitionKey key)
Definition: partbounds.c:1002
#define partition_bound_has_default(bi)
Definition: partbounds.h:99
PartitionKey RelationGetPartitionKey(Relation rel)
Definition: partcache.c:51
PartitionDirectory CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
Definition: partdesc.c:423
void DestroyPartitionDirectory(PartitionDirectory pdir)
Definition: partdesc.c:484
PartitionDesc PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
Definition: partdesc.c:456
PartitionDesc RelationGetPartitionDesc(Relation rel, bool omit_detached)
Definition: partdesc.c:71
static PartitionDesc RelationBuildPartitionDesc(Relation rel, bool omit_detached)
Definition: partdesc.c:134
Oid get_default_oid_from_partdesc(PartitionDesc partdesc)
Definition: partdesc.c:501
struct PartitionDirectoryEntry PartitionDirectoryEntry
struct PartitionDirectoryData PartitionDirectoryData
Oid get_default_partition_oid(Oid parentId)
Definition: partition.c:315
List * find_inheritance_children_extended(Oid parentrelId, bool omit_detached, LOCKMODE lockmode, bool *detached_exist, TransactionId *detached_xmin)
Definition: pg_inherits.c:82
static int list_length(const List *l)
Definition: pg_list.h:152
#define lfirst_oid(lc)
Definition: pg_list.h:174
uintptr_t Datum
Definition: postgres.h:69
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:257
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
tree ctl
Definition: radixtree.h:1838
void * stringToNode(const char *str)
Definition: read.c:90
#define RelationGetRelid(relation)
Definition: rel.h:513
#define RelationGetDescr(relation)
Definition: rel.h:539
#define RelationGetRelationName(relation)
Definition: rel.h:547
void RelationDecrementReferenceCount(Relation rel)
Definition: relcache.c:2158
void RelationIncrementReferenceCount(Relation rel)
Definition: relcache.c:2145
void ScanKeyInit(ScanKey entry, AttrNumber attributeNumber, StrategyNumber strategy, RegProcedure procedure, Datum argument)
Definition: scankey.c:76
bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
Definition: snapmgr.c:1859
bool ActiveSnapshotSet(void)
Definition: snapmgr.c:799
Snapshot GetActiveSnapshot(void)
Definition: snapmgr.c:787
#define BTEqualStrategyNumber
Definition: stratnum.h:31
Definition: dynahash.c:220
Definition: pg_list.h:54
int last_found_datum_index
Definition: partdesc.h:46
PartitionBoundInfo boundinfo
Definition: partdesc.h:38
int last_found_count
Definition: partdesc.h:63
bool detached_exist
Definition: partdesc.h:32
bool * is_leaf
Definition: partdesc.h:35
int last_found_part_index
Definition: partdesc.h:52
MemoryContext pdir_mcxt
Definition: partdesc.c:37
PartitionDesc pd
Definition: partdesc.c:46
MemoryContext rd_pdcxt
Definition: rel.h:131
TransactionId rd_partdesc_nodetached_xmin
Definition: rel.h:144
PartitionDesc rd_partdesc
Definition: rel.h:130
PartitionDesc rd_partdesc_nodetached
Definition: rel.h:134
MemoryContext rd_pddcxt
Definition: rel.h:135
Form_pg_class rd_rel
Definition: rel.h:111
Definition: type.h:96
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:269
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:221
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:600
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:40
#define InvalidTransactionId
Definition: transam.h:31
#define TransactionIdIsValid(xid)
Definition: transam.h:41