brin.c
1/*
2 * brin.c
3 * Implementation of BRIN indexes for Postgres
4 *
5 * See src/backend/access/brin/README for details.
6 *
7 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/access/brin/brin.c
12 *
13 * TODO
14 * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
15 */
16#include "postgres.h"
17
18#include "access/brin.h"
19#include "access/brin_page.h"
20#include "access/brin_pageops.h"
21#include "access/brin_xlog.h"
22#include "access/relation.h"
23#include "access/reloptions.h"
24#include "access/relscan.h"
25#include "access/table.h"
26#include "access/tableam.h"
27#include "access/xloginsert.h"
28#include "catalog/index.h"
29#include "catalog/pg_am.h"
30#include "commands/vacuum.h"
31#include "miscadmin.h"
32#include "pgstat.h"
33#include "postmaster/autovacuum.h"
34#include "storage/bufmgr.h"
35#include "storage/freespace.h"
36#include "tcop/tcopprot.h"
37#include "utils/acl.h"
38#include "utils/datum.h"
39#include "utils/fmgrprotos.h"
40#include "utils/guc.h"
41#include "utils/index_selfuncs.h"
42#include "utils/memutils.h"
43#include "utils/rel.h"
44#include "utils/tuplesort.h"
45
46/* Magic numbers for parallel state sharing */
47#define PARALLEL_KEY_BRIN_SHARED UINT64CONST(0xB000000000000001)
48#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
49#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
50#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
51#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
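/*
 * Editor's note (illustrative sketch, not verbatim brin.c code): these keys
 * identify the chunks that the leader of a parallel build publishes in the
 * dynamic shared memory table-of-contents and that workers later look up,
 * roughly like:
 *
 *     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BRIN_SHARED, brinshared);
 *     ...
 *     brinshared = shm_toc_lookup(toc, PARALLEL_KEY_BRIN_SHARED, false);
 */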
52
53/*
54 * Status for index builds performed in parallel. This is allocated in a
55 * dynamic shared memory segment.
56 */
57typedef struct BrinShared
58{
59 /*
60 * These fields are not modified during the build. They primarily exist
61 * for the benefit of worker processes that need to create state
62 * corresponding to that used by the leader.
63 */
69
70 /* Query ID, for report in worker processes */
72
73 /*
74 * workersdonecv is used to monitor the progress of workers. All parallel
75 * participants must indicate that they are done before leader can use
76 * results built by the workers (and before leader can write the data into
77 * the index).
78 */
80
81 /*
82 * mutex protects all fields before heapdesc.
83 *
84 * These fields contain status information of interest to BRIN index
85 * builds that must work just the same when an index is built in parallel.
86 */
87 slock_t mutex;
88
89 /*
90 * Mutable state that is maintained by workers, and reported back to
91 * leader at end of the scans.
92 *
93 * nparticipantsdone is number of worker processes finished.
94 *
95 * reltuples is the total number of input heap tuples.
96 *
97 * indtuples is the total number of tuples that made it into the index.
98 */
99 int nparticipantsdone;
100 double reltuples;
101 double indtuples;
102
103 /*
104 * ParallelTableScanDescData data follows. Can't directly embed here, as
105 * implementations of the parallel table scan desc interface might need
106 * stronger alignment.
107 */
108} BrinShared;
109
110/*
111 * Return pointer to a BrinShared's parallel table scan.
112 *
113 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
114 * MAXALIGN.
115 */
116#define ParallelTableScanFromBrinShared(shared) \
117 (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BrinShared)))
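/*
 * Editor's sketch (assumption, not verbatim brin.c code): the DSM chunk that
 * holds BrinShared is estimated so that the parallel table scan descriptor
 * starts at a buffer-aligned offset, along the lines of
 *
 *     Size estshared = add_size(BUFFERALIGN(sizeof(BrinShared)),
 *                               table_parallelscan_estimate(heap, snapshot));
 *     shm_toc_estimate_chunk(&pcxt->estimator, estshared);
 *
 * which is why the macro above skips a BUFFERALIGN'd header rather than a
 * MAXALIGN'd one.
 */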
118
119/*
120 * Status for leader in parallel index build.
121 */
122typedef struct BrinLeader
123{
124 /* parallel context itself */
126
127 /*
128 * nparticipanttuplesorts is the exact number of worker processes
129 * successfully launched, plus one leader process if it participates as a
130 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
131 * participating as a worker).
132 */
134
135 /*
136 * Leader process convenience pointers to shared state (leader avoids TOC
137 * lookups).
138 *
139 * brinshared is the shared state for entire build. sharedsort is the
140 * shared, tuplesort-managed state passed to each process tuplesort.
141 * snapshot is the snapshot used by the scan iff an MVCC snapshot is
142 * required.
143 */
144 BrinShared *brinshared;
145 Sharedsort *sharedsort;
146 Snapshot snapshot;
147 WalUsage *walusage;
148 BufferUsage *bufferusage;
149} BrinLeader;
150
151/*
152 * We use a BrinBuildState during initial construction of a BRIN index.
153 * The running state is kept in a BrinMemTuple.
154 */
155typedef struct BrinBuildState
156{
167
171
172 /*
173 * bs_leader is only present when a parallel index build is performed, and
174 * only in the leader process. (Actually, only the leader process has a
175 * BrinBuildState.)
176 */
179
180 /*
181 * The sortstate is used by workers (including the leader). It has to be
182 * part of the build state, because that's the only thing passed to the
183 * build callback etc.
184 */
185 Tuplesortstate *bs_sortstate;
186} BrinBuildState;
187
188/*
189 * We use a BrinInsertState to capture running state spanning multiple
190 * brininsert invocations, within the same command.
191 */
192typedef struct BrinInsertState
193{
194 BrinRevmap *bis_rmAccess;
195 BrinDesc *bis_desc;
196 BlockNumber bis_pages_per_range;
197} BrinInsertState;
198
199/*
200 * Struct used as "opaque" during index scans
201 */
202typedef struct BrinOpaque
203{
204 BlockNumber bo_pagesPerRange;
205 BrinRevmap *bo_rmAccess;
206 BrinDesc *bo_bdesc;
207} BrinOpaque;
208
209#define BRIN_ALL_BLOCKRANGES InvalidBlockNumber
210
211static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
212 BrinRevmap *revmap,
213 BlockNumber pagesPerRange,
214 BlockNumber tablePages);
217static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
218 bool include_partial, double *numSummarized, double *numExisting);
221static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
222 BrinTuple *b);
223static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
224static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
225 BrinMemTuple *dtup, const Datum *values, const bool *nulls);
226static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
227static void brin_fill_empty_ranges(BrinBuildState *state,
228 BlockNumber prevRange, BlockNumber nextRange);
229
230/* parallel index builds */
231static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
232 bool isconcurrent, int request);
233static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state);
238 Relation heap, Relation index);
240 BrinShared *brinshared,
241 Sharedsort *sharedsort,
242 Relation heap, Relation index,
243 int sortmem, bool progress);
244
245/*
246 * BRIN handler function: return IndexAmRoutine with access method parameters
247 * and callbacks.
248 */
249Datum
250brinhandler(PG_FUNCTION_ARGS)
251{
252 IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
253
254 amroutine->amstrategies = 0;
257 amroutine->amcanorder = false;
258 amroutine->amcanorderbyop = false;
259 amroutine->amcanbackward = false;
260 amroutine->amcanunique = false;
261 amroutine->amcanmulticol = true;
262 amroutine->amoptionalkey = true;
263 amroutine->amsearcharray = false;
264 amroutine->amsearchnulls = true;
265 amroutine->amstorage = true;
266 amroutine->amclusterable = false;
267 amroutine->ampredlocks = false;
268 amroutine->amcanparallel = false;
269 amroutine->amcanbuildparallel = true;
270 amroutine->amcaninclude = false;
271 amroutine->amusemaintenanceworkmem = false;
272 amroutine->amsummarizing = true;
273 amroutine->amparallelvacuumoptions =
274 VACUUM_OPTION_PARALLEL_CLEANUP;
275 amroutine->amkeytype = InvalidOid;
276
277 amroutine->ambuild = brinbuild;
278 amroutine->ambuildempty = brinbuildempty;
279 amroutine->aminsert = brininsert;
280 amroutine->aminsertcleanup = brininsertcleanup;
281 amroutine->ambulkdelete = brinbulkdelete;
282 amroutine->amvacuumcleanup = brinvacuumcleanup;
283 amroutine->amcanreturn = NULL;
284 amroutine->amcostestimate = brincostestimate;
285 amroutine->amgettreeheight = NULL;
286 amroutine->amoptions = brinoptions;
287 amroutine->amproperty = NULL;
288 amroutine->ambuildphasename = NULL;
289 amroutine->amvalidate = brinvalidate;
290 amroutine->amadjustmembers = NULL;
291 amroutine->ambeginscan = brinbeginscan;
292 amroutine->amrescan = brinrescan;
293 amroutine->amgettuple = NULL;
294 amroutine->amgetbitmap = bringetbitmap;
295 amroutine->amendscan = brinendscan;
296 amroutine->ammarkpos = NULL;
297 amroutine->amrestrpos = NULL;
298 amroutine->amestimateparallelscan = NULL;
299 amroutine->aminitparallelscan = NULL;
300 amroutine->amparallelrescan = NULL;
301
302 PG_RETURN_POINTER(amroutine);
303}
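/*
 * Editor's example (illustrative; the table and index are hypothetical): the
 * properties set above surface at the SQL level when a BRIN index is built,
 * e.g.
 *
 *     CREATE INDEX ON measurements USING brin (ts)
 *         WITH (pages_per_range = 64, autosummarize = on);
 *
 * amcanmulticol allows multiple key columns, amsearchnulls allows IS [NOT]
 * NULL quals, and since amgettuple is NULL the index is usable only through
 * bitmap scans (amgetbitmap = bringetbitmap).
 */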
304
305/*
306 * Initialize a BrinInsertState to maintain state to be used across multiple
307 * tuple inserts, within the same command.
308 */
309static BrinInsertState *
310initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
311{
312 BrinInsertState *bistate;
313 MemoryContext oldcxt;
314
315 oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
316 bistate = palloc0(sizeof(BrinInsertState));
317 bistate->bis_desc = brin_build_desc(idxRel);
318 bistate->bis_rmAccess = brinRevmapInitialize(idxRel,
319 &bistate->bis_pages_per_range);
320 indexInfo->ii_AmCache = bistate;
321 MemoryContextSwitchTo(oldcxt);
322
323 return bistate;
324}
325
326/*
327 * A tuple in the heap is being inserted. To keep a brin index up to date,
328 * we need to obtain the relevant index tuple and compare its stored values
329 * with those of the new tuple. If the tuple values are not consistent with
330 * the summary tuple, we need to update the index tuple.
331 *
332 * If autosummarization is enabled, check if we need to summarize the previous
333 * page range.
334 *
335 * If the range is not currently summarized (i.e. the revmap returns NULL for
336 * it), there's nothing to do for this tuple.
337 */
338bool
339brininsert(Relation idxRel, Datum *values, bool *nulls,
340 ItemPointer heaptid, Relation heapRel,
341 IndexUniqueCheck checkUnique,
342 bool indexUnchanged,
343 IndexInfo *indexInfo)
344{
345 BlockNumber pagesPerRange;
346 BlockNumber origHeapBlk;
347 BlockNumber heapBlk;
348 BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
349 BrinRevmap *revmap;
350 BrinDesc *bdesc;
351 Buffer buf = InvalidBuffer;
352 MemoryContext tupcxt = NULL;
353 MemoryContext oldcxt = CurrentMemoryContext;
354 bool autosummarize = BrinGetAutoSummarize(idxRel);
355
356 /*
357 * If first time through in this statement, initialize the insert state
358 * that we keep for all the inserts in the command.
359 */
360 if (!bistate)
361 bistate = initialize_brin_insertstate(idxRel, indexInfo);
362
363 revmap = bistate->bis_rmAccess;
364 bdesc = bistate->bis_desc;
365 pagesPerRange = bistate->bis_pages_per_range;
366
367 /*
368 * origHeapBlk is the block number where the insertion occurred. heapBlk
369 * is the first block in the corresponding page range.
370 */
371 origHeapBlk = ItemPointerGetBlockNumber(heaptid);
372 heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
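 /*
 * Editor's worked example: with pages_per_range = 128, an insertion into
 * heap block 300 gives heapBlk = (300 / 128) * 128 = 256, i.e. the summary
 * tuple to consider is the one covering blocks 256..383.
 */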
373
374 for (;;)
375 {
376 bool need_insert = false;
377 OffsetNumber off;
378 BrinTuple *brtup;
379 BrinMemTuple *dtup;
380
382
383 /*
384 * If auto-summarization is enabled and we just inserted the first
385 * tuple into the first block of a new non-first page range, request a
386 * summarization run of the previous range.
387 */
388 if (autosummarize &&
389 heapBlk > 0 &&
390 heapBlk == origHeapBlk &&
391 ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)
392 {
393 BlockNumber lastPageRange = heapBlk - 1;
394 BrinTuple *lastPageTuple;
395
396 lastPageTuple =
397 brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,
398 NULL, BUFFER_LOCK_SHARE);
399 if (!lastPageTuple)
400 {
401 bool recorded;
402
403 recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange,
404 RelationGetRelid(idxRel),
405 lastPageRange);
406 if (!recorded)
407 ereport(LOG,
408 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
409 errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",
410 RelationGetRelationName(idxRel),
411 lastPageRange)));
412 }
413 else
414 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
415 }
416
417 brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
418 NULL, BUFFER_LOCK_SHARE);
419
420 /* if range is unsummarized, there's nothing to do */
421 if (!brtup)
422 break;
423
424 /* First time through in this brininsert call? */
425 if (tupcxt == NULL)
426 {
428 "brininsert cxt",
430 MemoryContextSwitchTo(tupcxt);
431 }
432
433 dtup = brin_deform_tuple(bdesc, brtup, NULL);
434
435 need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls);
436
437 if (!need_insert)
438 {
439 /*
440 * The tuple is consistent with the new values, so there's nothing
441 * to do.
442 */
443 break;
444 }
445 else
446 {
447 Page page = BufferGetPage(buf);
448 ItemId lp = PageGetItemId(page, off);
449 Size origsz;
450 BrinTuple *origtup;
451 Size newsz;
452 BrinTuple *newtup;
453 bool samepage;
454
455 /*
456 * Make a copy of the old tuple, so that we can compare it after
457 * re-acquiring the lock.
458 */
459 origsz = ItemIdGetLength(lp);
460 origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);
461
462 /*
463 * Before releasing the lock, check if we can attempt a same-page
464 * update. Another process could insert a tuple concurrently in
465 * the same page though, so downstream we must be prepared to cope
466 * if this turns out to not be possible after all.
467 */
468 newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
469 samepage = brin_can_do_samepage_update(buf, origsz, newsz);
470 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
471
472 /*
473 * Try to update the tuple. If this doesn't work for whatever
474 * reason, we need to restart from the top; the revmap might be
475 * pointing at a different tuple for this block now, so we need to
476 * recompute to ensure both our new heap tuple and the other
477 * inserter's are covered by the combined tuple. It might be that
478 * we don't need to update at all.
479 */
480 if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
481 buf, off, origtup, origsz, newtup, newsz,
482 samepage))
483 {
484 /* no luck; start over */
485 MemoryContextReset(tupcxt);
486 continue;
487 }
488 }
489
490 /* success! */
491 break;
492 }
493
494 if (BufferIsValid(buf))
495 ReleaseBuffer(buf);
496 MemoryContextSwitchTo(oldcxt);
497 if (tupcxt != NULL)
498 MemoryContextDelete(tupcxt);
499
500 return false;
501}
502
503/*
504 * Callback to clean up the BrinInsertState once all tuple inserts are done.
505 */
506void
507brininsertcleanup(Relation index, IndexInfo *indexInfo)
508{
509 BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
510
511 /* bail out if cache not initialized */
512 if (indexInfo->ii_AmCache == NULL)
513 return;
514
515 /*
516 * Clean up the revmap. Note that the brinDesc has already been cleaned up
517 * as part of its own memory context.
518 */
519 brinRevmapTerminate(bistate->bis_rmAccess);
520 bistate->bis_rmAccess = NULL;
521 bistate->bis_desc = NULL;
522}
523
524/*
525 * Initialize state for a BRIN index scan.
526 *
527 * We read the metapage here to determine the pages-per-range number that this
528 * index was built with. Note that since this cannot be changed while we're
529 * holding lock on index, it's not necessary to recompute it during brinrescan.
530 */
532brinbeginscan(Relation r, int nkeys, int norderbys)
533{
534 IndexScanDesc scan;
535 BrinOpaque *opaque;
536
537 scan = RelationGetIndexScan(r, nkeys, norderbys);
538
539 opaque = palloc_object(BrinOpaque);
540 opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange);
541 opaque->bo_bdesc = brin_build_desc(r);
542 scan->opaque = opaque;
543
544 return scan;
545}
546
547/*
548 * Execute the index scan.
549 *
550 * This works by reading index TIDs from the revmap, and obtaining the index
551 * tuples pointed to by them; the summary values in the index tuples are
552 * compared to the scan keys. We return into the TID bitmap all the pages in
553 * ranges corresponding to index tuples that match the scan keys.
554 *
555 * If a TID from the revmap is read as InvalidTID, we know that range is
556 * unsummarized. Pages in those ranges need to be returned regardless of scan
557 * keys.
558 */
559int64
560bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
561{
562 Relation idxRel = scan->indexRelation;
563 Buffer buf = InvalidBuffer;
564 BrinDesc *bdesc;
565 Oid heapOid;
566 Relation heapRel;
567 BrinOpaque *opaque;
568 BlockNumber nblocks;
569 BlockNumber heapBlk;
570 int64 totalpages = 0;
571 FmgrInfo *consistentFn;
572 MemoryContext oldcxt;
573 MemoryContext perRangeCxt;
574 BrinMemTuple *dtup;
575 BrinTuple *btup = NULL;
576 Size btupsz = 0;
577 ScanKey **keys,
578 **nullkeys;
579 int *nkeys,
580 *nnullkeys;
581 char *ptr;
582 Size len;
583 char *tmp PG_USED_FOR_ASSERTS_ONLY;
584
585 opaque = (BrinOpaque *) scan->opaque;
586 bdesc = opaque->bo_bdesc;
588
589 /*
590 * We need to know the size of the table so that we know how long to
591 * iterate on the revmap.
592 */
593 heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
594 heapRel = table_open(heapOid, AccessShareLock);
595 nblocks = RelationGetNumberOfBlocks(heapRel);
596 table_close(heapRel, AccessShareLock);
597
598 /*
599 * Make room for the consistent support procedures of indexed columns. We
600 * don't look them up here; we do that lazily the first time we see a scan
601 * key reference each of them. We rely on zeroing fn_oid to InvalidOid.
602 */
603 consistentFn = palloc0_array(FmgrInfo, bdesc->bd_tupdesc->natts);
604
605 /*
606 * Make room for per-attribute lists of scan keys that we'll pass to the
607 * consistent support procedure. We don't know which attributes have scan
608 * keys, so we allocate space for all attributes. That may use more memory
609 * but it's probably cheaper than determining which attributes are used.
610 *
611 * We keep null and regular keys separate, so that we can pass just the
612 * regular keys to the consistent function easily.
613 *
614 * To reduce the allocation overhead, we allocate one big chunk and then
615 * carve it into smaller arrays ourselves. All the pieces have exactly the
616 * same lifetime, so that's OK.
617 *
618 * XXX The widest index can have 32 attributes, so the amount of wasted
619 * memory is negligible. We could invent a more compact approach (with
620 * just space for used attributes) but that would make the matching more
621 * complex so it's not a good trade-off.
622 */
623 len =
624 MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* regular keys */
625 MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
626 MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) +
627 MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* NULL keys */
628 MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
629 MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
630
631 ptr = palloc(len);
632 tmp = ptr;
633
634 keys = (ScanKey **) ptr;
635 ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
636
637 nullkeys = (ScanKey **) ptr;
638 ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
639
640 nkeys = (int *) ptr;
641 ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
642
643 nnullkeys = (int *) ptr;
644 ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
645
646 for (int i = 0; i < bdesc->bd_tupdesc->natts; i++)
647 {
648 keys[i] = (ScanKey *) ptr;
649 ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
650
651 nullkeys[i] = (ScanKey *) ptr;
652 ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
653 }
654
655 Assert(tmp + len == ptr);
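 /*
 * Editor's worked example (assuming a 64-bit build): with natts = 2 and
 * scan->numberOfKeys = 3, the single allocation is 16 bytes for keys and 16
 * for nullkeys (pointer arrays), 8 bytes each for nkeys and nnullkeys, plus
 * MAXALIGN(3 * sizeof(ScanKey)) = 24 bytes for each of the four per-attribute
 * arrays carved out in the loop above, i.e. len = 144 bytes in total.
 */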
656
657 /* zero the number of keys */
658 memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
659 memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
660
661 /* Preprocess the scan keys - split them into per-attribute arrays. */
662 for (int keyno = 0; keyno < scan->numberOfKeys; keyno++)
663 {
664 ScanKey key = &scan->keyData[keyno];
665 AttrNumber keyattno = key->sk_attno;
666
667 /*
668 * The collation of the scan key must match the collation used in the
669 * index column (but only if the search is not IS NULL/ IS NOT NULL).
670 * Otherwise we shouldn't be using this index ...
671 */
672 Assert((key->sk_flags & SK_ISNULL) ||
673 (key->sk_collation ==
674 TupleDescAttr(bdesc->bd_tupdesc,
675 keyattno - 1)->attcollation));
676
677 /*
678 * First time we see this index attribute, so init as needed.
679 *
680 * This is a bit of overkill - we don't know how many scan keys there
681 * are for this attribute, so we simply allocate the largest number
682 * possible (as if all keys were for this attribute). This may waste a
683 * bit of memory, but we only expect a small number of scan keys in
684 * general, so this should be negligible, and repeated repalloc calls
685 * are not free either.
686 */
687 if (consistentFn[keyattno - 1].fn_oid == InvalidOid)
688 {
689 FmgrInfo *tmp;
690
691 /* First time we see this attribute, so no key/null keys. */
692 Assert(nkeys[keyattno - 1] == 0);
693 Assert(nnullkeys[keyattno - 1] == 0);
694
695 tmp = index_getprocinfo(idxRel, keyattno,
696 BRIN_PROCNUM_CONSISTENT);
697 fmgr_info_copy(&consistentFn[keyattno - 1], tmp,
698 CurrentMemoryContext);
699 }
700
701 /* Add key to the proper per-attribute array. */
702 if (key->sk_flags & SK_ISNULL)
703 {
704 nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key;
705 nnullkeys[keyattno - 1]++;
706 }
707 else
708 {
709 keys[keyattno - 1][nkeys[keyattno - 1]] = key;
710 nkeys[keyattno - 1]++;
711 }
712 }
713
714 /* allocate an initial in-memory tuple, out of the per-range memcxt */
715 dtup = brin_new_memtuple(bdesc);
716
717 /*
718 * Setup and use a per-range memory context, which is reset every time we
719 * loop below. This avoids having to free the tuples within the loop.
720 */
722 "bringetbitmap cxt",
724 oldcxt = MemoryContextSwitchTo(perRangeCxt);
725
726 /*
727 * Now scan the revmap. We start by querying for heap page 0,
728 * incrementing by the number of pages per range; this gives us a full
729 * view of the table.
730 */
731 for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
732 {
733 bool addrange;
734 bool gottuple = false;
735 BrinTuple *tup;
736 OffsetNumber off;
737 Size size;
738
740
741 MemoryContextReset(perRangeCxt);
742
743 tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
744 &off, &size, BUFFER_LOCK_SHARE);
745 if (tup)
746 {
747 gottuple = true;
748 btup = brin_copy_tuple(tup, size, btup, &btupsz);
749 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
750 }
751
752 /*
753 * For page ranges with no indexed tuple, we must return the whole
754 * range; otherwise, compare it to the scan keys.
755 */
756 if (!gottuple)
757 {
758 addrange = true;
759 }
760 else
761 {
762 dtup = brin_deform_tuple(bdesc, btup, dtup);
763 if (dtup->bt_placeholder)
764 {
765 /*
766 * Placeholder tuples are always returned, regardless of the
767 * values stored in them.
768 */
769 addrange = true;
770 }
771 else
772 {
773 int attno;
774
775 /*
776 * Compare scan keys with summary values stored for the range.
777 * If scan keys are matched, the page range must be added to
778 * the bitmap. We initially assume the range needs to be
779 * added; in particular this serves the case where there are
780 * no keys.
781 */
782 addrange = true;
783 for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++)
784 {
785 BrinValues *bval;
786 Datum add;
787 Oid collation;
788
789 /*
790 * skip attributes without any scan keys (both regular and
791 * IS [NOT] NULL)
792 */
793 if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0)
794 continue;
795
796 bval = &dtup->bt_columns[attno - 1];
797
798 /*
799 * If the BRIN tuple indicates that this range is empty,
800 * we can skip it: there's nothing to match. We don't
801 * need to examine the next columns.
802 */
803 if (dtup->bt_empty_range)
804 {
805 addrange = false;
806 break;
807 }
808
809 /*
810 * First check if there are any IS [NOT] NULL scan keys,
811 * and if we're violating them. In that case we can
812 * terminate early, without invoking the support function.
813 *
814 * As there may be more keys, we can only determine
815 * mismatch within this loop.
816 */
817 if (bdesc->bd_info[attno - 1]->oi_regular_nulls &&
818 !check_null_keys(bval, nullkeys[attno - 1],
819 nnullkeys[attno - 1]))
820 {
821 /*
822 * If any of the IS [NOT] NULL keys failed, the page
823 * range as a whole can't pass. So terminate the loop.
824 */
825 addrange = false;
826 break;
827 }
828
829 /*
830 * So either there are no IS [NOT] NULL keys, or all
831 * passed. If there are no regular scan keys, we're done -
832 * the page range matches. If there are regular keys, but
833 * the page range is marked as 'all nulls' it can't
834 * possibly pass (we're assuming the operators are
835 * strict).
836 */
837
838 /* No regular scan keys - page range as a whole passes. */
839 if (!nkeys[attno - 1])
840 continue;
841
842 Assert((nkeys[attno - 1] > 0) &&
843 (nkeys[attno - 1] <= scan->numberOfKeys));
844
845 /* If it is all nulls, it cannot possibly be consistent. */
846 if (bval->bv_allnulls)
847 {
848 addrange = false;
849 break;
850 }
851
852 /*
853 * Collation from the first key (has to be the same for
854 * all keys for the same attribute).
855 */
856 collation = keys[attno - 1][0]->sk_collation;
857
858 /*
859 * Check whether the scan key is consistent with the page
860 * range values; if so, have the pages in the range added
861 * to the output bitmap.
862 *
863 * The opclass may or may not support processing of
864 * multiple scan keys. We can determine that based on the
865 * number of arguments - functions with extra parameter
866 * (number of scan keys) do support this, otherwise we
867 * have to simply pass the scan keys one by one.
868 */
869 if (consistentFn[attno - 1].fn_nargs >= 4)
870 {
871 /* Check all keys at once */
872 add = FunctionCall4Coll(&consistentFn[attno - 1],
873 collation,
874 PointerGetDatum(bdesc),
875 PointerGetDatum(bval),
876 PointerGetDatum(keys[attno - 1]),
877 Int32GetDatum(nkeys[attno - 1]));
878 addrange = DatumGetBool(add);
879 }
880 else
881 {
882 /*
883 * Check keys one by one
884 *
885 * When there are multiple scan keys, failure to meet
886 * the criteria for a single one of them is enough to
887 * discard the range as a whole, so break out of the
888 * loop as soon as a false return value is obtained.
889 */
890 int keyno;
891
892 for (keyno = 0; keyno < nkeys[attno - 1]; keyno++)
893 {
894 add = FunctionCall3Coll(&consistentFn[attno - 1],
895 keys[attno - 1][keyno]->sk_collation,
896 PointerGetDatum(bdesc),
897 PointerGetDatum(bval),
898 PointerGetDatum(keys[attno - 1][keyno]));
899 addrange = DatumGetBool(add);
900 if (!addrange)
901 break;
902 }
903 }
904
905 /*
906 * If we found a scan key eliminating the range, no need
907 * to check additional ones.
908 */
909 if (!addrange)
910 break;
911 }
912 }
913 }
914
915 /* add the pages in the range to the output bitmap, if needed */
916 if (addrange)
917 {
918 BlockNumber pageno;
919
920 for (pageno = heapBlk;
921 pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1;
922 pageno++)
923 {
924 MemoryContextSwitchTo(oldcxt);
925 tbm_add_page(tbm, pageno);
926 totalpages++;
927 MemoryContextSwitchTo(perRangeCxt);
928 }
929 }
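 /*
 * Editor's worked example for the loop bound above: with pages_per_range =
 * 128 and nblocks = 1000, the final range starting at heapBlk = 896 adds
 * pages 896..999 to the bitmap (Min(1000, 1024) - 1 = 999), not 896..1023.
 */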
930 }
931
932 MemoryContextSwitchTo(oldcxt);
933 MemoryContextDelete(perRangeCxt);
934
935 if (buf != InvalidBuffer)
936 ReleaseBuffer(buf);
937
938 /*
939 * XXX We have an approximation of the number of *pages* that our scan
940 * returns, but we don't have a precise idea of the number of heap tuples
941 * involved.
942 */
943 return totalpages * 10;
944}
945
946/*
947 * Re-initialize state for a BRIN index scan
948 */
949void
950brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
951 ScanKey orderbys, int norderbys)
952{
953 /*
954 * Other index AMs preprocess the scan keys at this point, or sometime
955 * early during the scan; this lets them optimize by removing redundant
956 * keys, or doing early returns when they are impossible to satisfy; see
957 * _bt_preprocess_keys for an example. Something like that could be added
958 * here someday, too.
959 */
960
961 if (scankey && scan->numberOfKeys > 0)
962 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
963}
964
965/*
966 * Close down a BRIN index scan
967 */
968void
970{
971 BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
972
973 brinRevmapTerminate(opaque->bo_rmAccess);
974 brin_free_desc(opaque->bo_bdesc);
975 pfree(opaque);
976}
977
978/*
979 * Per-heap-tuple callback for table_index_build_scan.
980 *
981 * Note we don't worry about the page range at the end of the table here; it is
982 * present in the build state struct after we're called the last time, but not
983 * inserted into the index. The caller must take care of inserting it, if appropriate.
984 */
985static void
986brinbuildCallback(Relation index,
987 ItemPointer tid,
988 Datum *values,
989 bool *isnull,
990 bool tupleIsAlive,
991 void *brstate)
992{
993 BrinBuildState *state = (BrinBuildState *) brstate;
994 BlockNumber thisblock;
995
996 thisblock = ItemPointerGetBlockNumber(tid);
997
998 /*
999 * If we're in a block that belongs to a future range, summarize what
1000 * we've got and start afresh. Note the scan might have skipped many
1001 * pages, if they were devoid of live tuples; make sure to insert index
1002 * tuples for those too.
1003 */
1004 while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
1005 {
1006
1008 "brinbuildCallback: completed a range: %u--%u",
1009 state->bs_currRangeStart,
1010 state->bs_currRangeStart + state->bs_pagesPerRange));
1011
1012 /* create the index tuple and insert it */
1013 form_and_insert_tuple(state);
1014
1015 /* set state to correspond to the next range */
1016 state->bs_currRangeStart += state->bs_pagesPerRange;
1017
1018 /* re-initialize state for it */
1019 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1020 }
1021
1022 /* Accumulate the current tuple into the running state */
1023 (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1024 values, isnull);
1025}
1026
1027/*
1028 * Per-heap-tuple callback for table_index_build_scan with parallelism.
1029 *
1030 * A version of the callback used by parallel index builds. The main difference
1031 * is that instead of writing the BRIN tuples into the index, we write them
1032 * into a shared tuplesort, and leave the insertion up to the leader (which may
1033 * reorder them a bit etc.). The callback also does not generate empty ranges,
1034 * those will be added by the leader when merging results from workers.
1035 */
1036static void
1037brinbuildCallbackParallel(Relation index,
1038 ItemPointer tid,
1039 Datum *values,
1040 bool *isnull,
1041 bool tupleIsAlive,
1042 void *brstate)
1043{
1044 BrinBuildState *state = (BrinBuildState *) brstate;
1045 BlockNumber thisblock;
1046
1047 thisblock = ItemPointerGetBlockNumber(tid);
1048
1049 /*
1050 * If we're in a block that belongs to a different range, summarize what
1051 * we've got and start afresh. Note the scan might have skipped many
1052 * pages, if they were devoid of live tuples; we do not create empty BRIN
1053 * ranges here - the leader is responsible for filling them in.
1054 *
1055 * Unlike serial builds, parallel index builds allow synchronized seqscans
1056 * (because that's what parallel scans do). This means the block may wrap
1057 * around to the beginning of the relation, so the condition needs to
1058 * check for both future and past ranges.
1059 */
1060 if ((thisblock < state->bs_currRangeStart) ||
1061 (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1))
1062 {
1063
1065 "brinbuildCallbackParallel: completed a range: %u--%u",
1066 state->bs_currRangeStart,
1067 state->bs_currRangeStart + state->bs_pagesPerRange));
1068
1069 /* create the index tuple and write it into the tuplesort */
1070 form_and_spill_tuple(state);
1071
1072 /*
1073 * Set state to correspond to the next range (for this block).
1074 *
1075 * This skips ranges that are either empty (and so we don't get any
1076 * tuples to summarize), or processed by other workers. We can't
1077 * differentiate those cases here easily, so we leave it up to the
1078 * leader to fill empty ranges where needed.
1079 */
1080 state->bs_currRangeStart
1081 = state->bs_pagesPerRange * (thisblock / state->bs_pagesPerRange);
1082
1083 /* re-initialize state for it */
1084 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1085 }
1086
1087 /* Accumulate the current tuple into the running state */
1088 (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1089 values, isnull);
1090}
1091
1092/*
1093 * brinbuild() -- build a new BRIN index.
1094 */
1095IndexBuildResult *
1096brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
1097{
1098 IndexBuildResult *result;
1099 double reltuples;
1100 double idxtuples;
1101 BrinRevmap *revmap;
1102 BrinBuildState *state;
1103 Buffer meta;
1104 BlockNumber pagesPerRange;
1105
1106 /*
1107 * We expect to be called exactly once for any index relation.
1108 */
1109 if (RelationGetNumberOfBlocks(index) != 0)
1110 elog(ERROR, "index \"%s\" already contains data",
1111 RelationGetRelationName(index));
1112
1113 /*
1114 * Critical section not required, because on error the creation of the
1115 * whole relation will be rolled back.
1116 */
1117
1121
1124 MarkBufferDirty(meta);
1125
1127 {
1128 xl_brin_createidx xlrec;
1129 XLogRecPtr recptr;
1130 Page page;
1131
1134
1135 XLogBeginInsert();
1136 XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx);
1137 XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD);
1138
1139 recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);
1140
1141 page = BufferGetPage(meta);
1142 PageSetLSN(page, recptr);
1143 }
1144
1145 UnlockReleaseBuffer(meta);
1146
1147 /*
1148 * Initialize our state, including the deformed tuple state.
1149 */
1150 revmap = brinRevmapInitialize(index, &pagesPerRange);
1151 state = initialize_brin_buildstate(index, revmap, pagesPerRange,
1152 RelationGetNumberOfBlocks(heap));
1153
1154 /*
1155 * Attempt to launch parallel worker scan when required
1156 *
1157 * XXX plan_create_index_workers makes the number of workers dependent on
1158 * maintenance_work_mem, requiring 32MB for each worker. That makes sense
1159 * for btree, but not for BRIN, which can do with much less memory. So
1160 * maybe make that somehow less strict, optionally?
1161 */
1162 if (indexInfo->ii_ParallelWorkers > 0)
1163 _brin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
1164 indexInfo->ii_ParallelWorkers);
1165
1166 /*
1167 * If parallel build requested and at least one worker process was
1168 * successfully launched, set up coordination state, wait for workers to
1169 * complete. Then read all tuples from the shared tuplesort and insert
1170 * them into the index.
1171 *
1172 * In serial mode, simply scan the table and build the index one index
1173 * tuple at a time.
1174 */
1175 if (state->bs_leader)
1176 {
1177 SortCoordinate coordinate;
1178
1179 coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
1180 coordinate->isWorker = false;
1181 coordinate->nParticipants =
1182 state->bs_leader->nparticipanttuplesorts;
1183 coordinate->sharedsort = state->bs_leader->sharedsort;
1184
1185 /*
1186 * Begin leader tuplesort.
1187 *
1188 * In cases where parallelism is involved, the leader receives the
1189 * same share of maintenance_work_mem as a serial sort (it is
1190 * generally treated in the same way as a serial sort once we return).
1191 * Parallel worker Tuplesortstates will have received only a fraction
1192 * of maintenance_work_mem, though.
1193 *
1194 * We rely on the lifetime of the Leader Tuplesortstate almost not
1195 * overlapping with any worker Tuplesortstate's lifetime. There may
1196 * be some small overlap, but that's okay because we rely on leader
1197 * Tuplesortstate only allocating a small, fixed amount of memory
1198 * here. When its tuplesort_performsort() is called (by our caller),
1199 * and significant amounts of memory are likely to be used, all
1200 * workers must have already freed almost all memory held by their
1201 * Tuplesortstates (they are about to go away completely, too). The
1202 * overall effect is that maintenance_work_mem always represents an
1203 * absolute high watermark on the amount of memory used by a CREATE
1204 * INDEX operation, regardless of the use of parallelism or any other
1205 * factor.
1206 */
1207 state->bs_sortstate =
1208 tuplesort_begin_index_brin(maintenance_work_mem, coordinate,
1209 TUPLESORT_NONE);
1210
1211 /* scan the relation and merge per-worker results */
1212 reltuples = _brin_parallel_merge(state);
1213
1214 _brin_end_parallel(state->bs_leader, state);
1215 }
1216 else /* no parallel index build */
1217 {
1218 /*
1219 * Now scan the relation. No syncscan allowed here because we want
1220 * the heap blocks in physical order (we want to produce the ranges
1221 * starting from block 0, and the callback also relies on this to not
1222 * generate summary for the same range twice).
1223 */
1224 reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
1225 brinbuildCallback, state, NULL);
1226
1227 /*
1228 * process the final batch
1229 *
1230 * XXX Note this does not update state->bs_currRangeStart, i.e. it
1231 * stays set to the last range added to the index. This is OK, because
1232 * that's what brin_fill_empty_ranges expects.
1233 */
1234 form_and_insert_tuple(state);
1235
1236 /*
1237 * Backfill the final ranges with empty data.
1238 *
1239 * This saves us from doing what amounts to full table scans when the
1240 * index is used with a predicate like WHERE (nonnull_column IS NULL), or
1241 * other very selective predicates.
1242 */
1243 brin_fill_empty_ranges(state,
1244 state->bs_currRangeStart,
1245 state->bs_maxRangeStart);
1246 }
1247
1248 /* release resources */
1249 idxtuples = state->bs_numtuples;
1250 brinRevmapTerminate(state->bs_rmAccess);
1251 terminate_brin_buildstate(state);
1252
1253 /*
1254 * Return statistics
1255 */
1256 result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
1257
1258 result->heap_tuples = reltuples;
1259 result->index_tuples = idxtuples;
1260
1261 return result;
1262}
1263
1264void
1265brinbuildempty(Relation index)
1266{
1267 Buffer metabuf;
1268
1269 /* An empty BRIN index has a metapage only. */
1270 metabuf = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
1271 EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
1272
1273 /* Initialize and xlog metabuffer. */
1274 START_CRIT_SECTION();
1275 brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
1276 BRIN_CURRENT_VERSION);
1277 MarkBufferDirty(metabuf);
1278 log_newpage_buffer(metabuf, true);
1279 END_CRIT_SECTION();
1280
1281 UnlockReleaseBuffer(metabuf);
1282}
1283
1284/*
1285 * brinbulkdelete
1286 * Since there are no per-heap-tuple index tuples in BRIN indexes,
1287 * there's not a lot we can do here.
1288 *
1289 * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
1290 * tuple is deleted), meaning the need to re-run summarization on the affected
1291 * range. Would need to add an extra flag in brintuples for that.
1292 */
1293IndexBulkDeleteResult *
1294brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1295 IndexBulkDeleteCallback callback, void *callback_state)
1296{
1297 /* allocate stats if first time through, else re-use existing struct */
1298 if (stats == NULL)
1299 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1300
1301 return stats;
1302}
1303
1304/*
1305 * This routine is in charge of "vacuuming" a BRIN index: we just summarize
1306 * ranges that are currently unsummarized.
1307 */
1308IndexBulkDeleteResult *
1309brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1310{
1311 Relation heapRel;
1312
1313 /* No-op in ANALYZE ONLY mode */
1314 if (info->analyze_only)
1315 return stats;
1316
1317 if (!stats)
1318 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1319 stats->num_pages = RelationGetNumberOfBlocks(info->index);
1320 /* rest of stats is initialized by zeroing */
1321
1322 heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false),
1323 AccessShareLock);
1324
1325 brin_vacuum_scan(info->index, info->strategy);
1326
1327 brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,
1328 &stats->num_index_tuples, &stats->num_index_tuples);
1329
1330 table_close(heapRel, AccessShareLock);
1331
1332 return stats;
1333}
1334
1335/*
1336 * reloptions processor for BRIN indexes
1337 */
1338bytea *
1339brinoptions(Datum reloptions, bool validate)
1340{
1341 static const relopt_parse_elt tab[] = {
1342 {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
1343 {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
1344 };
1345
1346 return (bytea *) build_reloptions(reloptions, validate,
1347 RELOPT_KIND_BRIN,
1348 sizeof(BrinOptions),
1349 tab, lengthof(tab));
1350}
1351
1352/*
1353 * SQL-callable function to scan through an index and summarize all ranges
1354 * that are not currently summarized.
1355 */
1356Datum
1357brin_summarize_new_values(PG_FUNCTION_ARGS)
1358{
1359 Datum relation = PG_GETARG_DATUM(0);
1360
1361 return DirectFunctionCall2(brin_summarize_range,
1362 relation,
1363 Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));
1364}
1365
1366/*
1367 * SQL-callable function to summarize the indicated page range, if not already
1368 * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all
1369 * unsummarized ranges are summarized.
1370 */
1371Datum
1372brin_summarize_range(PG_FUNCTION_ARGS)
1373{
1374 Oid indexoid = PG_GETARG_OID(0);
1375 int64 heapBlk64 = PG_GETARG_INT64(1);
1376 BlockNumber heapBlk;
1377 Oid heapoid;
1378 Relation indexRel;
1379 Relation heapRel;
1380 Oid save_userid;
1381 int save_sec_context;
1382 int save_nestlevel;
1383 double numSummarized = 0;
1384
1385 if (RecoveryInProgress())
1386 ereport(ERROR,
1387 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1388 errmsg("recovery is in progress"),
1389 errhint("BRIN control functions cannot be executed during recovery.")));
1390
1391 if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)
1392 ereport(ERROR,
1393 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1394 errmsg("block number out of range: %lld",
1395 (long long) heapBlk64)));
1396 heapBlk = (BlockNumber) heapBlk64;
1397
1398 /*
1399 * We must lock table before index to avoid deadlocks. However, if the
1400 * passed indexoid isn't an index then IndexGetRelation() will fail.
1401 * Rather than emitting a not-very-helpful error message, postpone
1402 * complaining, expecting that the is-it-an-index test below will fail.
1403 */
1404 heapoid = IndexGetRelation(indexoid, true);
1405 if (OidIsValid(heapoid))
1406 {
1407 heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
1408
1409 /*
1410 * Autovacuum calls us. For its benefit, switch to the table owner's
1411 * userid, so that any index functions are run as that user. Also
1412 * lock down security-restricted operations and arrange to make GUC
1413 * variable changes local to this command. This is harmless, albeit
1414 * unnecessary, when called from SQL, because we fail shortly if the
1415 * user does not own the index.
1416 */
1417 GetUserIdAndSecContext(&save_userid, &save_sec_context);
1418 SetUserIdAndSecContext(heapRel->rd_rel->relowner,
1419 save_sec_context | SECURITY_RESTRICTED_OPERATION);
1420 save_nestlevel = NewGUCNestLevel();
1422 }
1423 else
1424 {
1425 heapRel = NULL;
1426 /* Set these just to suppress "uninitialized variable" warnings */
1427 save_userid = InvalidOid;
1428 save_sec_context = -1;
1429 save_nestlevel = -1;
1430 }
1431
1432 indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
1433
1434 /* Must be a BRIN index */
1435 if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
1436 indexRel->rd_rel->relam != BRIN_AM_OID)
1437 ereport(ERROR,
1438 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1439 errmsg("\"%s\" is not a BRIN index",
1440 RelationGetRelationName(indexRel))));
1441
1442 /* User must own the index (comparable to privileges needed for VACUUM) */
1443 if (heapRel != NULL && !object_ownercheck(RelationRelationId, indexoid, save_userid))
1444 aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
1445 RelationGetRelationName(indexRel));
1446
1447 /*
1448 * Since we did the IndexGetRelation call above without any lock, it's
1449 * barely possible that a race against an index drop/recreation could have
1450 * netted us the wrong table. Recheck.
1451 */
1452 if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
1453 ereport(ERROR,
1455 errmsg("could not open parent table of index \"%s\"",
1456 RelationGetRelationName(indexRel))));
1457
1458 /* see gin_clean_pending_list() */
1459 if (indexRel->rd_index->indisvalid)
1460 brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL);
1461 else
1462 ereport(ERROR,
1463 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1464 errmsg("index \"%s\" is not valid",
1465 RelationGetRelationName(indexRel))));
1466
1467 /* Roll back any GUC changes executed by index functions */
1468 AtEOXact_GUC(false, save_nestlevel);
1469
1470 /* Restore userid and security context */
1471 SetUserIdAndSecContext(save_userid, save_sec_context);
1472
1473 relation_close(indexRel, ShareUpdateExclusiveLock);
1474 relation_close(heapRel, ShareUpdateExclusiveLock);
1475
1476 PG_RETURN_INT32((int32) numSummarized);
1477}
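/*
 * Editor's usage example (illustrative; the index name is hypothetical):
 *
 *     SELECT brin_summarize_new_values('measurements_ts_idx');
 *     SELECT brin_summarize_range('measurements_ts_idx', 0);
 *
 * The second argument of brin_summarize_range() is a heap block number; any
 * block inside the desired range selects that range, and the special value
 * BRIN_ALL_BLOCKRANGES (used internally by brin_summarize_new_values) means
 * "all unsummarized ranges".
 */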
1478
1479/*
1480 * SQL-callable interface to mark a range as no longer summarized
1481 */
1482Datum
1483brin_desummarize_range(PG_FUNCTION_ARGS)
1484{
1485 Oid indexoid = PG_GETARG_OID(0);
1486 int64 heapBlk64 = PG_GETARG_INT64(1);
1487 BlockNumber heapBlk;
1488 Oid heapoid;
1489 Relation heapRel;
1490 Relation indexRel;
1491 bool done;
1492
1493 if (RecoveryInProgress())
1494 ereport(ERROR,
1495 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1496 errmsg("recovery is in progress"),
1497 errhint("BRIN control functions cannot be executed during recovery.")));
1498
1499 if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)
1500 ereport(ERROR,
1501 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1502 errmsg("block number out of range: %lld",
1503 (long long) heapBlk64)));
1504 heapBlk = (BlockNumber) heapBlk64;
1505
1506 /*
1507 * We must lock table before index to avoid deadlocks. However, if the
1508 * passed indexoid isn't an index then IndexGetRelation() will fail.
1509 * Rather than emitting a not-very-helpful error message, postpone
1510 * complaining, expecting that the is-it-an-index test below will fail.
1511 *
1512 * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we
1513 * don't switch userid.
1514 */
1515 heapoid = IndexGetRelation(indexoid, true);
1516 if (OidIsValid(heapoid))
1517 heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
1518 else
1519 heapRel = NULL;
1520
1521 indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
1522
1523 /* Must be a BRIN index */
1524 if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
1525 indexRel->rd_rel->relam != BRIN_AM_OID)
1526 ereport(ERROR,
1527 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1528 errmsg("\"%s\" is not a BRIN index",
1529 RelationGetRelationName(indexRel))));
1530
1531 /* User must own the index (comparable to privileges needed for VACUUM) */
1532 if (!object_ownercheck(RelationRelationId, indexoid, GetUserId()))
1533 aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
1534 RelationGetRelationName(indexRel));
1535
1536 /*
1537 * Since we did the IndexGetRelation call above without any lock, it's
1538 * barely possible that a race against an index drop/recreation could have
1539 * netted us the wrong table. Recheck.
1540 */
1541 if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
1542 ereport(ERROR,
1544 errmsg("could not open parent table of index \"%s\"",
1545 RelationGetRelationName(indexRel))));
1546
1547 /* see gin_clean_pending_list() */
1548 if (indexRel->rd_index->indisvalid)
1549 {
1550 /* the revmap does the hard work */
1551 do
1552 {
1553 done = brinRevmapDesummarizeRange(indexRel, heapBlk);
1554 }
1555 while (!done);
1556 }
1557 else
1558 ereport(ERROR,
1559 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1560 errmsg("index \"%s\" is not valid",
1561 RelationGetRelationName(indexRel))));
1562
1563 relation_close(indexRel, ShareUpdateExclusiveLock);
1564 relation_close(heapRel, ShareUpdateExclusiveLock);
1565
1566 PG_RETURN_VOID();
1567}
1568
1569/*
1570 * Build a BrinDesc used to create or scan a BRIN index
1571 */
1572BrinDesc *
1573brin_build_desc(Relation rel)
1574{
1575 BrinOpcInfo **opcinfo;
1576 BrinDesc *bdesc;
1577 TupleDesc tupdesc;
1578 int totalstored = 0;
1579 int keyno;
1580 long totalsize;
1581 MemoryContext cxt;
1582 MemoryContext oldcxt;
1583
1585 "brin desc cxt",
1587 oldcxt = MemoryContextSwitchTo(cxt);
1588 tupdesc = RelationGetDescr(rel);
1589
1590 /*
1591 * Obtain BrinOpcInfo for each indexed column. While at it, accumulate
1592 * the number of columns stored, since the number is opclass-defined.
1593 */
1594 opcinfo = palloc_array(BrinOpcInfo *, tupdesc->natts);
1595 for (keyno = 0; keyno < tupdesc->natts; keyno++)
1596 {
1597 FmgrInfo *opcInfoFn;
1598 Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno);
1599
1600 opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
1601
1602 opcinfo[keyno] = (BrinOpcInfo *)
1603 DatumGetPointer(FunctionCall1(opcInfoFn, attr->atttypid));
1604 totalstored += opcinfo[keyno]->oi_nstored;
1605 }
1606
1607 /* Allocate our result struct and fill it in */
1608 totalsize = offsetof(BrinDesc, bd_info) +
1609 sizeof(BrinOpcInfo *) * tupdesc->natts;
1610
1611 bdesc = palloc(totalsize);
1612 bdesc->bd_context = cxt;
1613 bdesc->bd_index = rel;
1614 bdesc->bd_tupdesc = tupdesc;
1615 bdesc->bd_disktdesc = NULL; /* generated lazily */
1616 bdesc->bd_totalstored = totalstored;
1617
1618 for (keyno = 0; keyno < tupdesc->natts; keyno++)
1619 bdesc->bd_info[keyno] = opcinfo[keyno];
1620 pfree(opcinfo);
1621
1622 MemoryContextSwitchTo(oldcxt);
1623
1624 return bdesc;
1625}
1626
1627void
1628brin_free_desc(BrinDesc *bdesc)
1629{
1630 /* make sure the tupdesc is still valid */
1631 Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
1632 /* no need for retail pfree */
1633 MemoryContextDelete(bdesc->bd_context);
1634}
1635
1636/*
1637 * Fetch index's statistical data into *stats
1638 */
1639void
1640brinGetStats(Relation index, BrinStatsData *stats)
1641{
1642 Buffer metabuffer;
1643 Page metapage;
1644 BrinMetaPageData *metadata;
1645
1646 metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
1647 LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
1648 metapage = BufferGetPage(metabuffer);
1649 metadata = (BrinMetaPageData *) PageGetContents(metapage);
1650
1651 stats->pagesPerRange = metadata->pagesPerRange;
1652 stats->revmapNumPages = metadata->lastRevmapPage - 1;
1653
1654 UnlockReleaseBuffer(metabuffer);
1655}
1656
1657/*
1658 * Initialize a BrinBuildState appropriate to create tuples on the given index.
1659 */
1660static BrinBuildState *
1661initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
1662 BlockNumber pagesPerRange, BlockNumber tablePages)
1663{
1665 BlockNumber lastRange = 0;
1666
1668
1669 state->bs_irel = idxRel;
1670 state->bs_numtuples = 0;
1671 state->bs_reltuples = 0;
1672 state->bs_currentInsertBuf = InvalidBuffer;
1673 state->bs_pagesPerRange = pagesPerRange;
1674 state->bs_currRangeStart = 0;
1675 state->bs_rmAccess = revmap;
1676 state->bs_bdesc = brin_build_desc(idxRel);
1677 state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
1678 state->bs_leader = NULL;
1679 state->bs_worker_id = 0;
1680 state->bs_sortstate = NULL;
1681 state->bs_context = CurrentMemoryContext;
1682 state->bs_emptyTuple = NULL;
1683 state->bs_emptyTupleLen = 0;
1684
1685 /* Remember the memory context to use for an empty tuple, if needed. */
1686 state->bs_context = CurrentMemoryContext;
1687 state->bs_emptyTuple = NULL;
1688 state->bs_emptyTupleLen = 0;
1689
1690 /*
1691 * Calculate the start of the last page range. Page numbers are 0-based,
1692 * so to calculate the index we need to subtract one. The integer division
1693 * gives us the index of the page range.
1694 */
1695 if (tablePages > 0)
1696 lastRange = ((tablePages - 1) / pagesPerRange) * pagesPerRange;
1697
1698 /* Now calculate the start of the next range. */
1699 state->bs_maxRangeStart = lastRange + state->bs_pagesPerRange;
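 /*
 * Editor's worked example: with tablePages = 1000 and pagesPerRange = 128,
 * lastRange = ((1000 - 1) / 128) * 128 = 896, so bs_maxRangeStart becomes
 * 1024, the start of the first range past the end of the table.
 */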
1700
1701 return state;
1702}
1703
1704/*
1705 * Release resources associated with a BrinBuildState.
1706 */
1707static void
1709{
1710 /*
1711 * Release the last index buffer used. We might as well ensure that
1712 * whatever free space remains in that page is available in FSM, too.
1713 */
1714 if (!BufferIsInvalid(state->bs_currentInsertBuf))
1715 {
1716 Page page;
1717 Size freespace;
1718 BlockNumber blk;
1719
1720 page = BufferGetPage(state->bs_currentInsertBuf);
1721 freespace = PageGetFreeSpace(page);
1722 blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
1723 ReleaseBuffer(state->bs_currentInsertBuf);
1724 RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
1725 FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
1726 }
1727
1728 brin_free_desc(state->bs_bdesc);
1729 pfree(state->bs_dtuple);
1730 pfree(state);
1731}
1732
1733/*
1734 * On the given BRIN index, summarize the heap page range that corresponds
1735 * to the heap block number given.
1736 *
1737 * This routine can run in parallel with insertions into the heap. To avoid
1738 * missing those values from the summary tuple, we first insert a placeholder
1739 * index tuple into the index, then execute the heap scan; transactions
1740 * concurrent with the scan update the placeholder tuple. After the scan, we
1741 * union the placeholder tuple with the one computed by this routine. The
1742 * update of the index value happens in a loop, so that if somebody updates
1743 * the placeholder tuple after we read it, we detect the case and try again.
1744 * This ensures that the concurrently inserted tuples are not lost.
1745 *
1746 * A further corner case is this routine being asked to summarize the partial
1747 * range at the end of the table. heapNumBlocks is the (possibly outdated)
1748 * table size; if we notice that the requested range lies beyond that size,
1749 * we re-compute the table size after inserting the placeholder tuple, to
1750 * avoid missing pages that were appended recently.
1751 */
1752static void
1753summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
1754 BlockNumber heapBlk, BlockNumber heapNumBlks)
1755{
1756 Buffer phbuf;
1757 BrinTuple *phtup;
1758 Size phsz;
1759 OffsetNumber offset;
1760 BlockNumber scanNumBlks;
1761
1762 /*
1763 * Insert the placeholder tuple
1764 */
1765 phbuf = InvalidBuffer;
1766 phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
1767 offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
1768 state->bs_rmAccess, &phbuf,
1769 heapBlk, phtup, phsz);
1770
1771 /*
1772 * Compute range end. We hold ShareUpdateExclusive lock on table, so it
1773 * cannot shrink concurrently (but it can grow).
1774 */
1775 Assert(heapBlk % state->bs_pagesPerRange == 0);
1776 if (heapBlk + state->bs_pagesPerRange > heapNumBlks)
1777 {
1778 /*
1779 * If we're asked to scan what we believe to be the final range on the
1780 * table (i.e. a range that might be partial) we need to recompute our
1781 * idea of what the latest page is after inserting the placeholder
1782 * tuple. Anyone that grows the table later will update the
1783 * placeholder tuple, so it doesn't matter that we won't scan these
1784 * pages ourselves. Careful: the table might have been extended
1785 * beyond the current range, so clamp our result.
1786 *
1787 * Fortunately, this should occur infrequently.
1788 */
1789 scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,
1790 state->bs_pagesPerRange);
1791 }
1792 else
1793 {
1794 /* Easy case: range is known to be complete */
1795 scanNumBlks = state->bs_pagesPerRange;
1796 }
1797
1798 /*
1799 * Execute the partial heap scan covering the heap blocks in the specified
1800 * page range, summarizing the heap tuples in it. This scan stops just
1801 * short of brinbuildCallback creating the new index entry.
1802 *
1803 * Note that it is critical we use the "any visible" mode of
1804 * table_index_build_range_scan here: otherwise, we would miss tuples
1805 * inserted by transactions that are still in progress, among other corner
1806 * cases.
1807 */
1808 state->bs_currRangeStart = heapBlk;
1809 table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false,
1810 heapBlk, scanNumBlks,
1811 brinbuildCallback, state, NULL);
1812
1813 /*
1814 * Now we update the values obtained by the scan with the placeholder
1815 * tuple. We do this in a loop which only terminates if we're able to
1816 * update the placeholder tuple successfully; if we are not, this means
1817 * somebody else modified the placeholder tuple after we read it.
1818 */
1819 for (;;)
1820 {
1821 BrinTuple *newtup;
1822 Size newsize;
1823 bool didupdate;
1824 bool samepage;
1825
1827
1828 /*
1829 * Update the summary tuple and try to update.
1830 */
1831 newtup = brin_form_tuple(state->bs_bdesc,
1832 heapBlk, state->bs_dtuple, &newsize);
1833 samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
1834 didupdate =
1835 brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
1836 state->bs_rmAccess, heapBlk, phbuf, offset,
1837 phtup, phsz, newtup, newsize, samepage);
1838 brin_free_tuple(phtup);
1839 brin_free_tuple(newtup);
1840
1841 /* If the update succeeded, we're done. */
1842 if (didupdate)
1843 break;
1844
1845 /*
1846 * If the update didn't work, it might be because somebody updated the
1847 * placeholder tuple concurrently. Extract the new version, union it
1848 * with the values we have from the scan, and start over. (There are
1849 * other reasons for the update to fail, but it's simple to treat them
1850 * the same.)
1851 */
1852 phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
1853 &offset, &phsz, BUFFER_LOCK_SHARE);
1854 /* the placeholder tuple must exist */
1855 if (phtup == NULL)
1856 elog(ERROR, "missing placeholder tuple");
1857 phtup = brin_copy_tuple(phtup, phsz, NULL, NULL);
1858 LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
1859
1860 /* merge it into the tuple from the heap scan */
1861 union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
1862 }
1863
1864 ReleaseBuffer(phbuf);
1865}
1866
1867/*
1868 * Summarize page ranges that are not already summarized. If pageRange is
1869 * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
1870 * page range containing the given heap page number is scanned.
1871 * If include_partial is true, then the partial range at the end of the table
1872 * is summarized, otherwise not.
1873 *
1874 * For each new index tuple inserted, *numSummarized (if not NULL) is
1875 * incremented; for each existing tuple, *numExisting (if not NULL) is
1876 * incremented.
1877 */
1878static void
1879brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
1880 bool include_partial, double *numSummarized, double *numExisting)
1881{
1882 BrinRevmap *revmap;
1883 BrinBuildState *state = NULL;
1884 IndexInfo *indexInfo = NULL;
1885 BlockNumber heapNumBlocks;
1886 BlockNumber pagesPerRange;
1887 Buffer buf;
1888 BlockNumber startBlk;
1889
1890 revmap = brinRevmapInitialize(index, &pagesPerRange);
1891
1892 /* determine range of pages to process */
1893 heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
1894 if (pageRange == BRIN_ALL_BLOCKRANGES)
1895 startBlk = 0;
1896 else
1897 {
1898 startBlk = (pageRange / pagesPerRange) * pagesPerRange;
1899 heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange);
1900 }
1901 if (startBlk > heapNumBlocks)
1902 {
1903 /* Nothing to do if start point is beyond end of table */
1904 brinRevmapTerminate(revmap);
1905 return;
1906 }
1907
1908 /*
1909 * Scan the revmap to find unsummarized items.
1910 */
1912 for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)
1913 {
1914 BrinTuple *tup;
1915 OffsetNumber off;
1916
1917 /*
1918 * Unless requested to summarize even a partial range, go away now if
1919 * we think the next range is partial. Caller would pass true when it
1920 * is typically run once bulk data loading is done
1921 * (brin_summarize_new_values), and false when it is typically the
1922 * result of an arbitrarily-scheduled maintenance command (vacuuming).
1923 */
1924 if (!include_partial &&
1925 (startBlk + pagesPerRange > heapNumBlocks))
1926 break;
1927
1929
1930 tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL,
1931 BUFFER_LOCK_SHARE);
1932 if (tup == NULL)
1933 {
1934 /* no revmap entry for this heap range. Summarize it. */
1935 if (state == NULL)
1936 {
1937 /* first time through */
1938 Assert(!indexInfo);
1939 state = initialize_brin_buildstate(index, revmap,
1940 pagesPerRange,
1941 InvalidBlockNumber);
1942 indexInfo = BuildIndexInfo(index);
1943 }
1944 summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);
1945
1946 /* and re-initialize state for the next range */
1947 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1948
1949 if (numSummarized)
1950 *numSummarized += 1.0;
1951 }
1952 else
1953 {
1954 if (numExisting)
1955 *numExisting += 1.0;
1956 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1957 }
1958 }
1959
1960 if (BufferIsValid(buf))
1961 ReleaseBuffer(buf);
1962
1963 /* free resources */
1964 brinRevmapTerminate(revmap);
1965 if (state)
1966 {
1967 terminate_brin_buildstate(state);
1968 pfree(indexInfo);
1969 }
1970}
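
/*
 * Illustrative sketch (not part of brin.c): the arithmetic brinsummarize()
 * uses to find the start of the page range containing a given heap block,
 * and to detect the partial range at the end of the table. The values below
 * are made up for the example.
 */
#include <stdio.h>

typedef unsigned BlockNumber;

static BlockNumber
range_start(BlockNumber heapBlk, BlockNumber pagesPerRange)
{
	/* round down to a multiple of pagesPerRange, as brinsummarize() does */
	return (heapBlk / pagesPerRange) * pagesPerRange;
}

int
main(void)
{
	BlockNumber pagesPerRange = 128;	/* default BRIN pages_per_range */
	BlockNumber heapNumBlocks = 1000;	/* hypothetical table size */
	BlockNumber heapBlk = 300;
	BlockNumber startBlk = range_start(heapBlk, pagesPerRange);

	printf("block %u belongs to the range starting at block %u\n",
		   heapBlk, startBlk);

	/* the last range [896, 1023] extends past the end of a 1000-block table */
	BlockNumber lastStart = range_start(heapNumBlocks - 1, pagesPerRange);
	int			partial = (lastStart + pagesPerRange > heapNumBlocks);

	printf("range starting at %u is %s\n", lastStart,
		   partial ? "partial" : "complete");
	return 0;
}
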
1971
1972/*
1973 * Given a deformed tuple in the build state, convert it into the on-disk
1974 * format and insert it into the index, making the revmap point to it.
1975 */
1976static void
1977form_and_insert_tuple(BrinBuildState *state)
1978{
1979 BrinTuple *tup;
1980 Size size;
1981
1982 tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
1983 state->bs_dtuple, &size);
1984 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
1985 &state->bs_currentInsertBuf, state->bs_currRangeStart,
1986 tup, size);
1987 state->bs_numtuples++;
1988
1989 pfree(tup);
1990}
1991
1992/*
1993 * Given a deformed tuple in the build state, convert it into the on-disk
1994 * format and write it to a (shared) tuplesort (the leader will insert it
1995 * into the index later).
1996 */
1997static void
1998form_and_spill_tuple(BrinBuildState *state)
1999{
2000 BrinTuple *tup;
2001 Size size;
2002
2003 /* don't insert empty tuples in parallel build */
2004 if (state->bs_dtuple->bt_empty_range)
2005 return;
2006
2007 tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
2008 state->bs_dtuple, &size);
2009
2010 /* write the BRIN tuple to the tuplesort */
2011 tuplesort_putbrintuple(state->bs_sortstate, tup, size);
2012
2013 state->bs_numtuples++;
2014
2015 pfree(tup);
2016}
2017
2018/*
2019 * Given two deformed tuples, adjust the first one so that it's consistent
2020 * with the summary values in both.
2021 */
2022static void
2023union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
2024{
2025 int keyno;
2026 BrinMemTuple *db;
2027 MemoryContext cxt;
2028 MemoryContext oldcxt;
2029
2030 /* Use our own memory context to avoid retail pfree */
2032 "brin union",
2034 oldcxt = MemoryContextSwitchTo(cxt);
2035 db = brin_deform_tuple(bdesc, b, NULL);
2036 MemoryContextSwitchTo(oldcxt);
2037
2038 /*
2039 * Check if the ranges are empty.
2040 *
2041 * If at least one of them is empty, we don't need to call per-key union
2042 * functions at all. If "b" is empty, we just use "a" as the result (it
2043 * might be empty too, but that's fine). If "a" is empty but "b" is not,
2044 * we use "b" as the result (but we have to copy the data into "a" first).
2045 *
2046 * Only when both ranges are non-empty do we actually do the per-key merge.
2047 */
2048
2049 /* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */
2050 if (db->bt_empty_range)
2051 {
2052 /* skip the per-key merge */
2053 MemoryContextDelete(cxt);
2054 return;
2055 }
2056
2057 /*
2058 * Now we know "b" is not empty. If "a" is empty, then "b" is the result.
2059 * But we need to copy the data from "b" to "a" first, because that's how
2060 * we pass result out.
2061 *
2062 * We have to copy all the global/per-key flags etc. too.
2063 */
2064 if (a->bt_empty_range)
2065 {
2066 for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2067 {
2068 int i;
2069 BrinValues *col_a = &a->bt_columns[keyno];
2070 BrinValues *col_b = &db->bt_columns[keyno];
2071 BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2072
2073 col_a->bv_allnulls = col_b->bv_allnulls;
2074 col_a->bv_hasnulls = col_b->bv_hasnulls;
2075
2076 /* If "b" has no data, we're done. */
2077 if (col_b->bv_allnulls)
2078 continue;
2079
2080 for (i = 0; i < opcinfo->oi_nstored; i++)
2081 col_a->bv_values[i] =
2082 datumCopy(col_b->bv_values[i],
2083 opcinfo->oi_typcache[i]->typbyval,
2084 opcinfo->oi_typcache[i]->typlen);
2085 }
2086
2087 /* "a" started empty, but "b" was not empty, so remember that */
2088 a->bt_empty_range = false;
2089
2090 /* skip the per-key merge */
2091 MemoryContextDelete(cxt);
2092 return;
2093 }
2094
2095 /* Now we know neither range is empty. */
2096 for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2097 {
2098 FmgrInfo *unionFn;
2099 BrinValues *col_a = &a->bt_columns[keyno];
2100 BrinValues *col_b = &db->bt_columns[keyno];
2101 BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2102
2103 if (opcinfo->oi_regular_nulls)
2104 {
2105 /* Does the "b" summary represent any NULL values? */
2106 bool b_has_nulls = (col_b->bv_hasnulls || col_b->bv_allnulls);
2107
2108 /* Adjust "hasnulls". */
2109 if (!col_a->bv_allnulls && b_has_nulls)
2110 col_a->bv_hasnulls = true;
2111
2112 /* If there are no values in B, there's nothing left to do. */
2113 if (col_b->bv_allnulls)
2114 continue;
2115
2116 /*
2117 * Adjust "allnulls". If A doesn't have values, just copy the
2118 * values from B into A, and we're done. We cannot run the
2119 * operators in this case, because values in A might contain
2120 * garbage. Note we already established that B contains values.
2121 *
2122 * Also adjust "hasnulls" in order not to forget the summary
2123 * represents NULL values. This is not redundant with the earlier
2124 * update, because that only happens when allnulls=false.
2125 */
2126 if (col_a->bv_allnulls)
2127 {
2128 int i;
2129
2130 col_a->bv_allnulls = false;
2131 col_a->bv_hasnulls = true;
2132
2133 for (i = 0; i < opcinfo->oi_nstored; i++)
2134 col_a->bv_values[i] =
2135 datumCopy(col_b->bv_values[i],
2136 opcinfo->oi_typcache[i]->typbyval,
2137 opcinfo->oi_typcache[i]->typlen);
2138
2139 continue;
2140 }
2141 }
2142
2143 unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
2144 BRIN_PROCNUM_UNION);
2145 FunctionCall3Coll(unionFn,
2146 bdesc->bd_index->rd_indcollation[keyno],
2147 PointerGetDatum(bdesc),
2148 PointerGetDatum(col_a),
2149 PointerGetDatum(col_b));
2150 }
2151
2152 MemoryContextDelete(cxt);
2153}
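
/*
 * Illustrative sketch (not part of brin.c): what a per-key union conceptually
 * does for a minmax-style opclass -- the merged summary must cover every
 * value covered by either input. The types and values here are simplified
 * stand-ins, not the real BrinValues representation.
 */
#include <stdio.h>

typedef struct MinMaxSummary
{
	int			min;
	int			max;
	int			hasnulls;		/* range contains NULLs */
} MinMaxSummary;

/* merge summary b into a, analogous to a BRIN_PROCNUM_UNION support function */
static void
minmax_union(MinMaxSummary *a, const MinMaxSummary *b)
{
	if (b->min < a->min)
		a->min = b->min;
	if (b->max > a->max)
		a->max = b->max;
	a->hasnulls |= b->hasnulls;
}

int
main(void)
{
	MinMaxSummary a = {10, 20, 0};
	MinMaxSummary b = {5, 15, 1};

	minmax_union(&a, &b);
	printf("union: [%d, %d], hasnulls=%d\n", a.min, a.max, a.hasnulls);	/* [5, 20], 1 */
	return 0;
}
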
2154
2155/*
2156 * brin_vacuum_scan
2157 * Do a complete scan of the index during VACUUM.
2158 *
2159 * This routine scans the complete index looking for uncataloged index pages,
2160 * i.e. those that might have been lost due to a crash after index extension
2161 * and such.
2162 */
2163static void
2164brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
2165{
2166 BlockNumber nblocks;
2167 BlockNumber blkno;
2168
2169 /*
2170 * Scan the index in physical order, and clean up any possible mess in
2171 * each page.
2172 */
2173 nblocks = RelationGetNumberOfBlocks(idxrel);
2174 for (blkno = 0; blkno < nblocks; blkno++)
2175 {
2176 Buffer buf;
2177
2178 CHECK_FOR_INTERRUPTS();
2179
2180 buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
2181 RBM_NORMAL, strategy);
2182
2183 brin_page_cleanup(idxrel, buf);
2184
2185 ReleaseBuffer(buf);
2186 }
2187
2188 /*
2189 * Update all upper pages in the index's FSM, as well. This ensures not
2190 * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
2191 * but also that any pre-existing damage or out-of-dateness is repaired.
2192 */
2193 FreeSpaceMapVacuum(idxrel);
2194}
2195
2196static bool
2197add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
2198 const Datum *values, const bool *nulls)
2199{
2200 int keyno;
2201
2202 /* If the range starts empty, we're certainly going to modify it. */
2203 bool modified = dtup->bt_empty_range;
2204
2205 /*
2206 * Compare the key values of the new tuple to the stored index values; our
2207 * deformed tuple will get updated if the new tuple doesn't fit the
2208 * original range (note this means we can't break out of the loop early).
2209 * Make a note of whether this happens, so that we know to insert the
2210 * modified tuple later.
2211 */
2212 for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2213 {
2214 Datum result;
2215 BrinValues *bval;
2216 FmgrInfo *addValue;
2217 bool has_nulls;
2218
2219 bval = &dtup->bt_columns[keyno];
2220
2221 /*
2222 * Does the range have actual NULL values? Either of the flags can be
2223 * set, but we ignore the state before adding the first row.
2224 *
2225 * We have to remember this, because we'll modify the flags and we
2226 * need to know if the range started as empty.
2227 */
2228 has_nulls = ((!dtup->bt_empty_range) &&
2229 (bval->bv_hasnulls || bval->bv_allnulls));
2230
2231 /*
2232 * If the value we're adding is NULL, handle it locally. Otherwise
2233 * call the BRIN_PROCNUM_ADDVALUE procedure.
2234 */
2235 if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno])
2236 {
2237 /*
2238 * If the new value is null, we record that we saw it if it's the
2239 * first one; otherwise, there's nothing to do.
2240 */
2241 if (!bval->bv_hasnulls)
2242 {
2243 bval->bv_hasnulls = true;
2244 modified = true;
2245 }
2246
2247 continue;
2248 }
2249
2250 addValue = index_getprocinfo(idxRel, keyno + 1,
2251 BRIN_PROCNUM_ADDVALUE);
2252 result = FunctionCall4Coll(addValue,
2253 idxRel->rd_indcollation[keyno],
2254 PointerGetDatum(bdesc),
2255 PointerGetDatum(bval),
2256 values[keyno],
2257 nulls[keyno]);
2258 /* if that returned true, we need to insert the updated tuple */
2259 modified |= DatumGetBool(result);
2260
2261 /*
2262 * If the range had actual NULL values (i.e. did not start empty),
2263 * make sure we don't forget about the NULL values. Either the
2264 * allnulls flag is still set to true, or (if the opclass cleared it)
2265 * we need to set hasnulls=true.
2266 *
2267 * XXX This can only happen when the opclass modified the tuple, so
2268 * the modified flag should be set.
2269 */
2270 if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))
2271 {
2272 Assert(modified);
2273 bval->bv_hasnulls = true;
2274 }
2275 }
2276
2277 /*
2278 * After updating summaries for all the keys, mark it as not empty.
2279 *
2280 * If we're actually changing the flag value (i.e. tuple started as
2281 * empty), we should have modified the tuple. So we should not see an
2282 * empty range that was not modified.
2283 */
2284 Assert(!dtup->bt_empty_range || modified);
2285 dtup->bt_empty_range = false;
2286
2287 return modified;
2288}
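
/*
 * Illustrative sketch (not part of brin.c): the contract of an "add value"
 * step for a minmax-style summary -- widen the range if needed and report
 * whether anything changed, which is what add_values_to_range() uses to
 * decide whether the index tuple must be rewritten. Simplified stand-in
 * types, not the real opclass interface.
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct MinMaxSummary
{
	int			min;
	int			max;
} MinMaxSummary;

/* returns true if the summary had to be expanded, analogous to BRIN_PROCNUM_ADDVALUE */
static bool
minmax_add_value(MinMaxSummary *s, int value)
{
	bool		modified = false;

	if (value < s->min)
	{
		s->min = value;
		modified = true;
	}
	if (value > s->max)
	{
		s->max = value;
		modified = true;
	}
	return modified;
}

int
main(void)
{
	MinMaxSummary s = {10, 20};

	printf("add 15 -> modified=%d\n", minmax_add_value(&s, 15));	/* 0 */
	printf("add 42 -> modified=%d\n", minmax_add_value(&s, 42));	/* 1 */
	printf("summary is now [%d, %d]\n", s.min, s.max);
	return 0;
}
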
2289
2290static bool
2291check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
2292{
2293 int keyno;
2294
2295 /*
2296 * First check if there are any IS [NOT] NULL scan keys, and if we're
2297 * violating them.
2298 */
2299 for (keyno = 0; keyno < nnullkeys; keyno++)
2300 {
2301 ScanKey key = nullkeys[keyno];
2302
2303 Assert(key->sk_attno == bval->bv_attno);
2304
2305 /* Handle only IS NULL/IS NOT NULL tests */
2306 if (!(key->sk_flags & SK_ISNULL))
2307 continue;
2308
2309 if (key->sk_flags & SK_SEARCHNULL)
2310 {
2311 /* IS NULL scan key, but range has no NULLs */
2312 if (!bval->bv_allnulls && !bval->bv_hasnulls)
2313 return false;
2314 }
2315 else if (key->sk_flags & SK_SEARCHNOTNULL)
2316 {
2317 /*
2318 * For IS NOT NULL, we can only skip ranges that are known to have
2319 * only nulls.
2320 */
2321 if (bval->bv_allnulls)
2322 return false;
2323 }
2324 else
2325 {
2326 /*
2327 * Neither IS NULL nor IS NOT NULL was used; assume all indexable
2328 * operators are strict and thus return false with a NULL value in
2329 * the scan key.
2330 */
2331 return false;
2332 }
2333 }
2334
2335 return true;
2336}
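
/*
 * Illustrative sketch (not part of brin.c): the decision table implemented by
 * check_null_keys(), written as a tiny standalone function over booleans.
 * "true" means the range cannot be skipped and must be rechecked against the
 * heap.
 */
#include <stdbool.h>
#include <stdio.h>

static bool
range_matches_null_key(bool allnulls, bool hasnulls,
					   bool is_null_key, bool is_not_null_key)
{
	if (is_null_key)
		return allnulls || hasnulls;	/* IS NULL: need at least one NULL */
	if (is_not_null_key)
		return !allnulls;				/* IS NOT NULL: skip all-NULL ranges */
	/* plain NULL comparison: strict operators can never return true */
	return false;
}

int
main(void)
{
	printf("%d\n", range_matches_null_key(false, false, true, false));	/* 0 */
	printf("%d\n", range_matches_null_key(false, true, true, false));	/* 1 */
	printf("%d\n", range_matches_null_key(true, false, false, true));	/* 0 */
	return 0;
}
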
2337
2338/*
2339 * Create parallel context, and launch workers for leader.
2340 *
2341 * buildstate argument should be initialized (with the exception of the
2342 * tuplesort states, which may later be created based on shared
2343 * state initially set up here).
2344 *
2345 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
2346 *
2347 * request is the target number of parallel worker processes to launch.
2348 *
2349 * Sets buildstate's BrinLeader, which caller must use to shut down parallel
2350 * mode by passing it to _brin_end_parallel() at the very end of its index
2351 * build. If not even a single worker process can be launched, this is
2352 * never set, and caller should proceed with a serial index build.
2353 */
2354static void
2355_brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
2356 bool isconcurrent, int request)
2357{
2358 ParallelContext *pcxt;
2359 int scantuplesortstates;
2360 Snapshot snapshot;
2361 Size estbrinshared;
2362 Size estsort;
2363 BrinShared *brinshared;
2364 Sharedsort *sharedsort;
2365 BrinLeader *brinleader = (BrinLeader *) palloc0(sizeof(BrinLeader));
2366 WalUsage *walusage;
2367 BufferUsage *bufferusage;
2368 bool leaderparticipates = true;
2369 int querylen;
2370
2371#ifdef DISABLE_LEADER_PARTICIPATION
2372 leaderparticipates = false;
2373#endif
2374
2375 /*
2376 * Enter parallel mode, and create context for parallel build of brin
2377 * index
2378 */
2379 EnterParallelMode();
2380 Assert(request > 0);
2381 pcxt = CreateParallelContext("postgres", "_brin_parallel_build_main",
2382 request);
2383
2384 scantuplesortstates = leaderparticipates ? request + 1 : request;
2385
2386 /*
2387 * Prepare for scan of the base relation. In a normal index build, we use
2388 * SnapshotAny because we must retrieve all tuples and do our own time
2389 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
2390 * concurrent build, we take a regular MVCC snapshot and index whatever's
2391 * live according to that.
2392 */
2393 if (!isconcurrent)
2394 snapshot = SnapshotAny;
2395 else
2396 snapshot = RegisterSnapshot(GetTransactionSnapshot());
2397
2398 /*
2399 * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace.
2400 */
2401 estbrinshared = _brin_parallel_estimate_shared(heap, snapshot);
2402 shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared);
2403 estsort = tuplesort_estimate_shared(scantuplesortstates);
2404 shm_toc_estimate_chunk(&pcxt->estimator, estsort);
2405
2406 shm_toc_estimate_keys(&pcxt->estimator, 2);
2407
2408 /*
2409 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
2410 * and PARALLEL_KEY_BUFFER_USAGE.
2411 *
2412 * If there are no extensions loaded that care, we could skip this. We
2413 * have no way of knowing whether anyone's looking at pgWalUsage or
2414 * pgBufferUsage, so do it unconditionally.
2415 */
2416 shm_toc_estimate_chunk(&pcxt->estimator,
2417 mul_size(sizeof(WalUsage), pcxt->nworkers));
2418 shm_toc_estimate_keys(&pcxt->estimator, 1);
2419 shm_toc_estimate_chunk(&pcxt->estimator,
2420 mul_size(sizeof(BufferUsage), pcxt->nworkers));
2421 shm_toc_estimate_keys(&pcxt->estimator, 1);
2422
2423 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
2424 if (debug_query_string)
2425 {
2426 querylen = strlen(debug_query_string);
2427 shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
2428 shm_toc_estimate_keys(&pcxt->estimator, 1);
2429 }
2430 else
2431 querylen = 0; /* keep compiler quiet */
2432
2433 /* Everyone's had a chance to ask for space, so now create the DSM */
2434 InitializeParallelDSM(pcxt);
2435
2436 /* If no DSM segment was available, back out (do serial build) */
2437 if (pcxt->seg == NULL)
2438 {
2439 if (IsMVCCSnapshot(snapshot))
2440 UnregisterSnapshot(snapshot);
2441 DestroyParallelContext(pcxt);
2442 ExitParallelMode();
2443 return;
2444 }
2445
2446 /* Store shared build state, for which we reserved space */
2447 brinshared = (BrinShared *) shm_toc_allocate(pcxt->toc, estbrinshared);
2448 /* Initialize immutable state */
2449 brinshared->heaprelid = RelationGetRelid(heap);
2450 brinshared->indexrelid = RelationGetRelid(index);
2451 brinshared->isconcurrent = isconcurrent;
2452 brinshared->scantuplesortstates = scantuplesortstates;
2453 brinshared->pagesPerRange = buildstate->bs_pagesPerRange;
2454 brinshared->queryid = pgstat_get_my_query_id();
2455 ConditionVariableInit(&brinshared->workersdonecv);
2456 SpinLockInit(&brinshared->mutex);
2457
2458 /* Initialize mutable state */
2459 brinshared->nparticipantsdone = 0;
2460 brinshared->reltuples = 0.0;
2461 brinshared->indtuples = 0.0;
2462
2463 table_parallelscan_initialize(heap,
2464 ParallelTableScanFromBrinShared(brinshared),
2465 snapshot);
2466
2467 /*
2468 * Store shared tuplesort-private state, for which we reserved space.
2469 * Then, initialize opaque state using tuplesort routine.
2470 */
2471 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
2472 tuplesort_initialize_shared(sharedsort, scantuplesortstates,
2473 pcxt->seg);
2474
2475 /*
2476 * Insert the shared build state and the shared tuplesort state into the
2477 * DSM TOC, so that worker processes can look them up.
2478 */
2479 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BRIN_SHARED, brinshared);
2480 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
2481
2482 /* Store query string for workers */
2483 if (debug_query_string)
2484 {
2485 char *sharedquery;
2486
2487 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
2488 memcpy(sharedquery, debug_query_string, querylen + 1);
2489 shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
2490 }
2491
2492 /*
2493 * Allocate space for each worker's WalUsage and BufferUsage; no need to
2494 * initialize.
2495 */
2496 walusage = shm_toc_allocate(pcxt->toc,
2497 mul_size(sizeof(WalUsage), pcxt->nworkers));
2498 shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
2499 bufferusage = shm_toc_allocate(pcxt->toc,
2500 mul_size(sizeof(BufferUsage), pcxt->nworkers));
2501 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
2502
2503 /* Launch workers, saving status for leader/caller */
2504 LaunchParallelWorkers(pcxt);
2505 brinleader->pcxt = pcxt;
2506 brinleader->nparticipanttuplesorts = pcxt->nworkers_launched;
2507 if (leaderparticipates)
2508 brinleader->nparticipanttuplesorts++;
2509 brinleader->brinshared = brinshared;
2510 brinleader->sharedsort = sharedsort;
2511 brinleader->snapshot = snapshot;
2512 brinleader->walusage = walusage;
2513 brinleader->bufferusage = bufferusage;
2514
2515 /* If no workers were successfully launched, back out (do serial build) */
2516 if (pcxt->nworkers_launched == 0)
2517 {
2518 _brin_end_parallel(brinleader, NULL);
2519 return;
2520 }
2521
2522 /* Save leader state now that it's clear build will be parallel */
2523 buildstate->bs_leader = brinleader;
2524
2525 /* Join heap scan ourselves */
2526 if (leaderparticipates)
2527 _brin_leader_participate_as_worker(buildstate, heap, index);
2528
2529 /*
2530 * Caller needs to wait for all launched workers when we return. Make
2531 * sure that the failure-to-start case will not hang forever.
2532 */
2533 WaitForParallelWorkersToAttach(pcxt);
2534}
2535
2536/*
2537 * Shut down workers, destroy parallel context, and end parallel mode.
2538 */
2539static void
2540_brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
2541{
2542 int i;
2543
2544 /* Shutdown worker processes */
2545 WaitForParallelWorkersToFinish(brinleader->pcxt);
2546
2547 /*
2548 * Next, accumulate WAL usage. (This must wait for the workers to finish,
2549 * or we might get incomplete data.)
2550 */
2551 for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)
2552 InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);
2553
2554 /* Free last reference to MVCC snapshot, if one was used */
2555 if (IsMVCCSnapshot(brinleader->snapshot))
2556 UnregisterSnapshot(brinleader->snapshot);
2557 DestroyParallelContext(brinleader->pcxt);
2558 ExitParallelMode();
2559}
2560
2561/*
2562 * Within leader, wait for end of heap scan.
2563 *
2564 * When called, parallel heap scan started by _brin_begin_parallel() will
2565 * already be underway within worker processes (when leader participates
2566 * as a worker, we should end up here just as workers are finishing).
2567 *
2568 * Returns the total number of heap tuples scanned.
2569 */
2570static double
2571_brin_parallel_heapscan(BrinBuildState *state)
2572{
2573 BrinShared *brinshared = state->bs_leader->brinshared;
2574 int nparticipanttuplesorts;
2575
2576 nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
2577 for (;;)
2578 {
2579 SpinLockAcquire(&brinshared->mutex);
2580 if (brinshared->nparticipantsdone == nparticipanttuplesorts)
2581 {
2582 /* copy the data into leader state */
2583 state->bs_reltuples = brinshared->reltuples;
2584 state->bs_numtuples = brinshared->indtuples;
2585
2586 SpinLockRelease(&brinshared->mutex);
2587 break;
2588 }
2589 SpinLockRelease(&brinshared->mutex);
2590
2591 ConditionVariableSleep(&brinshared->workersdonecv,
2592 WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
2593 }
2594
2595 ConditionVariableCancelSleep();
2596
2597 return state->bs_reltuples;
2598}
2599
2600/*
2601 * Within leader, wait for end of heap scan and merge per-worker results.
2602 *
2603 * After waiting for all workers to finish, merge the per-worker results into
2604 * the complete index. The results from each worker are sorted by block number
2605 * (start of the page range). While combining the per-worker results we merge
2606 * summaries for the same page range, and also fill-in empty summaries for
2607 * ranges without any tuples.
2608 *
2609 * Returns the total number of heap tuples scanned.
2610 */
2611static double
2612_brin_parallel_merge(BrinBuildState *state)
2613{
2614 BrinTuple *btup;
2615 BrinMemTuple *memtuple = NULL;
2616 Size tuplen;
2617 BlockNumber prevblkno = InvalidBlockNumber;
2618 MemoryContext rangeCxt,
2619 oldCxt;
2620 double reltuples;
2621
2622 /* wait for workers to scan table and produce partial results */
2623 reltuples = _brin_parallel_heapscan(state);
2624
2625 /* do the actual sort in the leader */
2626 tuplesort_performsort(state->bs_sortstate);
2627
2628 /*
2629 * Initialize BrinMemTuple we'll use to union summaries from workers (in
2630 * case they happened to produce parts of the same page range).
2631 */
2632 memtuple = brin_new_memtuple(state->bs_bdesc);
2633
2634 /*
2635 * Create a memory context we'll reset to combine results for a single
2636 * page range (received from the workers). We don't expect a huge number
2637 * of overlaps under regular circumstances, because for large tables the
2638 * chunk size is likely larger than the BRIN page range, but it can
2639 * happen, and the union functions may allocate all kinds of stuff. So we
2640 * reset the context once in a while.
2641 */
2643 "brin union",
2645 oldCxt = MemoryContextSwitchTo(rangeCxt);
2646
2647 /*
2648 * Read the BRIN tuples from the shared tuplesort, sorted by block number.
2649 * That probably gives us an index that is cheaper to scan, thanks to
2650 * mostly getting data from the same index page as before.
2651 */
2652 while ((btup = tuplesort_getbrintuple(state->bs_sortstate, &tuplen, true)) != NULL)
2653 {
2654 /* Ranges should be multiples of pages_per_range for the index. */
2655 Assert(btup->bt_blkno % state->bs_leader->brinshared->pagesPerRange == 0);
2656
2657 /*
2658 * Do we need to union summaries for the same page range?
2659 *
2660 * If this is the first brin tuple we read, then just deform it into
2661 * the memtuple, and continue with the next one from tuplesort. We
2662 * however may need to insert empty summaries into the index.
2663 *
2664 * If it's the same block as the last we saw, we simply union the brin
2665 * tuple into it, and we're done - we don't even need to insert empty
2666 * ranges, because that was done earlier when we saw the first brin
2667 * tuple (for this range).
2668 *
2669 * Finally, if it's not the first brin tuple, and it's not the same
2670 * page range, we need to do the insert and then deform the tuple into
2671 * the memtuple. Then we'll insert empty ranges before the new brin
2672 * tuple, if needed.
2673 */
2674 if (prevblkno == InvalidBlockNumber)
2675 {
2676 /* First brin tuple; just deform it into the memtuple. */
2677 memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2678
2679 /* continue to insert empty pages before thisblock */
2680 }
2681 else if (memtuple->bt_blkno == btup->bt_blkno)
2682 {
2683 /*
2684 * Not the first brin tuple, but same page range as the previous
2685 * one, so we can merge it into the memtuple.
2686 */
2687 union_tuples(state->bs_bdesc, memtuple, btup);
2688 continue;
2689 }
2690 else
2691 {
2692 BrinTuple *tmp;
2693 Size len;
2694
2695 /*
2696 * We got brin tuple for a different page range, so form a brin
2697 * tuple from the memtuple, insert it, and re-init the memtuple
2698 * from the new brin tuple.
2699 */
2700 tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2701 memtuple, &len);
2702
2703 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2704 &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2705
2706 /*
2707 * Reset the per-output-range context. This frees all the memory
2708 * possibly allocated by the union functions, and also the BRIN
2709 * tuple we just formed and inserted.
2710 */
2711 MemoryContextReset(rangeCxt);
2712
2713 memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2714
2715 /* continue to insert empty pages before thisblock */
2716 }
2717
2718 /* Fill empty ranges for all ranges missing in the tuplesort. */
2719 brin_fill_empty_ranges(state, prevblkno, btup->bt_blkno);
2720
2721 prevblkno = btup->bt_blkno;
2722 }
2723
2724 tuplesort_end(state->bs_sortstate);
2725
2726 /* Fill the BRIN tuple for the last page range with data. */
2727 if (prevblkno != InvalidBlockNumber)
2728 {
2729 BrinTuple *tmp;
2730 Size len;
2731
2732 tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2733 memtuple, &len);
2734
2735 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2736 &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2737
2738 pfree(tmp);
2739 }
2740
2741 /* Fill empty ranges at the end, for all ranges missing in the tuplesort. */
2742 brin_fill_empty_ranges(state, prevblkno, state->bs_maxRangeStart);
2743
2744 /*
2745 * Switch back to the original memory context, and destroy the one we
2746 * created to isolate the union_tuple calls.
2747 */
2748 MemoryContextSwitchTo(oldCxt);
2749 MemoryContextDelete(rangeCxt);
2750
2751 return reltuples;
2752}
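
/*
 * Illustrative sketch (not part of brin.c): the shape of the merge loop in
 * _brin_parallel_merge() -- consume per-worker summaries sorted by block
 * number, union entries for the same range, and flush a combined entry
 * whenever the block number changes. A plain array stands in for the
 * tuplesort, and an int accumulator stands in for the BRIN summary.
 */
#include <stdio.h>

typedef struct WorkerSummary
{
	unsigned	blkno;			/* start of the page range */
	int			value;			/* stand-in for the summarized data */
} WorkerSummary;

int
main(void)
{
	/* sorted by blkno; ranges 0 and 256 were produced by two workers each */
	WorkerSummary input[] = {
		{0, 7}, {0, 9}, {128, 3}, {256, 5}, {256, 1},
	};
	int			n = sizeof(input) / sizeof(input[0]);
	unsigned	curblk = (unsigned) -1;	/* InvalidBlockNumber stand-in */
	int			acc = 0;

	for (int i = 0; i < n; i++)
	{
		if (input[i].blkno != curblk)
		{
			if (curblk != (unsigned) -1)
				printf("flush range %u -> %d\n", curblk, acc);	/* "insert" */
			curblk = input[i].blkno;
			acc = input[i].value;		/* start a new combined summary */
		}
		else
			acc += input[i].value;		/* "union" with the previous summary */
	}
	if (curblk != (unsigned) -1)
		printf("flush range %u -> %d\n", curblk, acc);	/* last range */
	return 0;
}
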
2753
2754/*
2755 * Returns size of shared memory required to store state for a parallel
2756 * brin index build based on the snapshot its parallel scan will use.
2757 */
2758static Size
2759_brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
2760{
2761 /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
2762 return add_size(BUFFERALIGN(sizeof(BrinShared)),
2763 table_parallelscan_estimate(heap, snapshot));
2764}
2765
2766/*
2767 * Within leader, participate as a parallel worker.
2768 */
2769static void
2770_brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)
2771{
2772 BrinLeader *brinleader = buildstate->bs_leader;
2773 int sortmem;
2774
2775 /*
2776 * Might as well use a reliable figure when doling out maintenance_work_mem
2777 * (when the requested number of workers was not launched, this will be
2778 * somewhat higher than it is for other workers).
2779 */
2780 sortmem = maintenance_work_mem / brinleader->nparticipanttuplesorts;
2781
2782 /* Perform work common to all participants */
2783 _brin_parallel_scan_and_build(buildstate, brinleader->brinshared,
2784 brinleader->sharedsort, heap, index, sortmem, true);
2785}
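
/*
 * Illustrative sketch (not part of brin.c): how maintenance_work_mem is
 * divided among the participants of a parallel BRIN build (launched workers
 * plus the leader, when it participates). The numbers are made up for the
 * example.
 */
#include <stdio.h>

int
main(void)
{
	int			maintenance_work_mem_kb = 65536;	/* 64MB, expressed in kB */
	int			nworkers_launched = 2;
	int			leader_participates = 1;

	int			nparticipants = nworkers_launched + leader_participates;
	int			sortmem_kb = maintenance_work_mem_kb / nparticipants;

	printf("each participant sorts with %d kB of memory\n", sortmem_kb);
	return 0;
}
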
2786
2787/*
2788 * Perform a worker's portion of a parallel sort.
2789 *
2790 * This generates a tuplesort for the worker portion of the table.
2791 *
2792 * sortmem is the amount of working memory to use within each worker,
2793 * expressed in KBs.
2794 *
2795 * When this returns, workers are done, and need only release resources.
2796 */
2797static void
2798_brin_parallel_scan_and_build(BrinBuildState *state,
2799 BrinShared *brinshared, Sharedsort *sharedsort,
2800 Relation heap, Relation index,
2801 int sortmem, bool progress)
2802{
2803 SortCoordinate coordinate;
2804 TableScanDesc scan;
2805 double reltuples;
2806 IndexInfo *indexInfo;
2807
2808 /* Initialize local tuplesort coordination state */
2809 coordinate = palloc0(sizeof(SortCoordinateData));
2810 coordinate->isWorker = true;
2811 coordinate->nParticipants = -1;
2812 coordinate->sharedsort = sharedsort;
2813
2814 /* Begin "partial" tuplesort */
2815 state->bs_sortstate = tuplesort_begin_index_brin(sortmem, coordinate,
2816 TUPLESORT_NONE);
2817
2818 /* Join parallel scan */
2819 indexInfo = BuildIndexInfo(index);
2820 indexInfo->ii_Concurrent = brinshared->isconcurrent;
2821
2822 scan = table_beginscan_parallel(heap,
2823 ParallelTableScanFromBrinShared(brinshared));
2824
2825 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
2826 brinbuildCallbackParallel, state, scan);
2827
2828 /* insert the last item */
2829 form_and_spill_tuple(state);
2830
2831 /* sort the BRIN ranges built by this worker */
2832 tuplesort_performsort(state->bs_sortstate);
2833
2834 state->bs_reltuples += reltuples;
2835
2836 /*
2837 * Done. Record ambuild statistics.
2838 */
2839 SpinLockAcquire(&brinshared->mutex);
2840 brinshared->nparticipantsdone++;
2841 brinshared->reltuples += state->bs_reltuples;
2842 brinshared->indtuples += state->bs_numtuples;
2843 SpinLockRelease(&brinshared->mutex);
2844
2845 /* Notify leader */
2846 ConditionVariableSignal(&brinshared->workersdonecv);
2847
2848 tuplesort_end(state->bs_sortstate);
2849}
2850
2851/*
2852 * Perform work within a launched parallel process.
2853 */
2854void
2855_brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
2856{
2857 char *sharedquery;
2858 BrinShared *brinshared;
2859 Sharedsort *sharedsort;
2860 BrinBuildState *buildstate;
2861 Relation heapRel;
2862 Relation indexRel;
2863 LOCKMODE heapLockmode;
2864 LOCKMODE indexLockmode;
2865 WalUsage *walusage;
2866 BufferUsage *bufferusage;
2867 int sortmem;
2868
2869 /*
2870 * The only possible status flag that can be set for the parallel worker is
2871 * PROC_IN_SAFE_IC.
2872 */
2873 Assert((MyProc->statusFlags == 0) ||
2874 (MyProc->statusFlags == PROC_IN_SAFE_IC));
2875
2876 /* Set debug_query_string for individual workers first */
2877 sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
2878 debug_query_string = sharedquery;
2879
2880 /* Report the query string from leader */
2881 pgstat_report_activity(STATE_RUNNING, debug_query_string);
2882
2883 /* Look up brin shared state */
2884 brinshared = shm_toc_lookup(toc, PARALLEL_KEY_BRIN_SHARED, false);
2885
2886 /* Open relations using lock modes known to be obtained by index.c */
2887 if (!brinshared->isconcurrent)
2888 {
2889 heapLockmode = ShareLock;
2890 indexLockmode = AccessExclusiveLock;
2891 }
2892 else
2893 {
2894 heapLockmode = ShareUpdateExclusiveLock;
2895 indexLockmode = RowExclusiveLock;
2896 }
2897
2898 /* Track query ID */
2899 pgstat_report_query_id(brinshared->queryid, false);
2900
2901 /* Open relations within worker */
2902 heapRel = table_open(brinshared->heaprelid, heapLockmode);
2903 indexRel = index_open(brinshared->indexrelid, indexLockmode);
2904
2905 buildstate = initialize_brin_buildstate(indexRel, NULL,
2906 brinshared->pagesPerRange,
2907 InvalidBlockNumber);
2908
2909 /* Look up shared state private to tuplesort.c */
2910 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
2911 tuplesort_attach_shared(sharedsort, seg);
2912
2913 /* Prepare to track buffer usage during parallel execution */
2914 InstrStartParallelQuery();
2915
2916 /*
2917 * Might as well use a reliable figure when doling out maintenance_work_mem
2918 * (when the requested number of workers was not launched, this will be
2919 * somewhat higher than it is for other workers).
2920 */
2921 sortmem = maintenance_work_mem / brinshared->scantuplesortstates;
2922
2923 _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort,
2924 heapRel, indexRel, sortmem, false);
2925
2926 /* Report WAL/buffer usage during parallel execution */
2927 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
2928 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
2929 InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
2930 &walusage[ParallelWorkerNumber]);
2931
2932 index_close(indexRel, indexLockmode);
2933 table_close(heapRel, heapLockmode);
2934}
2935
2936/*
2937 * brin_build_empty_tuple
2938 * Maybe initialize a BRIN tuple representing empty range.
2939 *
2940 * Sets state->bs_emptyTuple to a BRIN tuple representing an empty page range
2941 * starting at the specified block number. The empty tuple is initialized only
2942 * once, when it's needed for the first time, stored in the memory context
2943 * bs_context to ensure a proper lifespan, and reused on subsequent calls. All
2944 * empty tuples are exactly the same except for the bt_blkno field, which is
2945 * set to the value of the blkno parameter.
2946 */
2947static void
2948brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
2949{
2950 /* First time an empty tuple is requested? If yes, initialize it. */
2951 if (state->bs_emptyTuple == NULL)
2952 {
2953 MemoryContext oldcxt;
2954 BrinMemTuple *dtuple = brin_new_memtuple(state->bs_bdesc);
2955
2956 /* Allocate the tuple in context for the whole index build. */
2957 oldcxt = MemoryContextSwitchTo(state->bs_context);
2958
2959 state->bs_emptyTuple = brin_form_tuple(state->bs_bdesc, blkno, dtuple,
2960 &state->bs_emptyTupleLen);
2961
2962 MemoryContextSwitchTo(oldcxt);
2963 }
2964 else
2965 {
2966 /* If we already have an empty tuple, just update the block. */
2967 state->bs_emptyTuple->bt_blkno = blkno;
2968 }
2969}
2970
2971/*
2972 * brin_fill_empty_ranges
2973 * Add BRIN index tuples representing empty page ranges.
2974 *
2975 * prevRange/nextRange determine for which page ranges to add empty summaries.
2976 * Both boundaries are exclusive, i.e. only ranges starting at blkno for which
2977 * (prevRange < blkno < nextRange) will be added to the index.
2978 *
2979 * If prevRange is InvalidBlockNumber, this means there was no previous page
2980 * range (i.e. the first empty range to add is for blkno=0).
2981 *
2982 * The empty tuple is built only once, and then reused for all future calls.
2983 */
2984static void
2985brin_fill_empty_ranges(BrinBuildState *state,
2986 BlockNumber prevRange, BlockNumber nextRange)
2987{
2988 BlockNumber blkno;
2989
2990 /*
2991 * If we already summarized some ranges, we need to start with the next
2992 * one. Otherwise start from the first range of the table.
2993 */
2994 blkno = (prevRange == InvalidBlockNumber) ? 0 : (prevRange + state->bs_pagesPerRange);
2995
2996 /* Generate empty ranges until we hit the next non-empty range. */
2997 while (blkno < nextRange)
2998 {
2999 /* Did we already build the empty tuple? If not, do it now. */
3000 brin_build_empty_tuple(state, blkno);
3001
3002 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
3003 &state->bs_currentInsertBuf,
3004 blkno, state->bs_emptyTuple, state->bs_emptyTupleLen);
3005
3006 /* try next page range */
3007 blkno += state->bs_pagesPerRange;
3008 }
3008 }
3009}
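
/*
 * Illustrative sketch (not part of brin.c): which ranges
 * brin_fill_empty_ranges() fills for given (exclusive) boundaries, assuming
 * pages_per_range = 128. Prints the block numbers that would receive an
 * empty summary tuple.
 */
#include <stdio.h>

#define INVALID_BLOCK ((unsigned) 0xFFFFFFFF)	/* InvalidBlockNumber stand-in */

static void
fill_empty_ranges(unsigned prevRange, unsigned nextRange, unsigned pagesPerRange)
{
	/* start after prevRange, or at block 0 if there was no previous range */
	unsigned	blkno = (prevRange == INVALID_BLOCK) ? 0 : prevRange + pagesPerRange;

	for (; blkno < nextRange; blkno += pagesPerRange)
		printf("  empty summary for range starting at %u\n", blkno);
}

int
main(void)
{
	printf("prev=Invalid, next=384:\n");
	fill_empty_ranges(INVALID_BLOCK, 384, 128);		/* fills 0, 128, 256 */

	printf("prev=128, next=512:\n");
	fill_empty_ranges(128, 512, 128);				/* fills 256, 384 */
	return 0;
}
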
@ ACLCHECK_NOT_OWNER
Definition: acl.h:185
void aclcheck_error(AclResult aclerr, ObjectType objtype, const char *objectname)
Definition: aclchk.c:2622
bool object_ownercheck(Oid classid, Oid objectid, Oid roleid)
Definition: aclchk.c:4058
int16 AttrNumber
Definition: attnum.h:21
bool AutoVacuumRequestWork(AutoVacuumWorkItemType type, Oid relationId, BlockNumber blkno)
Definition: autovacuum.c:3219
@ AVW_BRINSummarizeRange
Definition: autovacuum.h:25
int ParallelWorkerNumber
Definition: parallel.c:114
void InitializeParallelDSM(ParallelContext *pcxt)
Definition: parallel.c:207
void WaitForParallelWorkersToFinish(ParallelContext *pcxt)
Definition: parallel.c:792
void LaunchParallelWorkers(ParallelContext *pcxt)
Definition: parallel.c:569
void DestroyParallelContext(ParallelContext *pcxt)
Definition: parallel.c:946
ParallelContext * CreateParallelContext(const char *library_name, const char *function_name, int nworkers)
Definition: parallel.c:169
void WaitForParallelWorkersToAttach(ParallelContext *pcxt)
Definition: parallel.c:689
uint64 pgstat_get_my_query_id(void)
void pgstat_report_query_id(uint64 query_id, bool force)
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define MaxBlockNumber
Definition: block.h:35
static Datum values[MAXATTR]
Definition: bootstrap.c:151
#define PARALLEL_KEY_BUFFER_USAGE
Definition: brin.c:51
void brininsertcleanup(Relation index, IndexInfo *indexInfo)
Definition: brin.c:507
static double _brin_parallel_merge(BrinBuildState *state)
Definition: brin.c:2612
static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
Definition: brin.c:2164
Datum brin_desummarize_range(PG_FUNCTION_ARGS)
Definition: brin.c:1483
void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition: brin.c:950
static void terminate_brin_buildstate(BrinBuildState *state)
Definition: brin.c:1708
#define PARALLEL_KEY_BRIN_SHARED
Definition: brin.c:47
Datum brin_summarize_range(PG_FUNCTION_ARGS)
Definition: brin.c:1372
IndexBulkDeleteResult * brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition: brin.c:1294
static void form_and_spill_tuple(BrinBuildState *state)
Definition: brin.c:1998
#define BRIN_ALL_BLOCKRANGES
Definition: brin.c:209
struct BrinShared BrinShared
Datum brin_summarize_new_values(PG_FUNCTION_ARGS)
Definition: brin.c:1357
IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys)
Definition: brin.c:532
bytea * brinoptions(Datum reloptions, bool validate)
Definition: brin.c:1339
int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition: brin.c:560
static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, bool include_partial, double *numSummarized, double *numExisting)
Definition: brin.c:1879
static void form_and_insert_tuple(BrinBuildState *state)
Definition: brin.c:1977
void brinbuildempty(Relation index)
Definition: brin.c:1265
void brin_free_desc(BrinDesc *bdesc)
Definition: brin.c:1628
struct BrinInsertState BrinInsertState
static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
Definition: brin.c:2023
static void _brin_parallel_scan_and_build(BrinBuildState *state, BrinShared *brinshared, Sharedsort *sharedsort, Relation heap, Relation index, int sortmem, bool progress)
Definition: brin.c:2798
static BrinBuildState * initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap, BlockNumber pagesPerRange, BlockNumber tablePages)
Definition: brin.c:1661
static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request)
Definition: brin.c:2355
void brinGetStats(Relation index, BrinStatsData *stats)
Definition: brin.c:1640
static void _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)
Definition: brin.c:2770
static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup, const Datum *values, const bool *nulls)
Definition: brin.c:2197
static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
Definition: brin.c:2540
static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
Definition: brin.c:2759
struct BrinBuildState BrinBuildState
static void brin_fill_empty_ranges(BrinBuildState *state, BlockNumber prevRange, BlockNumber nextRange)
Definition: brin.c:2985
IndexBuildResult * brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: brin.c:1096
IndexBulkDeleteResult * brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition: brin.c:1309
struct BrinLeader BrinLeader
struct BrinOpaque BrinOpaque
static void summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, BlockNumber heapBlk, BlockNumber heapNumBlks)
Definition: brin.c:1753
#define ParallelTableScanFromBrinShared(shared)
Definition: brin.c:116
#define PARALLEL_KEY_TUPLESORT
Definition: brin.c:48
static void brinbuildCallbackParallel(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *brstate)
Definition: brin.c:1037
bool brininsert(Relation idxRel, Datum *values, bool *nulls, ItemPointer heaptid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition: brin.c:339
#define PARALLEL_KEY_QUERY_TEXT
Definition: brin.c:49
Datum brinhandler(PG_FUNCTION_ARGS)
Definition: brin.c:250
BrinDesc * brin_build_desc(Relation rel)
Definition: brin.c:1573
void _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Definition: brin.c:2855
static void brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
Definition: brin.c:2948
#define PARALLEL_KEY_WAL_USAGE
Definition: brin.c:50
static double _brin_parallel_heapscan(BrinBuildState *state)
Definition: brin.c:2571
static BrinInsertState * initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
Definition: brin.c:310
static void brinbuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *brstate)
Definition: brin.c:986
void brinendscan(IndexScanDesc scan)
Definition: brin.c:969
static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
Definition: brin.c:2291
#define BrinGetPagesPerRange(relation)
Definition: brin.h:40
#define BrinGetAutoSummarize(relation)
Definition: brin.h:46
#define BRIN_LAST_OPTIONAL_PROCNUM
Definition: brin_internal.h:78
#define BRIN_PROCNUM_UNION
Definition: brin_internal.h:73
#define BRIN_PROCNUM_OPTIONS
Definition: brin_internal.h:75
#define BRIN_PROCNUM_OPCINFO
Definition: brin_internal.h:70
#define BRIN_PROCNUM_CONSISTENT
Definition: brin_internal.h:72
#define BRIN_elog(args)
Definition: brin_internal.h:85
#define BRIN_PROCNUM_ADDVALUE
Definition: brin_internal.h:71
#define BRIN_CURRENT_VERSION
Definition: brin_page.h:72
#define BRIN_METAPAGE_BLKNO
Definition: brin_page.h:75
bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, BlockNumber heapBlk, Buffer oldbuf, OffsetNumber oldoff, const BrinTuple *origtup, Size origsz, const BrinTuple *newtup, Size newsz, bool samepage)
Definition: brin_pageops.c:53
void brin_page_cleanup(Relation idxrel, Buffer buf)
Definition: brin_pageops.c:624
OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, BrinTuple *tup, Size itemsz)
Definition: brin_pageops.c:342
void brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
Definition: brin_pageops.c:486
bool brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
Definition: brin_pageops.c:323
bool brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk)
Definition: brin_revmap.c:323
void brinRevmapTerminate(BrinRevmap *revmap)
Definition: brin_revmap.c:100
BrinRevmap * brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange)
Definition: brin_revmap.c:70
BrinTuple * brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, Buffer *buf, OffsetNumber *off, Size *size, int mode)
Definition: brin_revmap.c:194
BrinTuple * brin_copy_tuple(BrinTuple *tuple, Size len, BrinTuple *dest, Size *destsz)
Definition: brin_tuple.c:446
BrinTuple * brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, Size *size)
Definition: brin_tuple.c:99
BrinMemTuple * brin_new_memtuple(BrinDesc *brdesc)
Definition: brin_tuple.c:482
void brin_free_tuple(BrinTuple *tuple)
Definition: brin_tuple.c:433
BrinTuple * brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size)
Definition: brin_tuple.c:388
BrinMemTuple * brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
Definition: brin_tuple.c:511
BrinMemTuple * brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple)
Definition: brin_tuple.c:553
bool brinvalidate(Oid opclassoid)
Definition: brin_validate.c:37
#define SizeOfBrinCreateIdx
Definition: brin_xlog.h:55
#define XLOG_BRIN_CREATE_INDEX
Definition: brin_xlog.h:31
int Buffer
Definition: buf.h:23
#define BufferIsInvalid(buffer)
Definition: buf.h:31
#define InvalidBuffer
Definition: buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3724
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:846
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4924
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4941
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2532
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5158
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:793
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:746
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:189
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:190
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:273
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:400
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:74
@ EB_LOCK_FIRST
Definition: bufmgr.h:86
@ RBM_NORMAL
Definition: bufmgr.h:45
#define BMR_REL(p_rel)
Definition: bufmgr.h:107
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:351
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:896
Pointer Page
Definition: bufpage.h:81
static char * PageGetContents(Page page)
Definition: bufpage.h:257
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:243
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
#define Min(x, y)
Definition: c.h:961
#define MAXALIGN(LEN)
Definition: c.h:768
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:204
#define BUFFERALIGN(LEN)
Definition: c.h:770
#define Assert(condition)
Definition: c.h:815
int64_t int64
Definition: c.h:485
int32_t int32
Definition: c.h:484
uint64_t uint64
Definition: c.h:489
#define lengthof(array)
Definition: c.h:745
#define OidIsValid(objectId)
Definition: c.h:732
size_t Size
Definition: c.h:562
bool ConditionVariableCancelSleep(void)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void ConditionVariableSignal(ConditionVariable *cv)
Datum datumCopy(Datum value, bool typByVal, int typLen)
Definition: datum.c:132
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define LOG
Definition: elog.h:31
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
#define palloc_object(type)
Definition: fe_memutils.h:74
#define palloc_array(type, count)
Definition: fe_memutils.h:76
#define palloc0_array(type, count)
Definition: fe_memutils.h:77
#define palloc0_object(type)
Definition: fe_memutils.h:75
Datum FunctionCall4Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3, Datum arg4)
Definition: fmgr.c:1196
Datum Int64GetDatum(int64 X)
Definition: fmgr.c:1807
Datum FunctionCall3Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3)
Definition: fmgr.c:1171
void fmgr_info_copy(FmgrInfo *dstinfo, FmgrInfo *srcinfo, MemoryContext destcxt)
Definition: fmgr.c:580
#define PG_RETURN_VOID()
Definition: fmgr.h:349
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define DirectFunctionCall2(func, arg1, arg2)
Definition: fmgr.h:643
#define PG_GETARG_DATUM(n)
Definition: fmgr.h:268
#define PG_GETARG_INT64(n)
Definition: fmgr.h:283
#define FunctionCall1(flinfo, arg1)
Definition: fmgr.h:659
#define PG_RETURN_INT32(x)
Definition: fmgr.h:354
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
void FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
Definition: freespace.c:377
void FreeSpaceMapVacuum(Relation rel)
Definition: freespace.c:358
void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
Definition: freespace.c:194
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:80
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition: genam.h:89
IndexUniqueCheck
Definition: genam.h:118
int maintenance_work_mem
Definition: globals.c:132
int NewGUCNestLevel(void)
Definition: guc.c:2235
void RestrictSearchPath(void)
Definition: guc.c:2246
void AtEOXact_GUC(bool isCommit, int nestLevel)
Definition: guc.c:2262
Oid IndexGetRelation(Oid indexId, bool missing_ok)
Definition: index.c:3554
IndexInfo * BuildIndexInfo(Relation index)
Definition: index.c:2427
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:862
void index_close(Relation relation, LOCKMODE lockmode)
Definition: indexam.c:177
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition: indexam.c:133
void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:218
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:208
void InstrStartParallelQuery(void)
Definition: instrument.c:200
int b
Definition: isn.c:69
int a
Definition: isn.c:68
int i
Definition: isn.c:72
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:76
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
int LOCKMODE
Definition: lockdefs.h:26
#define AccessExclusiveLock
Definition: lockdefs.h:43
#define AccessShareLock
Definition: lockdefs.h:36
#define ShareUpdateExclusiveLock
Definition: lockdefs.h:39
#define ShareLock
Definition: lockdefs.h:40
#define RowExclusiveLock
Definition: lockdefs.h:38
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:383
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
void * palloc(Size size)
Definition: mcxt.c:1317
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
#define ALLOCSET_SMALL_SIZES
Definition: memutils.h:170
#define SECURITY_RESTRICTED_OPERATION
Definition: miscadmin.h:318
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
void GetUserIdAndSecContext(Oid *userid, int *sec_context)
Definition: miscinit.c:660
Oid GetUserId(void)
Definition: miscinit.c:517
void SetUserIdAndSecContext(Oid userid, int sec_context)
Definition: miscinit.c:667
#define makeNode(_type_)
Definition: nodes.h:155
uint16 OffsetNumber
Definition: off.h:24
#define FirstOffsetNumber
Definition: off.h:27
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
@ OBJECT_INDEX
Definition: parsenodes.h:2332
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:200
const void size_t len
static char * buf
Definition: pg_test_fsync.c:72
static int progress
Definition: pgbench.c:261
#define ERRCODE_UNDEFINED_TABLE
Definition: pgbench.c:78
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:674
const char * debug_query_string
Definition: postgres.c:87
static bool DatumGetBool(Datum X)
Definition: postgres.h:95
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:327
uintptr_t Datum
Definition: postgres.h:69
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:217
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
#define PROC_IN_SAFE_IC
Definition: proc.h:59
static void addrange(struct cvec *cv, chr from, chr to)
Definition: regc_cvec.c:90
#define RelationGetRelid(relation)
Definition: rel.h:505
#define RelationGetDescr(relation)
Definition: rel.h:531
#define RelationGetRelationName(relation)
Definition: rel.h:539
#define RelationNeedsWAL(relation)
Definition: rel.h:628
void * build_reloptions(Datum reloptions, bool validate, relopt_kind kind, Size relopt_struct_size, const relopt_parse_elt *relopt_elems, int num_relopt_elems)
Definition: reloptions.c:1908
@ RELOPT_KIND_BRIN
Definition: reloptions.h:52
@ RELOPT_TYPE_INT
Definition: reloptions.h:32
@ RELOPT_TYPE_BOOL
Definition: reloptions.h:31
@ MAIN_FORKNUM
Definition: relpath.h:58
@ INIT_FORKNUM
Definition: relpath.h:61
void brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition: selfuncs.c:8007
void * shm_toc_allocate(shm_toc *toc, Size nbytes)
Definition: shm_toc.c:88
void shm_toc_insert(shm_toc *toc, uint64 key, void *address)
Definition: shm_toc.c:171
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition: shm_toc.c:232
#define shm_toc_estimate_chunk(e, sz)
Definition: shm_toc.h:51
#define shm_toc_estimate_keys(e, cnt)
Definition: shm_toc.h:53
Size add_size(Size s1, Size s2)
Definition: shmem.c:488
Size mul_size(Size s1, Size s2)
Definition: shmem.c:505
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_SEARCHNULL
Definition: skey.h:121
#define SK_ISNULL
Definition: skey.h:115
static pg_noinline void Size size
Definition: slab.c:607
Snapshot GetTransactionSnapshot(void)
Definition: snapmgr.c:212
void UnregisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:794
Snapshot RegisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:752
#define SnapshotAny
Definition: snapmgr.h:33
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:55
#define SpinLockInit(lock)
Definition: spin.h:57
#define SpinLockRelease(lock)
Definition: spin.h:61
#define SpinLockAcquire(lock)
Definition: spin.h:59
void relation_close(Relation relation, LOCKMODE lockmode)
Definition: relation.c:205
PGPROC * MyProc
Definition: proc.c:66
BlockNumber bs_maxRangeStart
Definition: brin.c:163
Size bs_emptyTupleLen
Definition: brin.c:169
MemoryContext bs_context
Definition: brin.c:170
BrinMemTuple * bs_dtuple
Definition: brin.c:166
Relation bs_irel
Definition: brin.c:157
BlockNumber bs_pagesPerRange
Definition: brin.c:161
double bs_numtuples
Definition: brin.c:158
Buffer bs_currentInsertBuf
Definition: brin.c:160
BrinRevmap * bs_rmAccess
Definition: brin.c:164
Tuplesortstate * bs_sortstate
Definition: brin.c:185
BrinLeader * bs_leader
Definition: brin.c:177
int bs_worker_id
Definition: brin.c:178
BlockNumber bs_currRangeStart
Definition: brin.c:162
double bs_reltuples
Definition: brin.c:159
BrinDesc * bs_bdesc
Definition: brin.c:165
BrinTuple * bs_emptyTuple
Definition: brin.c:168
int bd_totalstored
Definition: brin_internal.h:59
TupleDesc bd_tupdesc
Definition: brin_internal.h:53
BrinOpcInfo * bd_info[FLEXIBLE_ARRAY_MEMBER]
Definition: brin_internal.h:62
Relation bd_index
Definition: brin_internal.h:50
MemoryContext bd_context
Definition: brin_internal.h:47
TupleDesc bd_disktdesc
Definition: brin_internal.h:56
BrinDesc * bis_desc
Definition: brin.c:195
BrinRevmap * bis_rmAccess
Definition: brin.c:194
BlockNumber bis_pages_per_range
Definition: brin.c:196
int nparticipanttuplesorts
Definition: brin.c:133
WalUsage * walusage
Definition: brin.c:147
BrinShared * brinshared
Definition: brin.c:144
BufferUsage * bufferusage
Definition: brin.c:148
Snapshot snapshot
Definition: brin.c:146
Sharedsort * sharedsort
Definition: brin.c:145
ParallelContext * pcxt
Definition: brin.c:125
BrinValues bt_columns[FLEXIBLE_ARRAY_MEMBER]
Definition: brin_tuple.h:55
BlockNumber bt_blkno
Definition: brin_tuple.h:48
bool bt_placeholder
Definition: brin_tuple.h:46
bool bt_empty_range
Definition: brin_tuple.h:47
BlockNumber lastRevmapPage
Definition: brin_page.h:69
BlockNumber pagesPerRange
Definition: brin_page.h:68
BlockNumber bo_pagesPerRange
Definition: brin.c:204
BrinDesc * bo_bdesc
Definition: brin.c:206
BrinRevmap * bo_rmAccess
Definition: brin.c:205
TypeCacheEntry * oi_typcache[FLEXIBLE_ARRAY_MEMBER]
Definition: brin_internal.h:37
uint16 oi_nstored
Definition: brin_internal.h:28
bool oi_regular_nulls
Definition: brin_internal.h:31
slock_t mutex
Definition: brin.c:87
int scantuplesortstates
Definition: brin.c:68
int nparticipantsdone
Definition: brin.c:99
Oid heaprelid
Definition: brin.c:64
BlockNumber pagesPerRange
Definition: brin.c:67
uint64 queryid
Definition: brin.c:71
ConditionVariable workersdonecv
Definition: brin.c:79
Oid indexrelid
Definition: brin.c:65
bool isconcurrent
Definition: brin.c:66
double indtuples
Definition: brin.c:101
double reltuples
Definition: brin.c:100
BlockNumber revmapNumPages
Definition: brin.h:35
BlockNumber pagesPerRange
Definition: brin.h:34
BlockNumber bt_blkno
Definition: brin_tuple.h:66
bool bv_hasnulls
Definition: brin_tuple.h:32
Datum * bv_values
Definition: brin_tuple.h:34
AttrNumber bv_attno
Definition: brin_tuple.h:31
bool bv_allnulls
Definition: brin_tuple.h:33
Definition: fmgr.h:57
ambuildphasename_function ambuildphasename
Definition: amapi.h:289
ambuildempty_function ambuildempty
Definition: amapi.h:279
amvacuumcleanup_function amvacuumcleanup
Definition: amapi.h:283
bool amclusterable
Definition: amapi.h:253
amoptions_function amoptions
Definition: amapi.h:287
amestimateparallelscan_function amestimateparallelscan
Definition: amapi.h:301
amrestrpos_function amrestrpos
Definition: amapi.h:298
aminsert_function aminsert
Definition: amapi.h:280
amendscan_function amendscan
Definition: amapi.h:296
uint16 amoptsprocnum
Definition: amapi.h:233
amparallelrescan_function amparallelrescan
Definition: amapi.h:303
Oid amkeytype
Definition: amapi.h:269
bool ampredlocks
Definition: amapi.h:255
uint16 amsupport
Definition: amapi.h:231
amcostestimate_function amcostestimate
Definition: amapi.h:285
bool amcanorderbyop
Definition: amapi.h:237
amadjustmembers_function amadjustmembers
Definition: amapi.h:291
ambuild_function ambuild
Definition: amapi.h:278
bool amstorage
Definition: amapi.h:251
uint16 amstrategies
Definition: amapi.h:229
bool amoptionalkey
Definition: amapi.h:245
amgettuple_function amgettuple
Definition: amapi.h:294
amcanreturn_function amcanreturn
Definition: amapi.h:284
bool amcanunique
Definition: amapi.h:241
amgetbitmap_function amgetbitmap
Definition: amapi.h:295
amproperty_function amproperty
Definition: amapi.h:288
ambulkdelete_function ambulkdelete
Definition: amapi.h:282
bool amsearcharray
Definition: amapi.h:247
bool amsummarizing
Definition: amapi.h:265
amvalidate_function amvalidate
Definition: amapi.h:290
ammarkpos_function ammarkpos
Definition: amapi.h:297
bool amcanmulticol
Definition: amapi.h:243
bool amusemaintenanceworkmem
Definition: amapi.h:263
ambeginscan_function ambeginscan
Definition: amapi.h:292
bool amcanparallel
Definition: amapi.h:257
amrescan_function amrescan
Definition: amapi.h:293
bool amcanorder
Definition: amapi.h:235
bool amcanbuildparallel
Definition: amapi.h:259
aminitparallelscan_function aminitparallelscan
Definition: amapi.h:302
uint8 amparallelvacuumoptions
Definition: amapi.h:267
aminsertcleanup_function aminsertcleanup
Definition: amapi.h:281
bool amcanbackward
Definition: amapi.h:239
amgettreeheight_function amgettreeheight
Definition: amapi.h:286
bool amcaninclude
Definition: amapi.h:261
bool amsearchnulls
Definition: amapi.h:249
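The am* fields above make up the IndexAmRoutine that an access method's handler function returns. A hedged sketch of how such a handler fills a few representative fields (myam_handler, myambuild, myaminsert and myamgetbitmap are hypothetical names; a real AM sets every field declared in amapi.h):

Datum
myam_handler(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    amroutine->amstrategies = 0;        /* no fixed set of strategy numbers */
    amroutine->amsupport = 1;           /* number of support procs (illustrative) */
    amroutine->amcanorder = false;
    amroutine->amcanmulticol = true;
    amroutine->amgettuple = NULL;       /* bitmap scans only */
    amroutine->amsummarizing = true;    /* summarizing AM, like BRIN */
    amroutine->amkeytype = InvalidOid;

    /* hypothetical implementations of the required callbacks */
    amroutine->ambuild = myambuild;
    amroutine->aminsert = myaminsert;
    amroutine->amgetbitmap = myamgetbitmap;

    PG_RETURN_POINTER(amroutine);
}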
double heap_tuples
Definition: genam.h:34
double index_tuples
Definition: genam.h:35
BlockNumber num_pages
Definition: genam.h:79
double num_index_tuples
Definition: genam.h:81
void * ii_AmCache
Definition: execnodes.h:219
int ii_ParallelWorkers
Definition: execnodes.h:217
bool ii_Concurrent
Definition: execnodes.h:213
MemoryContext ii_Context
Definition: execnodes.h:220
struct ScanKeyData * keyData
Definition: relscan.h:139
Relation indexRelation
Definition: relscan.h:135
Relation index
Definition: genam.h:48
bool analyze_only
Definition: genam.h:50
BufferAccessStrategy strategy
Definition: genam.h:55
uint8 statusFlags
Definition: proc.h:242
dsm_segment * seg
Definition: parallel.h:42
shm_toc_estimator estimator
Definition: parallel.h:41
shm_toc * toc
Definition: parallel.h:44
int nworkers_launched
Definition: parallel.h:37
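pcxt->estimator, pcxt->seg, pcxt->toc and pcxt->nworkers_launched are used with the standard parallel-context machinery when a build runs in parallel. A rough sketch of the leader-side setup, assuming a shared build-state struct of size estshared and a single TOC entry (the entry-point name "_my_parallel_build_main", the key MY_KEY_SHARED and the struct are placeholders):

EnterParallelMode();
pcxt = CreateParallelContext("postgres", "_my_parallel_build_main", request);

/* reserve space in the DSM segment for the shared build state */
shm_toc_estimate_chunk(&pcxt->estimator, estshared);
shm_toc_estimate_keys(&pcxt->estimator, 1);

InitializeParallelDSM(pcxt);
shared = shm_toc_allocate(pcxt->toc, estshared);
/* ... fill in 'shared' before workers start ... */
shm_toc_insert(pcxt->toc, MY_KEY_SHARED, shared);

LaunchParallelWorkers(pcxt);
/* pcxt->nworkers_launched reports how many workers actually started */
WaitForParallelWorkersToFinish(pcxt);
DestroyParallelContext(pcxt);
ExitParallelMode();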
Form_pg_index rd_index
Definition: rel.h:192
Oid * rd_indcollation
Definition: rel.h:217
Form_pg_class rd_rel
Definition: rel.h:111
Oid sk_collation
Definition: skey.h:70
Sharedsort * sharedsort
Definition: tuplesort.h:58
int tdrefcount
Definition: tupdesc.h:133
bool typbyval
Definition: typcache.h:40
int16 typlen
Definition: typcache.h:39
BlockNumber pagesPerRange
Definition: brin_xlog.h:52
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:40
TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
Definition: tableam.c:165
Size table_parallelscan_estimate(Relation rel, Snapshot snapshot)
Definition: tableam.c:130
void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, Snapshot snapshot)
Definition: tableam.c:145
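The table_parallelscan_* and table_beginscan_parallel entries above cooperate as follows: the leader sizes and initializes a ParallelTableScanDesc in shared memory, and then every participant (leader included) opens its scan from that same descriptor. A minimal sketch, assuming pscan points into the DSM segment:

/* Leader: size the shared descriptor and initialize it. */
Size        estscan = table_parallelscan_estimate(heapRel, snapshot);
/* ... reserve estscan bytes in the DSM segment, yielding pscan ... */
table_parallelscan_initialize(heapRel, pscan, snapshot);

/* Leader and each worker: start a synchronized scan over the same state. */
TableScanDesc scan = table_beginscan_parallel(heapRel, pscan);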
static double table_index_build_range_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool anyvisible, bool progress, BlockNumber start_blockno, BlockNumber numblocks, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1813
static double table_index_build_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1780
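table_index_build_scan and table_index_build_range_scan drive the heap scan for an index build and invoke a caller-supplied IndexBuildCallback once per live heap tuple. A sketch of the shape of that callback and of the call, with MyBuildState and my_build_callback as hypothetical names:

static void
my_build_callback(Relation index, ItemPointer tid, Datum *values,
                  bool *isnull, bool tupleIsAlive, void *state)
{
    MyBuildState *bs = (MyBuildState *) state;

    /* fold values[]/isnull[] into the summary for tid's block range */
    bs->indtuples += 1;
}

/* allow_sync=true, progress=true; scan=NULL lets the AM start its own scan */
reltuples = table_index_build_scan(heapRel, indexRel, indexInfo,
                                   true, true,
                                   my_build_callback, &buildstate, NULL);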
void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:443
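In a BRIN-style bitmap scan, a block range that cannot be excluded by the scan keys is returned by adding every heap page of the range to the TIDBitmap as a lossy page. Roughly (range_start and range_end are illustrative):

for (pageno = range_start; pageno <= range_end; pageno++)
    tbm_add_page(tbm, pageno);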
static FormData_pg_attribute * TupleDescAttr(TupleDesc tupdesc, int i)
Definition: tupdesc.h:153
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:1363
void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
Definition: tuplesort.c:2938
Size tuplesort_estimate_shared(int nWorkers)
Definition: tuplesort.c:2917
void tuplesort_end(Tuplesortstate *state)
Definition: tuplesort.c:951
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition: tuplesort.c:2961
struct SortCoordinateData * SortCoordinate
Definition: tuplesort.h:61
#define TUPLESORT_NONE
Definition: tuplesort.h:93
Tuplesortstate * tuplesort_begin_index_brin(int workMem, SortCoordinate coordinate, int sortopt)
BrinTuple * tuplesort_getbrintuple(Tuplesortstate *state, Size *len, bool forward)
void tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size)
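The tuplesort entries above form the shared-sort protocol used when range summaries are built by several processes: the leader sizes and initializes the Sharedsort area, each worker attaches to it and feeds its BrinTuples into a worker-side sort, and the leader finally merges the runs and reads the tuples back in block-number order. A condensed sketch, assuming sharedsort lives in the DSM segment and sortmem is the per-participant memory budget:

/* Leader: size and initialize the shared sort state. */
Size        estsort = tuplesort_estimate_shared(nparticipants);
/* ... allocate estsort bytes in the DSM segment, yielding sharedsort ... */
tuplesort_initialize_shared(sharedsort, nparticipants, pcxt->seg);

/* Worker: attach and sort its share of the range summaries. */
tuplesort_attach_shared(sharedsort, seg);
coordinate = palloc0(sizeof(SortCoordinateData));
coordinate->isWorker = true;
coordinate->nParticipants = -1;
coordinate->sharedsort = sharedsort;
sortstate = tuplesort_begin_index_brin(sortmem, coordinate, TUPLESORT_NONE);
/* ... tuplesort_putbrintuple(sortstate, tup, len) for each summary ... */
tuplesort_performsort(sortstate);
tuplesort_end(sortstate);

/* Leader: merge worker runs and consume tuples in block order. */
coordinate = palloc0(sizeof(SortCoordinateData));
coordinate->isWorker = false;
coordinate->nParticipants = nparticipants;
coordinate->sharedsort = sharedsort;
spool = tuplesort_begin_index_brin(sortmem, coordinate, TUPLESORT_NONE);
tuplesort_performsort(spool);
while ((btup = tuplesort_getbrintuple(spool, &tuplen, true)) != NULL)
{
    /* insert the summary for btup->bt_blkno into the index */
}
tuplesort_end(spool);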
#define VACUUM_OPTION_PARALLEL_CLEANUP
Definition: vacuum.h:63
void ExitParallelMode(void)
Definition: xact.c:1063
void EnterParallelMode(void)
Definition: xact.c:1050
bool RecoveryInProgress(void)
Definition: xlog.c:6334
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:474
void XLogRegisterData(const char *data, uint32 len)
Definition: xloginsert.c:364
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1237
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:242
void XLogBeginInsert(void)
Definition: xloginsert.c:149
#define REGBUF_STANDARD
Definition: xloginsert.h:34
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33
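The XLog* entries above are used in the usual pattern for WAL-logging a BRIN page modification inside a critical section: begin the insertion, register the record body and the modified buffer, emit the record, and stamp the page with the returned LSN. A schematic sketch (the xlrec struct and the info flag stand in for whichever brin_xlog.h record applies):

XLogBeginInsert();
XLogRegisterData((char *) &xlrec, sizeof(xlrec));   /* record body */
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);        /* page being modified */
recptr = XLogInsert(RM_BRIN_ID, info);
PageSetLSN(BufferGetPage(buf), recptr);

/*
 * A page initialized from scratch is registered with REGBUF_WILL_INIT
 * instead, or simply logged whole with log_newpage_buffer(buf, true)
 * when no dedicated record type applies.
 */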