/*
 * NOTE(review): the lines that followed here ("PostgreSQL Source Code git
 * master", "Loading...", "Searching...", "No Matches", "Go to the
 * documentation of this file") were doxygen HTML page chrome, not part of
 * brin.c.  This file is a lossy text extraction of the rendered source
 * browser page; original line numbers are fused onto the code lines below
 * and a number of source lines are missing entirely.
 */
/*
 * brin.c
 *		Implementation of BRIN indexes for Postgres
 *
 * See src/backend/access/brin/README for details.
 *
 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/brin/brin.c
 *
 * TODO
 *		* ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
 */
16#include "postgres.h"
17
18#include "access/brin.h"
19#include "access/brin_page.h"
20#include "access/brin_pageops.h"
21#include "access/brin_xlog.h"
22#include "access/relation.h"
23#include "access/reloptions.h"
24#include "access/relscan.h"
25#include "access/table.h"
26#include "access/tableam.h"
27#include "access/xloginsert.h"
28#include "catalog/index.h"
29#include "catalog/pg_am.h"
30#include "commands/vacuum.h"
31#include "miscadmin.h"
32#include "pgstat.h"
34#include "storage/bufmgr.h"
35#include "storage/freespace.h"
36#include "storage/proc.h"
37#include "tcop/tcopprot.h"
38#include "utils/acl.h"
39#include "utils/datum.h"
40#include "utils/fmgrprotos.h"
41#include "utils/guc.h"
43#include "utils/memutils.h"
44#include "utils/rel.h"
45#include "utils/tuplesort.h"
46#include "utils/wait_event.h"
47
/*
 * NOTE(review): from here on, code tokens are preserved exactly as extracted.
 * Original source line numbers are fused onto each line, and bare numbers on
 * otherwise-empty lines mark places where the extraction dropped a source
 * line (here: the struct's field declarations and closing brace).  Do not
 * treat this text as compilable; recover the real file before editing code.
 */
48/* Magic numbers for parallel state sharing */
49#define PARALLEL_KEY_BRIN_SHARED UINT64CONST(0xB000000000000001)
50#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
51#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
52#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
53#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
54
55/*
56 * Status for index builds performed in parallel. This is allocated in a
57 * dynamic shared memory segment.
58 */
59typedef struct BrinShared
60{
61 /*
62 * These fields are not modified during the build. They primarily exist
63 * for the benefit of worker processes that need to create state
64 * corresponding to that used by the leader.
65 */
71
72 /* Query ID, for report in worker processes */
74
75 /*
76 * workersdonecv is used to monitor the progress of workers. All parallel
77 * participants must indicate that they are done before leader can use
78 * results built by the workers (and before leader can write the data into
79 * the index).
80 */
82
83 /*
84 * mutex protects all fields before heapdesc.
85 *
86 * These fields contain status information of interest to BRIN index
87 * builds that must work just the same when an index is built in parallel.
88 */
90
91 /*
92 * Mutable state that is maintained by workers, and reported back to
93 * leader at end of the scans.
94 *
95 * nparticipantsdone is number of worker processes finished.
96 *
97 * reltuples is the total number of input heap tuples.
98 *
99 * indtuples is the total number of tuples that made it into the index.
100 */
102 double reltuples;
103 double indtuples;
104
105 /*
106 * ParallelTableScanDescData data follows. Can't directly embed here, as
107 * implementations of the parallel table scan desc interface might need
108 * stronger alignment.
109 */
111
112/*
113 * Return pointer to a BrinShared's parallel table scan.
114 *
115 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
116 * MAXALIGN.
117 */
118#define ParallelTableScanFromBrinShared(shared) \
119 (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BrinShared)))
120
/*
 * NOTE(review): extraction dropped this struct's field declarations (the
 * comments reference pcxt, nparticipanttuplesorts, brinshared, sharedsort,
 * snapshot) and its closing brace — confirm against the upstream file.
 */
121/*
122 * Status for leader in parallel index build.
123 */
124typedef struct BrinLeader
125{
126 /* parallel context itself */
128
129 /*
130 * nparticipanttuplesorts is the exact number of worker processes
131 * successfully launched, plus one leader process if it participates as a
132 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
133 * participating as a worker).
134 */
136
137 /*
138 * Leader process convenience pointers to shared state (leader avoids TOC
139 * lookups).
140 *
141 * brinshared is the shared state for entire build. sharedsort is the
142 * shared, tuplesort-managed state passed to each process tuplesort.
143 * snapshot is the snapshot used by the scan iff an MVCC snapshot is
144 * required.
145 */
152
/*
 * NOTE(review): extraction dropped all field declarations of this struct
 * (later code references bs_currRangeStart, bs_pagesPerRange, bs_dtuple,
 * bs_bdesc, bs_numtuples, bs_rmAccess, bs_maxRangeStart, bs_leader,
 * bs_sortstate) and its closing brace — confirm against the upstream file.
 */
153/*
154 * We use a BrinBuildState during initial construction of a BRIN index.
155 * The running state is kept in a BrinMemTuple.
156 */
157typedef struct BrinBuildState
158{
169

173

174 /*
175 * bs_leader is only present when a parallel index build is performed, and
176 * only in the leader process. (Actually, only the leader process has a
177 * BrinBuildState.)
178 */
181

182 /*
183 * The sortstate is used by workers (including the leader). It has to be
184 * part of the build state, because that's the only thing passed to the
185 * build callback etc.
186 */
189
/*
 * NOTE(review): the BrinInsertState and BrinOpaque struct bodies and most of
 * the static prototype lines were lost in extraction; only fragments of the
 * parameter lists survive below.  Later code shows BrinInsertState carries
 * bis_rmAccess, bis_desc and bis_pages_per_range, and BrinOpaque carries
 * bo_rmAccess, bo_bdesc and bo_pagesPerRange — TODO confirm upstream.
 */
190/*
191 * We use a BrinInsertState to capture running state spanning multiple
192 * brininsert invocations, within the same command.
193 */
200

201/*
202 * Struct used as "opaque" during index scans
203 */
210

211#define BRIN_ALL_BLOCKRANGES InvalidBlockNumber
212

215 BlockNumber pagesPerRange,
220 bool include_partial, double *numSummarized, double *numExisting);
224 BrinTuple *b);
227 BrinMemTuple *dtup, const Datum *values, const bool *nulls);
228static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
231

232/* parallel index builds */
234 bool isconcurrent, int request);
240 Relation heap, Relation index);
242 BrinShared *brinshared,
243 Sharedsort *sharedsort,
244 Relation heap, Relation index,
245 int sortmem, bool progress);
246
247/*
248 * BRIN handler function: return IndexAmRoutine with access method parameters
249 * and callbacks.
250 */
251Datum
253{
254 static const IndexAmRoutine amroutine = {
256 .amstrategies = 0,
257 .amsupport = BRIN_LAST_OPTIONAL_PROCNUM,
258 .amoptsprocnum = BRIN_PROCNUM_OPTIONS,
259 .amcanorder = false,
260 .amcanorderbyop = false,
261 .amcanhash = false,
262 .amconsistentequality = false,
263 .amconsistentordering = false,
264 .amcanbackward = false,
265 .amcanunique = false,
266 .amcanmulticol = true,
267 .amoptionalkey = true,
268 .amsearcharray = false,
269 .amsearchnulls = true,
270 .amstorage = true,
271 .amclusterable = false,
272 .ampredlocks = false,
273 .amcanparallel = false,
274 .amcanbuildparallel = true,
275 .amcaninclude = false,
276 .amusemaintenanceworkmem = false,
277 .amsummarizing = true,
278 .amparallelvacuumoptions =
280 .amkeytype = InvalidOid,
281
282 .ambuild = brinbuild,
283 .ambuildempty = brinbuildempty,
284 .aminsert = brininsert,
285 .aminsertcleanup = brininsertcleanup,
286 .ambulkdelete = brinbulkdelete,
287 .amvacuumcleanup = brinvacuumcleanup,
288 .amcanreturn = NULL,
289 .amcostestimate = brincostestimate,
290 .amgettreeheight = NULL,
291 .amoptions = brinoptions,
292 .amproperty = NULL,
293 .ambuildphasename = NULL,
294 .amvalidate = brinvalidate,
295 .amadjustmembers = NULL,
296 .ambeginscan = brinbeginscan,
297 .amrescan = brinrescan,
298 .amgettuple = NULL,
299 .amgetbitmap = bringetbitmap,
300 .amendscan = brinendscan,
301 .ammarkpos = NULL,
302 .amrestrpos = NULL,
303 .amestimateparallelscan = NULL,
304 .aminitparallelscan = NULL,
305 .amparallelrescan = NULL,
306 .amtranslatestrategy = NULL,
307 .amtranslatecmptype = NULL,
308 };
309
311}
312
/*
 * NOTE(review): extraction dropped the signature line (line 318, presumably
 * taking Relation idxRel and IndexInfo *indexInfo, per the uses below), the
 * allocation of bistate, and the brinRevmapInitialize call whose second
 * argument survives on line 327 — confirm against the upstream file.
 */
313/*
314 * Initialize a BrinInsertState to maintain state to be used across multiple
315 * tuple inserts, within the same command.
316 */
317static BrinInsertState *
319{
320 BrinInsertState *bistate;
322

325 bistate->bis_desc = brin_build_desc(idxRel);
327 &bistate->bis_pages_per_range);
328 indexInfo->ii_AmCache = bistate;
330

331 return bistate;
332}
333
/*
 * NOTE(review): many declarations and statements were lost in extraction
 * (e.g. the full parameter list, the declarations of revmap/bdesc/buf/
 * tupcxt/brtup/dtup/origtup/newtup, the lock/unlock calls, and the
 * memory-context create/delete calls).  Code tokens below are preserved
 * verbatim; do not attempt to compile this text.
 */
334/*
335 * A tuple in the heap is being inserted. To keep a brin index up to date,
336 * we need to obtain the relevant index tuple and compare its stored values
337 * with those of the new tuple. If the tuple values are not consistent with
338 * the summary tuple, we need to update the index tuple.
339 *
340 * If autosummarization is enabled, check if we need to summarize the previous
341 * page range.
342 *
343 * If the range is not currently summarized (i.e. the revmap returns NULL for
344 * it), there's nothing to do for this tuple.
345 */
346bool
350 bool indexUnchanged,
351 IndexInfo *indexInfo)
352{
353 BlockNumber pagesPerRange;
355 BlockNumber heapBlk;
356 BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
362 bool autosummarize = BrinGetAutoSummarize(idxRel);
363
364 /*
365 * If first time through in this statement, initialize the insert state
366 * that we keep for all the inserts in the command.
367 */
368 if (!bistate)
369 bistate = initialize_brin_insertstate(idxRel, indexInfo);
370
371 revmap = bistate->bis_rmAccess;
372 bdesc = bistate->bis_desc;
373 pagesPerRange = bistate->bis_pages_per_range;
374
375 /*
376 * origHeapBlk is the block number where the insertion occurred. heapBlk
377 * is the first block in the corresponding page range.
378 */
380 heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
381
382 for (;;)
383 {
384 bool need_insert = false;
385 OffsetNumber off;
388
390
391 /*
392 * If auto-summarization is enabled and we just inserted the first
393 * tuple into the first block of a new non-first page range, request a
394 * summarization run of the previous range.
395 */
396 if (autosummarize &&
397 heapBlk > 0 &&
398 heapBlk == origHeapBlk &&
400 {
401 BlockNumber lastPageRange = heapBlk - 1;
403
407 if (!lastPageTuple)
408 {
409 bool recorded;
410
414 if (!recorded)
415 ereport(LOG,
417 errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",
419 lastPageRange)));
420 }
421 else
423 }
424
425 brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
427
428 /* if range is unsummarized, there's nothing to do */
429 if (!brtup)
430 break;
431
432 /* First time through in this brininsert call? */
433 if (tupcxt == NULL)
434 {
436 "brininsert cxt",
439 }
440
442
444
445 if (!need_insert)
446 {
447 /*
448 * The tuple is consistent with the new values, so there's nothing
449 * to do.
450 */
452 }
453 else
454 {
455 Page page = BufferGetPage(buf);
456 ItemId lp = PageGetItemId(page, off);
457 Size origsz;
459 Size newsz;
461 bool samepage;
462
463 /*
464 * Make a copy of the old tuple, so that we can compare it after
465 * re-acquiring the lock.
466 */
469
470 /*
471 * Before releasing the lock, check if we can attempt a same-page
472 * update. Another process could insert a tuple concurrently in
473 * the same page though, so downstream we must be prepared to cope
474 * if this turns out to not be possible after all.
475 */
476 newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
479
480 /*
481 * Try to update the tuple. If this doesn't work for whatever
482 * reason, we need to restart from the top; the revmap might be
483 * pointing at a different tuple for this block now, so we need to
484 * recompute to ensure both our new heap tuple and the other
485 * inserter's are covered by the combined tuple. It might be that
486 * we don't need to update at all.
487 */
488 if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
489 buf, off, origtup, origsz, newtup, newsz,
490 samepage))
491 {
492 /* no luck; start over */
494 continue;
495 }
496 }
497
498 /* success! */
499 break;
500 }
501
502 if (BufferIsValid(buf))
505 if (tupcxt != NULL)
507
508 return false;
509}
510
/*
 * NOTE(review): the signature line (515) and the brinRevmapTerminate call
 * (530) referenced by the comment below were lost in extraction — confirm
 * against the upstream file.
 */
511/*
512 * Callback to clean up the BrinInsertState once all tuple inserts are done.
513 */
514void
516{
517 BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
518
519 /* bail out if cache not initialized */
520 if (bistate == NULL)
521 return;
522
523 /* do this first to avoid dangling pointer if we fail partway through */
524 indexInfo->ii_AmCache = NULL;
525
526 /*
527 * Clean up the revmap. Note that the brinDesc has already been cleaned up
528 * as part of its own memory context.
529 */
531 pfree(bistate);
532}
533
/*
 * NOTE(review): the return-type line (541, presumably IndexScanDesc) and the
 * bo_rmAccess/bo_pagesPerRange initialization line (550) were lost in
 * extraction — confirm against the upstream file.
 */
534/*
535 * Initialize state for a BRIN index scan.
536 *
537 * We read the metapage here to determine the pages-per-range number that this
538 * index was built with. Note that since this cannot be changed while we're
539 * holding lock on index, it's not necessary to recompute it during brinrescan.
540 */
542brinbeginscan(Relation r, int nkeys, int norderbys)
543{
544 IndexScanDesc scan;
545 BrinOpaque *opaque;
546
547 scan = RelationGetIndexScan(r, nkeys, norderbys);
548
549 opaque = palloc_object(BrinOpaque);
551 opaque->bo_bdesc = brin_build_desc(r);
552 scan->opaque = opaque;
553
554 return scan;
555}
556
/*
 * NOTE(review): extraction dropped the function signature (line 570,
 * presumably taking IndexScanDesc scan and TIDBitmap *tbm), several local
 * declarations (idxRel, bdesc, revmap, buf, btup, dtup), the revmap/tuple
 * lock-and-copy calls, and the cleanup calls near the end.  Code tokens
 * below are preserved verbatim; do not attempt to compile this text.
 */
557/*
558 * Execute the index scan.
559 *
560 * This works by reading index TIDs from the revmap, and obtaining the index
561 * tuples pointed to by them; the summary values in the index tuples are
562 * compared to the scan keys. We return into the TID bitmap all the pages in
563 * ranges corresponding to index tuples that match the scan keys.
564 *
565 * If a TID from the revmap is read as InvalidTID, we know that range is
566 * unsummarized. Pages in those ranges need to be returned regardless of scan
567 * keys.
568 */
569int64
571{
575 Oid heapOid;
576 Relation heapRel;
577 BrinOpaque *opaque;
578 BlockNumber nblocks;
579 int64 totalpages = 0;
580 FmgrInfo *consistentFn;
585 Size btupsz = 0;
586 ScanKey **keys,
587 **nullkeys;
588 int *nkeys,
589 *nnullkeys;
590 char *ptr;
591 Size len;
592 char *tmp PG_USED_FOR_ASSERTS_ONLY;
593
594 opaque = (BrinOpaque *) scan->opaque;
595 bdesc = opaque->bo_bdesc;
597 if (scan->instrument)
598 scan->instrument->nsearches++;
599
600 /*
601 * We need to know the size of the table so that we know how long to
602 * iterate on the revmap.
603 */
604 heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
605 heapRel = table_open(heapOid, AccessShareLock);
606 nblocks = RelationGetNumberOfBlocks(heapRel);
608
609 /*
610 * Make room for the consistent support procedures of indexed columns. We
611 * don't look them up here; we do that lazily the first time we see a scan
612 * key reference each of them. We rely on zeroing fn_oid to InvalidOid.
613 */
614 consistentFn = palloc0_array(FmgrInfo, bdesc->bd_tupdesc->natts);
615
616 /*
617 * Make room for per-attribute lists of scan keys that we'll pass to the
618 * consistent support procedure. We don't know which attributes have scan
619 * keys, so we allocate space for all attributes. That may use more memory
620 * but it's probably cheaper than determining which attributes are used.
621 *
622 * We keep null and regular keys separate, so that we can pass just the
623 * regular keys to the consistent function easily.
624 *
625 * To reduce the allocation overhead, we allocate one big chunk and then
626 * carve it into smaller arrays ourselves. All the pieces have exactly the
627 * same lifetime, so that's OK.
628 *
629 * XXX The widest index can have 32 attributes, so the amount of wasted
630 * memory is negligible. We could invent a more compact approach (with
631 * just space for used attributes) but that would make the matching more
632 * complex so it's not a good trade-off.
633 */
634 len =
635 MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* regular keys */
636 MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
637 MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) +
638 MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* NULL keys */
639 MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
640 MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
641
642 ptr = palloc(len);
643 tmp = ptr;
644
645 keys = (ScanKey **) ptr;
646 ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
647
648 nullkeys = (ScanKey **) ptr;
649 ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
650
651 nkeys = (int *) ptr;
652 ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
653
654 nnullkeys = (int *) ptr;
655 ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
656
657 for (int i = 0; i < bdesc->bd_tupdesc->natts; i++)
658 {
659 keys[i] = (ScanKey *) ptr;
660 ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
661
662 nullkeys[i] = (ScanKey *) ptr;
663 ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
664 }
665
666 Assert(tmp + len == ptr);
667
668 /* zero the number of keys */
669 memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
670 memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
671
672 /* Preprocess the scan keys - split them into per-attribute arrays. */
673 for (int keyno = 0; keyno < scan->numberOfKeys; keyno++)
674 {
675 ScanKey key = &scan->keyData[keyno];
676 AttrNumber keyattno = key->sk_attno;
677
678 /*
679 * The collation of the scan key must match the collation used in the
680 * index column (but only if the search is not IS NULL/ IS NOT NULL).
681 * Otherwise we shouldn't be using this index ...
682 */
683 Assert((key->sk_flags & SK_ISNULL) ||
684 (key->sk_collation ==
685 TupleDescAttr(bdesc->bd_tupdesc,
686 keyattno - 1)->attcollation));
687
688 /*
689 * First time we see this index attribute, so init as needed.
690 *
691 * This is a bit of an overkill - we don't know how many scan keys are
692 * there for this attribute, so we simply allocate the largest number
693 * possible (as if all keys were for this attribute). This may waste a
694 * bit of memory, but we only expect small number of scan keys in
695 * general, so this should be negligible, and repeated repalloc calls
696 * are not free either.
697 */
698 if (consistentFn[keyattno - 1].fn_oid == InvalidOid)
699 {
700 FmgrInfo *tmp;
701
702 /* First time we see this attribute, so no key/null keys. */
703 Assert(nkeys[keyattno - 1] == 0);
704 Assert(nnullkeys[keyattno - 1] == 0);
705
708 fmgr_info_copy(&consistentFn[keyattno - 1], tmp,
710 }
711
712 /* Add key to the proper per-attribute array. */
713 if (key->sk_flags & SK_ISNULL)
714 {
715 nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key;
716 nnullkeys[keyattno - 1]++;
717 }
718 else
719 {
720 keys[keyattno - 1][nkeys[keyattno - 1]] = key;
721 nkeys[keyattno - 1]++;
722 }
723 }
724
725 /* allocate an initial in-memory tuple, out of the per-range memcxt */
727
728 /*
729 * Setup and use a per-range memory context, which is reset every time we
730 * loop below. This avoids having to free the tuples within the loop.
731 */
733 "bringetbitmap cxt",
736
737 /*
738 * Now scan the revmap. We start by querying for heap page 0,
739 * incrementing by the number of pages per range; this gives us a full
740 * view of the table. We make use of uint64 for heapBlk as a BlockNumber
741 * could wrap for tables with close to 2^32 pages.
742 */
743 for (uint64 heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
744 {
745 bool addrange;
746 bool gottuple = false;
747 BrinTuple *tup;
748 OffsetNumber off;
749 Size size;
750
752
754
756 &off, &size, BUFFER_LOCK_SHARE);
757 if (tup)
758 {
759 gottuple = true;
760 btup = brin_copy_tuple(tup, size, btup, &btupsz);
762 }
763
764 /*
765 * For page ranges with no indexed tuple, we must return the whole
766 * range; otherwise, compare it to the scan keys.
767 */
768 if (!gottuple)
769 {
770 addrange = true;
771 }
772 else
773 {
775 if (dtup->bt_placeholder)
776 {
777 /*
778 * Placeholder tuples are always returned, regardless of the
779 * values stored in them.
780 */
781 addrange = true;
782 }
783 else
784 {
785 int attno;
786
787 /*
788 * Compare scan keys with summary values stored for the range.
789 * If scan keys are matched, the page range must be added to
790 * the bitmap. We initially assume the range needs to be
791 * added; in particular this serves the case where there are
792 * no keys.
793 */
794 addrange = true;
795 for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++)
796 {
797 BrinValues *bval;
798 Datum add;
799 Oid collation;
800
801 /*
802 * skip attributes without any scan keys (both regular and
803 * IS [NOT] NULL)
804 */
805 if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0)
806 continue;
807
808 bval = &dtup->bt_columns[attno - 1];
809
810 /*
811 * If the BRIN tuple indicates that this range is empty,
812 * we can skip it: there's nothing to match. We don't
813 * need to examine the next columns.
814 */
815 if (dtup->bt_empty_range)
816 {
817 addrange = false;
818 break;
819 }
820
821 /*
822 * First check if there are any IS [NOT] NULL scan keys,
823 * and if we're violating them. In that case we can
824 * terminate early, without invoking the support function.
825 *
826 * As there may be more keys, we can only determine
827 * mismatch within this loop.
828 */
829 if (bdesc->bd_info[attno - 1]->oi_regular_nulls &&
830 !check_null_keys(bval, nullkeys[attno - 1],
831 nnullkeys[attno - 1]))
832 {
833 /*
834 * If any of the IS [NOT] NULL keys failed, the page
835 * range as a whole can't pass. So terminate the loop.
836 */
837 addrange = false;
838 break;
839 }
840
841 /*
842 * So either there are no IS [NOT] NULL keys, or all
843 * passed. If there are no regular scan keys, we're done -
844 * the page range matches. If there are regular keys, but
845 * the page range is marked as 'all nulls' it can't
846 * possibly pass (we're assuming the operators are
847 * strict).
848 */
849
850 /* No regular scan keys - page range as a whole passes. */
851 if (!nkeys[attno - 1])
852 continue;
853
854 Assert((nkeys[attno - 1] > 0) &&
855 (nkeys[attno - 1] <= scan->numberOfKeys));
856
857 /* If it is all nulls, it cannot possibly be consistent. */
858 if (bval->bv_allnulls)
859 {
860 addrange = false;
861 break;
862 }
863
864 /*
865 * Collation from the first key (has to be the same for
866 * all keys for the same attribute).
867 */
868 collation = keys[attno - 1][0]->sk_collation;
869
870 /*
871 * Check whether the scan key is consistent with the page
872 * range values; if so, have the pages in the range added
873 * to the output bitmap.
874 *
875 * The opclass may or may not support processing of
876 * multiple scan keys. We can determine that based on the
877 * number of arguments - functions with extra parameter
878 * (number of scan keys) do support this, otherwise we
879 * have to simply pass the scan keys one by one.
880 */
881 if (consistentFn[attno - 1].fn_nargs >= 4)
882 {
883 /* Check all keys at once */
884 add = FunctionCall4Coll(&consistentFn[attno - 1],
885 collation,
887 PointerGetDatum(bval),
888 PointerGetDatum(keys[attno - 1]),
889 Int32GetDatum(nkeys[attno - 1]));
891 }
892 else
893 {
894 /*
895 * Check keys one by one
896 *
897 * When there are multiple scan keys, failure to meet
898 * the criteria for a single one of them is enough to
899 * discard the range as a whole, so break out of the
900 * loop as soon as a false return value is obtained.
901 */
902 int keyno;
903
904 for (keyno = 0; keyno < nkeys[attno - 1]; keyno++)
905 {
906 add = FunctionCall3Coll(&consistentFn[attno - 1],
907 keys[attno - 1][keyno]->sk_collation,
909 PointerGetDatum(bval),
910 PointerGetDatum(keys[attno - 1][keyno]));
912 if (!addrange)
913 break;
914 }
915 }
916
917 /*
918 * If we found a scan key eliminating the range, no need
919 * to check additional ones.
920 */
921 if (!addrange)
922 break;
923 }
924 }
925 }
926
927 /* add the pages in the range to the output bitmap, if needed */
928 if (addrange)
929 {
930 uint64 pageno;
931
932 for (pageno = heapBlk;
933 pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1;
934 pageno++)
935 {
937 tbm_add_page(tbm, pageno);
938 totalpages++;
940 }
941 }
942 }
943
946
947 if (buf != InvalidBuffer)
949
950 /*
951 * XXX We have an approximation of the number of *pages* that our scan
952 * returns, but we don't have a precise idea of the number of heap tuples
953 * involved.
954 */
955 return totalpages * 10;
956}
957
/*
 * NOTE(review): the signature line (962, presumably taking IndexScanDesc
 * scan and ScanKey scankey, per the body) was lost in extraction.
 */
958/*
959 * Re-initialize state for a BRIN index scan
960 */
961void
963 ScanKey orderbys, int norderbys)
964{
965 /*
966 * Other index AMs preprocess the scan keys at this point, or sometime
967 * early during the scan; this lets them optimize by removing redundant
968 * keys, or doing early returns when they are impossible to satisfy; see
969 * _bt_preprocess_keys for an example. Something like that could be added
970 * here someday, too.
971 */
972
973 if (scankey && scan->numberOfKeys > 0)
974 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
975}
976
/*
 * NOTE(review): the signature line (981) and the statement on line 985
 * (presumably brinRevmapTerminate(opaque->bo_rmAccess)) were lost in
 * extraction — confirm against the upstream file.
 */
977/*
978 * Close down a BRIN index scan
979 */
980void
982{
983 BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
984
986 brin_free_desc(opaque->bo_bdesc);
987 pfree(opaque);
988}
989
/*
 * NOTE(review): extraction dropped the first parameter line (998, presumably
 * Relation index), the state/thisblock declarations, the BRIN_elog debug
 * line opener, and the form_and_insert_tuple call (1025) — confirm against
 * the upstream file.
 */
990/*
991 * Per-heap-tuple callback for table_index_build_scan.
992 *
993 * Note we don't worry about the page range at the end of the table here; it is
994 * present in the build state struct after we're called the last time, but not
995 * inserted into the index. Caller must ensure to do so, if appropriate.
996 */
997static void
999 ItemPointer tid,
1000 Datum *values,
1001 bool *isnull,
1002 bool tupleIsAlive,
1003 void *brstate)
1004{
1007

1009

1010 /*
1011 * If we're in a block that belongs to a future range, summarize what
1012 * we've got and start afresh. Note the scan might have skipped many
1013 * pages, if they were devoid of live tuples; make sure to insert index
1014 * tuples for those too.
1015 */
1016 while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
1017 {
1018
1020 "brinbuildCallback: completed a range: %u--%u",
1021 state->bs_currRangeStart,
1022 state->bs_currRangeStart + state->bs_pagesPerRange));
1023
1024 /* create the index tuple and insert it */
1026
1027 /* set state to correspond to the next range */
1028 state->bs_currRangeStart += state->bs_pagesPerRange;
1029
1030 /* re-initialize state for it */
1031 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1032 }
1033
1034 /* Accumulate the current tuple into the running state */
1035 (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1036 values, isnull);
1037}
1038
/*
 * NOTE(review): as with brinbuildCallback, the first parameter line, local
 * declarations, and the tuplesort-write call (1082) were lost in
 * extraction — confirm against the upstream file.
 */
1039/*
1040 * Per-heap-tuple callback for table_index_build_scan with parallelism.
1041 *
1042 * A version of the callback used by parallel index builds. The main difference
1043 * is that instead of writing the BRIN tuples into the index, we write them
1044 * into a shared tuplesort, and leave the insertion up to the leader (which may
1045 * reorder them a bit etc.). The callback also does not generate empty ranges,
1046 * those will be added by the leader when merging results from workers.
1047 */
1048static void
1050 ItemPointer tid,
1051 Datum *values,
1052 bool *isnull,
1053 bool tupleIsAlive,
1054 void *brstate)
1055{
1058

1060

1061 /*
1062 * If we're in a block that belongs to a different range, summarize what
1063 * we've got and start afresh. Note the scan might have skipped many
1064 * pages, if they were devoid of live tuples; we do not create empty BRIN
1065 * ranges here - the leader is responsible for filling them in.
1066 *
1067 * Unlike serial builds, parallel index builds allow synchronized seqscans
1068 * (because that's what parallel scans do). This means the block may wrap
1069 * around to the beginning of the relation, so the condition needs to
1070 * check for both future and past ranges.
1071 */
1072 if ((thisblock < state->bs_currRangeStart) ||
1073 (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1))
1074 {
1075

1077 "brinbuildCallbackParallel: completed a range: %u--%u",
1078 state->bs_currRangeStart,
1079 state->bs_currRangeStart + state->bs_pagesPerRange));
1080
1081 /* create the index tuple and write it into the tuplesort */
1083
1084 /*
1085 * Set state to correspond to the next range (for this block).
1086 *
1087 * This skips ranges that are either empty (and so we don't get any
1088 * tuples to summarize), or processed by other workers. We can't
1089 * differentiate those cases here easily, so we leave it up to the
1090 * leader to fill empty ranges where needed.
1091 */
1092 state->bs_currRangeStart
1093 = state->bs_pagesPerRange * (thisblock / state->bs_pagesPerRange);
1094
1095 /* re-initialize state for it */
1096 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1097 }
1098
1099 /* Accumulate the current tuple into the running state */
1100 (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1101 values, isnull);
1102}
1103
/*
 * NOTE(review): extraction dropped the signature (1107-1108, presumably
 * taking Relation heap, Relation index, IndexInfo *indexInfo and returning
 * IndexBuildResult *), several local declarations (state, revmap, meta),
 * the metapage init/WAL-logging statements, the tuplesort_begin call, the
 * table_index_build_scan callback arguments, the final form_and_insert and
 * brin_fill_empty_ranges call openers, and the result palloc.  Code tokens
 * below are preserved verbatim; do not attempt to compile this text.
 */
1104/*
1105 * brinbuild() -- build a new BRIN index.
1106 */
1109{
1110 IndexBuildResult *result;
1111 double reltuples;
1112 double idxtuples;
1115 Buffer meta;
1116 BlockNumber pagesPerRange;
1117
1118 /*
1119 * We expect to be called exactly once for any index relation.
1120 */
1122 elog(ERROR, "index \"%s\" already contains data",
1124
1125 /*
1126 * Critical section not required, because on error the creation of the
1127 * whole relation will be rolled back.
1128 */
1129
1133

1136 MarkBufferDirty(meta);
1137
1139 {
1142 Page page;
1143
1145 xlrec.pagesPerRange = BrinGetPagesPerRange(index);
1146
1150

1152

1153 page = BufferGetPage(meta);
1154 PageSetLSN(page, recptr);
1155 }
1156
1157 UnlockReleaseBuffer(meta);
1158
1159 /*
1160 * Initialize our state, including the deformed tuple state.
1161 */
1162 revmap = brinRevmapInitialize(index, &pagesPerRange);
1163 state = initialize_brin_buildstate(index, revmap, pagesPerRange,
1165
1166 /*
1167 * Attempt to launch parallel worker scan when required
1168 *
1169 * XXX plan_create_index_workers makes the number of workers dependent on
1170 * maintenance_work_mem, requiring 32MB for each worker. That makes sense
1171 * for btree, but not for BRIN, which can do with much less memory. So
1172 * maybe make that somehow less strict, optionally?
1173 */
1174 if (indexInfo->ii_ParallelWorkers > 0)
1175 _brin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
1176 indexInfo->ii_ParallelWorkers);
1177
1178 /*
1179 * If parallel build requested and at least one worker process was
1180 * successfully launched, set up coordination state, wait for workers to
1181 * complete. Then read all tuples from the shared tuplesort and insert
1182 * them into the index.
1183 *
1184 * In serial mode, simply scan the table and build the index one index
1185 * tuple at a time.
1186 */
1187 if (state->bs_leader)
1188 {
1190

1192 coordinate->isWorker = false;
1193 coordinate->nParticipants =
1194 state->bs_leader->nparticipanttuplesorts;
1195 coordinate->sharedsort = state->bs_leader->sharedsort;
1196
1197 /*
1198 * Begin leader tuplesort.
1199 *
1200 * In cases where parallelism is involved, the leader receives the
1201 * same share of maintenance_work_mem as a serial sort (it is
1202 * generally treated in the same way as a serial sort once we return).
1203 * Parallel worker Tuplesortstates will have received only a fraction
1204 * of maintenance_work_mem, though.
1205 *
1206 * We rely on the lifetime of the Leader Tuplesortstate almost not
1207 * overlapping with any worker Tuplesortstate's lifetime. There may
1208 * be some small overlap, but that's okay because we rely on leader
1209 * Tuplesortstate only allocating a small, fixed amount of memory
1210 * here. When its tuplesort_performsort() is called (by our caller),
1211 * and significant amounts of memory are likely to be used, all
1212 * workers must have already freed almost all memory held by their
1213 * Tuplesortstates (they are about to go away completely, too). The
1214 * overall effect is that maintenance_work_mem always represents an
1215 * absolute high watermark on the amount of memory used by a CREATE
1216 * INDEX operation, regardless of the use of parallelism or any other
1217 * factor.
1218 */
1219 state->bs_sortstate =
1222
1223 /* scan the relation and merge per-worker results */
1224 reltuples = _brin_parallel_merge(state);
1225
1226 _brin_end_parallel(state->bs_leader, state);
1227 }
1228 else /* no parallel index build */
1229 {
1230 /*
1231 * Now scan the relation. No syncscan allowed here because we want
1232 * the heap blocks in physical order (we want to produce the ranges
1233 * starting from block 0, and the callback also relies on this to not
1234 * generate summary for the same range twice).
1235 */
1236 reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
1238
1239 /*
1240 * process the final batch
1241 *
1242 * XXX Note this does not update state->bs_currRangeStart, i.e. it
1243 * stays set to the last range added to the index. This is OK, because
1244 * that's what brin_fill_empty_ranges expects.
1245 */
1247
1248 /*
1249 * Backfill the final ranges with empty data.
1250 *
1251 * This saves us from doing what amounts to full table scans when the
1252 * index with a predicate like WHERE (nonnull_column IS NULL), or
1253 * other very selective predicates.
1254 */
1256 state->bs_currRangeStart,
1257 state->bs_maxRangeStart);
1258 }
1259
1260 /* release resources */
1261 idxtuples = state->bs_numtuples;
1262 brinRevmapTerminate(state->bs_rmAccess);
1264
1265 /*
1266 * Return statistics
1267 */
1269

1270 result->heap_tuples = reltuples;
1271 result->index_tuples = idxtuples;
1272
1273 return result;
1274}
1275
/*
 * brinbuildempty
 *		Build an empty BRIN index, which consists solely of a metapage.
 *		NOTE(review): presumably writes/WAL-logs the init fork metapage —
 *		several statements are not visible in this extract; confirm against
 *		the canonical source.
 */
1276void
1278{
1280
1281 /* An empty BRIN index has a metapage only. */
1284
1285 /* Initialize and xlog metabuffer. */
1292
1294}
1295
1296/*
1297 * brinbulkdelete
1298 * Since there are no per-heap-tuple index tuples in BRIN indexes,
1299 * there's not a lot we can do here.
1300 *
1301 * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
1302 * tuple is deleted), meaning the need to re-run summarization on the affected
1303 * range. Would need to add an extra flag in brintuples for that.
1304 */
1307 IndexBulkDeleteCallback callback, void *callback_state)
1308{
1309 /* allocate stats if first time through, else re-use existing struct */
1310 if (stats == NULL)
1312
 /*
  * Note the callback is never invoked: BRIN keeps no per-heap-tuple
  * entries, so there is nothing to delete tuple-by-tuple.
  */
1313 return stats;
1314}
1315
1316/*
1317 * This routine is in charge of "vacuuming" a BRIN index: we just summarize
1318 * ranges that are currently unsummarized.
1319 */
1322{
1323 Relation heapRel;
1324
1325 /* No-op in ANALYZE ONLY mode */
1326 if (info->analyze_only)
1327 return stats;
1328
1329 if (!stats)
1332 /* rest of stats is initialized by zeroing */
1333
1334 heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false),
1336
1337 brin_vacuum_scan(info->index, info->strategy);
1338
 /*
  * Both the "summarized" and "existing" counters below point at the same
  * field, so after the call num_index_tuples holds the total number of
  * summarized ranges (pre-existing plus newly created).  This aliasing is
  * intentional, not a copy/paste mistake.
  */
1339 brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,
1340 &stats->num_index_tuples, &stats->num_index_tuples);
1341
1342 table_close(heapRel, AccessShareLock);
1343
1344 return stats;
1345}
1346
1347/*
1348 * reloptions processor for BRIN indexes
1349 */
/*
 * Parses the "pages_per_range" (int) and "autosummarize" (bool) options
 * into a BrinOptions struct via build_reloptions; with validate=true,
 * malformed or unknown options raise an error.
 */
1350bytea *
1351brinoptions(Datum reloptions, bool validate)
1352{
1353 static const relopt_parse_elt tab[] = {
1354 {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
1355 {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
1356 };
1357
1358 return (bytea *) build_reloptions(reloptions, validate,
1360 sizeof(BrinOptions),
1361 tab, lengthof(tab));
1362}
1363
1364/*
1365 * SQL-callable function to scan through an index and summarize all ranges
1366 * that are not currently summarized.
1367 */
1368Datum
/*
 * NOTE(review): function body not visible in this extract; presumably a
 * thin wrapper that invokes the brin_summarize_range code path with
 * BRIN_ALL_BLOCKRANGES — confirm against the canonical source.
 */
1377
1378/*
1379 * SQL-callable function to summarize the indicated page range, if not already
1380 * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all
1381 * unsummarized ranges are summarized.
1382 */
/*
 * Arguments: the index OID (arg 0) and an int8 heap block number
 * (presumably arg 1 — its PG_GETARG line is not visible here).  heapBlk64
 * is range-checked against MaxBlockNumber before being narrowed to
 * BlockNumber.
 */
1383Datum
1385{
1386 Oid indexoid = PG_GETARG_OID(0);
1388 BlockNumber heapBlk;
1389 Oid heapoid;
1390 Relation indexRel;
1391 Relation heapRel;
1392 Oid save_userid;
1393 int save_sec_context;
1394 int save_nestlevel;
1395 double numSummarized = 0;
1396
1397 if (RecoveryInProgress())
1398 ereport(ERROR,
1400 errmsg("recovery is in progress"),
1401 errhint("BRIN control functions cannot be executed during recovery.")));
1402
1404 ereport(ERROR,
1406 errmsg("block number out of range: %" PRId64, heapBlk64)));
1407 heapBlk = (BlockNumber) heapBlk64;
1408
1409 /*
1410 * We must lock table before index to avoid deadlocks. However, if the
1411 * passed indexoid isn't an index then IndexGetRelation() will fail.
1412 * Rather than emitting a not-very-helpful error message, postpone
1413 * complaining, expecting that the is-it-an-index test below will fail.
1414 */
1415 heapoid = IndexGetRelation(indexoid, true);
1416 if (OidIsValid(heapoid))
1417 {
1419
1420 /*
1421 * Autovacuum calls us. For its benefit, switch to the table owner's
1422 * userid, so that any index functions are run as that user. Also
1423 * lock down security-restricted operations and arrange to make GUC
1424 * variable changes local to this command. This is harmless, albeit
1425 * unnecessary, when called from SQL, because we fail shortly if the
1426 * user does not own the index.
1427 */
1428 GetUserIdAndSecContext(&save_userid, &save_sec_context);
1429 SetUserIdAndSecContext(heapRel->rd_rel->relowner,
1430 save_sec_context | SECURITY_RESTRICTED_OPERATION);
1431 save_nestlevel = NewGUCNestLevel();
1433 }
1434 else
1435 {
1436 heapRel = NULL;
1437 /* Set these just to suppress "uninitialized variable" warnings */
1438 save_userid = InvalidOid;
1439 save_sec_context = -1;
1440 save_nestlevel = -1;
1441 }
1442
1443 indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
1444
1445 /* Must be a BRIN index */
1446 if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
1447 indexRel->rd_rel->relam != BRIN_AM_OID)
1448 ereport(ERROR,
1450 errmsg("\"%s\" is not a BRIN index",
1451 RelationGetRelationName(indexRel))));
1452
1453 /* User must own the index (comparable to privileges needed for VACUUM) */
1454 if (heapRel != NULL && !object_ownercheck(RelationRelationId, indexoid, save_userid))
1456 RelationGetRelationName(indexRel));
1457
1458 /*
1459 * Since we did the IndexGetRelation call above without any lock, it's
1460 * barely possible that a race against an index drop/recreation could have
1461 * netted us the wrong table. Recheck.
1462 */
1463 if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
1464 ereport(ERROR,
1466 errmsg("could not open parent table of index \"%s\"",
1467 RelationGetRelationName(indexRel))));
1468
1469 /* see gin_clean_pending_list() */
1470 if (indexRel->rd_index->indisvalid)
1471 brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL);
1472 else
1475 errmsg("index \"%s\" is not valid",
1476 RelationGetRelationName(indexRel))));
1477
1478 /* Roll back any GUC changes executed by index functions */
1479 AtEOXact_GUC(false, save_nestlevel);
1480
1481 /* Restore userid and security context */
1482 SetUserIdAndSecContext(save_userid, save_sec_context);
1483
 /* (relation closes and the PG_RETURN are presumably here — not visible) */
1486
1488}
1489
1490/*
1491 * SQL-callable interface to mark a range as no longer summarized
1492 */
/*
 * Arguments mirror brin_summarize_range: index OID (arg 0) and an int8
 * heap block number (presumably arg 1 — its PG_GETARG line is not visible
 * here), range-checked and narrowed to BlockNumber.
 */
1493Datum
1495{
1496 Oid indexoid = PG_GETARG_OID(0);
1498 BlockNumber heapBlk;
1499 Oid heapoid;
1500 Relation heapRel;
1501 Relation indexRel;
1502 bool done;
1503
1504 if (RecoveryInProgress())
1505 ereport(ERROR,
1507 errmsg("recovery is in progress"),
1508 errhint("BRIN control functions cannot be executed during recovery.")));
1509
1510 if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)
1511 ereport(ERROR,
1513 errmsg("block number out of range: %" PRId64,
1514 heapBlk64)));
1515 heapBlk = (BlockNumber) heapBlk64;
1516
1517 /*
1518 * We must lock table before index to avoid deadlocks. However, if the
1519 * passed indexoid isn't an index then IndexGetRelation() will fail.
1520 * Rather than emitting a not-very-helpful error message, postpone
1521 * complaining, expecting that the is-it-an-index test below will fail.
1522 *
1523 * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we
1524 * don't switch userid.
1525 */
1526 heapoid = IndexGetRelation(indexoid, true);
1527 if (OidIsValid(heapoid))
1529 else
1530 heapRel = NULL;
1531
1532 indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
1533
1534 /* Must be a BRIN index */
1535 if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
1536 indexRel->rd_rel->relam != BRIN_AM_OID)
1537 ereport(ERROR,
1539 errmsg("\"%s\" is not a BRIN index",
1540 RelationGetRelationName(indexRel))));
1541
1542 /* User must own the index (comparable to privileges needed for VACUUM) */
1545 RelationGetRelationName(indexRel));
1546
1547 /*
1548 * Since we did the IndexGetRelation call above without any lock, it's
1549 * barely possible that a race against an index drop/recreation could have
1550 * netted us the wrong table. Recheck.
1551 */
1552 if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
1553 ereport(ERROR,
1555 errmsg("could not open parent table of index \"%s\"",
1556 RelationGetRelationName(indexRel))));
1557
1558 /* see gin_clean_pending_list() */
1559 if (indexRel->rd_index->indisvalid)
1560 {
1561 /* the revmap does the hard work */
1562 do
1563 {
 /* retry until the desummarize of this range sticks */
1564 done = brinRevmapDesummarizeRange(indexRel, heapBlk);
1565 }
1566 while (!done);
1567 }
1568 else
1571 errmsg("index \"%s\" is not valid",
1572 RelationGetRelationName(indexRel))));
1573
1576
1578}
1579
1580/*
1581 * Build a BrinDesc used to create or scan a BRIN index
1582 */
/*
 * All per-descriptor allocations are made in a dedicated memory context
 * (cxt, created near the top — creation lines partly elided here), which
 * lets brin_free_desc() release everything with one MemoryContextDelete.
 */
1583BrinDesc *
1585{
1587 BrinDesc *bdesc;
1588 TupleDesc tupdesc;
1589 int totalstored = 0;
1590 int keyno;
1591 long totalsize;
1592 MemoryContext cxt;
1594
1596 "brin desc cxt",
1599 tupdesc = RelationGetDescr(rel);
1600
1601 /*
1602 * Obtain BrinOpcInfo for each indexed column. While at it, accumulate
1603 * the number of columns stored, since the number is opclass-defined.
1604 */
1605 opcinfo = palloc_array(BrinOpcInfo *, tupdesc->natts);
1606 for (keyno = 0; keyno < tupdesc->natts; keyno++)
1607 {
1609 Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno);
1610
1612
1613 opcinfo[keyno] = (BrinOpcInfo *)
1615 totalstored += opcinfo[keyno]->oi_nstored;
1616 }
1617
1618 /* Allocate our result struct and fill it in */
 /* bd_info is a flexible-array-style tail: natts pointer slots follow it */
1619 totalsize = offsetof(BrinDesc, bd_info) +
1620 sizeof(BrinOpcInfo *) * tupdesc->natts;
1621
1622 bdesc = palloc(totalsize);
1623 bdesc->bd_context = cxt;
1624 bdesc->bd_index = rel;
1625 bdesc->bd_tupdesc = tupdesc;
1626 bdesc->bd_disktdesc = NULL; /* generated lazily */
1627 bdesc->bd_totalstored = totalstored;
1628
1629 for (keyno = 0; keyno < tupdesc->natts; keyno++)
1630 bdesc->bd_info[keyno] = opcinfo[keyno];
 /* the temporary pointer array is no longer needed */
1631 pfree(opcinfo);
1632
1634
1635 return bdesc;
1636}
1637
/*
 * brin_free_desc
 *		Release a BrinDesc built by brin_build_desc.  Everything the
 *		descriptor owns lives in bd_context, so deleting that context
 *		frees it all at once.
 */
1638void
1640{
1641 /* make sure the tupdesc is still valid */
1642 Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
1643 /* no need for retail pfree */
1644 MemoryContextDelete(bdesc->bd_context);
1645}
1646
1647/*
1648 * Fetch index's statistical data into *stats
1649 */
1650void
/* NOTE(review): function body is not visible in this extract. */
1667
1668/*
1669 * Initialize a BrinBuildState appropriate to create tuples on the given index.
1670 */
1671static BrinBuildState *
1673 BlockNumber pagesPerRange, BlockNumber tablePages)
1674{
1677
1679
1680 state->bs_irel = idxRel;
1681 state->bs_numtuples = 0;
1682 state->bs_reltuples = 0;
1683 state->bs_currentInsertBuf = InvalidBuffer;
1684 state->bs_pagesPerRange = pagesPerRange;
1685 state->bs_currRangeStart = 0;
1686 state->bs_rmAccess = revmap;
1687 state->bs_bdesc = brin_build_desc(idxRel);
1688 state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
1689 state->bs_leader = NULL;
1690 state->bs_worker_id = 0;
1691 state->bs_sortstate = NULL;
1692
1693 /* Remember the memory context to use for an empty tuple, if needed. */
1694 state->bs_context = CurrentMemoryContext;
1695 state->bs_emptyTuple = NULL;
1696 state->bs_emptyTupleLen = 0;
1697
1698 /*
1699 * Calculate the start of the last page range. Page numbers are 0-based,
1700 * so to calculate the index we need to subtract one. The integer division
1701 * gives us the index of the page range.
1702 */
 /*
  * NOTE(review): lastRange's declaration is not visible here; presumably
  * it is initialized to 0 so an empty table yields
  * bs_maxRangeStart == pagesPerRange — confirm.
  */
1703 if (tablePages > 0)
1704 lastRange = ((tablePages - 1) / pagesPerRange) * pagesPerRange;
1705
1706 /* Now calculate the start of the next range. */
1707 state->bs_maxRangeStart = lastRange + state->bs_pagesPerRange;
1708
1709 return state;
1710}
1711
1712/*
1713 * Release resources associated with a BrinBuildState.
1714 */
1715static void
1717{
1718 /*
1719 * Release the last index buffer used. We might as well ensure that
1720 * whatever free space remains in that page is available in FSM, too.
1721 */
1722 if (!BufferIsInvalid(state->bs_currentInsertBuf))
1723 {
1724 Page page;
1725 Size freespace;
1727
1728 page = BufferGetPage(state->bs_currentInsertBuf);
1729 freespace = PageGetFreeSpace(page);
1730 blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
1731 ReleaseBuffer(state->bs_currentInsertBuf);
1732 RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
 /* vacuum just this one block's FSM entry so the update is visible */
1733 FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
1734 }
1735
1736 brin_free_desc(state->bs_bdesc);
1737 pfree(state->bs_dtuple);
1738 pfree(state);
1739}
1740
1741/*
1742 * On the given BRIN index, summarize the heap page range that corresponds
1743 * to the heap block number given.
1744 *
1745 * This routine can run in parallel with insertions into the heap. To avoid
1746 * missing those values from the summary tuple, we first insert a placeholder
1747 * index tuple into the index, then execute the heap scan; transactions
1748 * concurrent with the scan update the placeholder tuple. After the scan, we
1749 * union the placeholder tuple with the one computed by this routine. The
1750 * update of the index value happens in a loop, so that if somebody updates
1751 * the placeholder tuple after we read it, we detect the case and try again.
1752 * This ensures that the concurrently inserted tuples are not lost.
1753 *
1754 * A further corner case is this routine being asked to summarize the partial
1755 * range at the end of the table. heapNumBlocks is the (possibly outdated)
1756 * table size; if we notice that the requested range lies beyond that size,
1757 * we re-compute the table size after inserting the placeholder tuple, to
1758 * avoid missing pages that were appended recently.
1759 */
1760static void
1763{
1764 Buffer phbuf;
1766 Size phsz;
1767 OffsetNumber offset;
1769
1770 /*
1771 * Insert the placeholder tuple
1772 */
1774 phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
1775 offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
1776 state->bs_rmAccess, &phbuf,
1777 heapBlk, phtup, phsz);
1778
1779 /*
1780 * Compute range end. We hold ShareUpdateExclusive lock on table, so it
1781 * cannot shrink concurrently (but it can grow).
1782 */
1783 Assert(heapBlk % state->bs_pagesPerRange == 0);
1784 if (heapBlk + state->bs_pagesPerRange > heapNumBlks)
1785 {
1786 /*
1787 * If we're asked to scan what we believe to be the final range on the
1788 * table (i.e. a range that might be partial) we need to recompute our
1789 * idea of what the latest page is after inserting the placeholder
1790 * tuple. Anyone that grows the table later will update the
1791 * placeholder tuple, so it doesn't matter that we won't scan these
1792 * pages ourselves. Careful: the table might have been extended
1793 * beyond the current range, so clamp our result.
1794 *
1795 * Fortunately, this should occur infrequently.
1796 */
1797 scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,
1798 state->bs_pagesPerRange);
1799 }
1800 else
1801 {
1802 /* Easy case: range is known to be complete */
1803 scanNumBlks = state->bs_pagesPerRange;
1804 }
1805
1806 /*
1807 * Execute the partial heap scan covering the heap blocks in the specified
1808 * page range, summarizing the heap tuples in it. This scan stops just
1809 * short of brinbuildCallback creating the new index entry.
1810 *
1811 * Note that it is critical we use the "any visible" mode of
1812 * table_index_build_range_scan here: otherwise, we would miss tuples
1813 * inserted by transactions that are still in progress, among other corner
1814 * cases.
1815 */
1816 state->bs_currRangeStart = heapBlk;
1817 table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false,
1818 heapBlk, scanNumBlks,
1820
1821 /*
1822 * Now we update the values obtained by the scan with the placeholder
1823 * tuple. We do this in a loop which only terminates if we're able to
1824 * update the placeholder tuple successfully; if we are not, this means
1825 * somebody else modified the placeholder tuple after we read it.
1826 */
1827 for (;;)
1828 {
1830 Size newsize;
1831 bool didupdate;
1832 bool samepage;
1833
1835
1836 /*
1837 * Update the summary tuple and try to update.
1838 */
1839 newtup = brin_form_tuple(state->bs_bdesc,
1840 heapBlk, state->bs_dtuple, &newsize);
1842 didupdate =
1843 brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
1844 state->bs_rmAccess, heapBlk, phbuf, offset,
1848
1849 /* If the update succeeded, we're done. */
1850 if (didupdate)
1851 break;
1852
1853 /*
1854 * If the update didn't work, it might be because somebody updated the
1855 * placeholder tuple concurrently. Extract the new version, union it
1856 * with the values we have from the scan, and start over. (There are
1857 * other reasons for the update to fail, but it's simple to treat them
1858 * the same.)
1859 */
1860 phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
1861 &offset, &phsz, BUFFER_LOCK_SHARE);
1862 /* the placeholder tuple must exist */
1863 if (phtup == NULL)
1864 elog(ERROR, "missing placeholder tuple");
 /* NOTE(review): buffer-lock release / tuple copy presumably occur here
  * — those lines are not visible in this extract. */
1867
1868 /* merge it into the tuple from the heap scan */
1869 union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
1870 }
1871
1873}
1874
1875/*
1876 * Summarize page ranges that are not already summarized. If pageRange is
1877 * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
1878 * page range containing the given heap page number is scanned.
1879 * If include_partial is true, then the partial range at the end of the table
1880 * is summarized, otherwise not.
1881 *
1882 * For each new index tuple inserted, *numSummarized (if not NULL) is
1883 * incremented; for each existing tuple, *numExisting (if not NULL) is
1884 * incremented.
1885 */
1886static void
1888 bool include_partial, double *numSummarized, double *numExisting)
1889{
1892 IndexInfo *indexInfo = NULL;
1894 BlockNumber pagesPerRange;
1895 Buffer buf;
1897
1898 revmap = brinRevmapInitialize(index, &pagesPerRange);
1899
1900 /* determine range of pages to process */
 /* single-range case: clamp the scan window to just that one range */
1903 startBlk = 0;
1904 else
1905 {
1906 startBlk = (pageRange / pagesPerRange) * pagesPerRange;
1907 heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange);
1908 }
1909 if (startBlk > heapNumBlocks)
1910 {
1911 /* Nothing to do if start point is beyond end of table */
1913 return;
1914 }
1915
1916 /*
1917 * Scan the revmap to find unsummarized items.
1918 */
1920 for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)
1921 {
1922 BrinTuple *tup;
1923 OffsetNumber off;
1924
1925 /*
1926 * Unless requested to summarize even a partial range, go away now if
1927 * we think the next range is partial. Caller would pass true when it
1928 * is typically run once bulk data loading is done
1929 * (brin_summarize_new_values), and false when it is typically the
1930 * result of arbitrarily-scheduled maintenance command (vacuuming).
1931 */
1932 if (!include_partial &&
1933 (startBlk + pagesPerRange > heapNumBlocks))
1934 break;
1935
1937
1940 if (tup == NULL)
1941 {
1942 /* no revmap entry for this heap range. Summarize it. */
1943 if (state == NULL)
1944 {
1945 /* first time through */
1946 Assert(!indexInfo);
1948 pagesPerRange,
1950 indexInfo = BuildIndexInfo(index);
1951 }
1952 summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);
1953
1954 /* and re-initialize state for the next range */
1955 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1956
1957 if (numSummarized)
1958 *numSummarized += 1.0;
1959 }
1960 else
1961 {
 /* range already has a summary tuple; just count it */
1962 if (numExisting)
1963 *numExisting += 1.0;
1965 }
1966 }
1967
1968 if (BufferIsValid(buf))
1970
1971 /* free resources */
1973 if (state)
1974 {
1976 pfree(indexInfo);
1977 }
1978}
1979
1980/*
1981 * Given a deformed tuple in the build state, convert it into the on-disk
1982 * format and insert it into the index, making the revmap point to it.
1983 */
1984static void
1986{
1987 BrinTuple *tup;
1988 Size size;
1989
1990 tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
1991 state->bs_dtuple, &size);
 /*
  * bs_currentInsertBuf is passed by address so brin_doinsert can keep the
  * last-used index buffer pinned across consecutive insertions.
  */
1992 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
1993 &state->bs_currentInsertBuf, state->bs_currRangeStart,
1994 tup, size);
1995 state->bs_numtuples++;
1996
1997 pfree(tup);
1998}
1999
2000/*
2001 * Given a deformed tuple in the build state, convert it into the on-disk
2002 * format and write it to a (shared) tuplesort (the leader will insert it
2003 * into the index later).
2004 */
2005static void
2007{
2008 BrinTuple *tup;
2009 Size size;
2010
2011 /* don't insert empty tuples in parallel build */
2012 if (state->bs_dtuple->bt_empty_range)
2013 return;
2014
2015 tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
2016 state->bs_dtuple, &size);
2017
2018 /* write the BRIN tuple to the tuplesort */
2019 tuplesort_putbrintuple(state->bs_sortstate, tup, size);
2020
 /* counts tuples spilled by this participant, not final index tuples */
2021 state->bs_numtuples++;
2022
2023 pfree(tup);
2024}
2025
2026/*
2027 * Given two deformed tuples, adjust the first one so that it's consistent
2028 * with the summary values in both.
2029 */
/*
 * "a" is the in/out operand (updated in place); "b" is an on-disk tuple
 * that is first deformed into a throwaway memory context.
 */
2030static void
2032{
2033 int keyno;
2034 BrinMemTuple *db;
2035 MemoryContext cxt;
2037
2038 /* Use our own memory context to avoid retail pfree */
2040 "brin union",
2043 db = brin_deform_tuple(bdesc, b, NULL);
2045
2046 /*
2047 * Check if the ranges are empty.
2048 *
2049 * If at least one of them is empty, we don't need to call per-key union
2050 * functions at all. If "b" is empty, we just use "a" as the result (it
2051 * might be empty fine, but that's fine). If "a" is empty but "b" is not,
2052 * we use "b" as the result (but we have to copy the data into "a" first).
2053 *
2054 * Only when both ranges are non-empty, we actually do the per-key merge.
2055 */
2056
2057 /* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */
2058 if (db->bt_empty_range)
2059 {
2060 /* skip the per-key merge */
2062 return;
2063 }
2064
2065 /*
2066 * Now we know "b" is not empty. If "a" is empty, then "b" is the result.
2067 * But we need to copy the data from "b" to "a" first, because that's how
2068 * we pass result out.
2069 *
2070 * We have to copy all the global/per-key flags etc. too.
2071 */
2072 if (a->bt_empty_range)
2073 {
2074 for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2075 {
2076 int i;
2077 BrinValues *col_a = &a->bt_columns[keyno];
2078 BrinValues *col_b = &db->bt_columns[keyno];
2079 BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2080
2081 col_a->bv_allnulls = col_b->bv_allnulls;
2082 col_a->bv_hasnulls = col_b->bv_hasnulls;
2083
2084 /* If "b" has no data, we're done. */
2085 if (col_b->bv_allnulls)
2086 continue;
2087
 /* datumCopy so "a" owns its copies after cxt is deleted */
2088 for (i = 0; i < opcinfo->oi_nstored; i++)
2089 col_a->bv_values[i] =
2090 datumCopy(col_b->bv_values[i],
2091 opcinfo->oi_typcache[i]->typbyval,
2092 opcinfo->oi_typcache[i]->typlen);
2093 }
2094
2095 /* "a" started empty, but "b" was not empty, so remember that */
2096 a->bt_empty_range = false;
2097
2098 /* skip the per-key merge */
2100 return;
2101 }
2102
2103 /* Now we know neither range is empty. */
2104 for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2105 {
2106 FmgrInfo *unionFn;
2107 BrinValues *col_a = &a->bt_columns[keyno];
2108 BrinValues *col_b = &db->bt_columns[keyno];
2109 BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2110
2111 if (opcinfo->oi_regular_nulls)
2112 {
2113 /* Does the "b" summary represent any NULL values? */
2114 bool b_has_nulls = (col_b->bv_hasnulls || col_b->bv_allnulls);
2115
2116 /* Adjust "hasnulls". */
2117 if (!col_a->bv_allnulls && b_has_nulls)
2118 col_a->bv_hasnulls = true;
2119
2120 /* If there are no values in B, there's nothing left to do. */
2121 if (col_b->bv_allnulls)
2122 continue;
2123
2124 /*
2125 * Adjust "allnulls". If A doesn't have values, just copy the
2126 * values from B into A, and we're done. We cannot run the
2127 * operators in this case, because values in A might contain
2128 * garbage. Note we already established that B contains values.
2129 *
2130 * Also adjust "hasnulls" in order not to forget the summary
2131 * represents NULL values. This is not redundant with the earlier
2132 * update, because that only happens when allnulls=false.
2133 */
2134 if (col_a->bv_allnulls)
2135 {
2136 int i;
2137
2138 col_a->bv_allnulls = false;
2139 col_a->bv_hasnulls = true;
2140
2141 for (i = 0; i < opcinfo->oi_nstored; i++)
2142 col_a->bv_values[i] =
2143 datumCopy(col_b->bv_values[i],
2144 opcinfo->oi_typcache[i]->typbyval,
2145 opcinfo->oi_typcache[i]->typlen);
2146
2147 continue;
2148 }
2149 }
2150
 /* both sides have real values: delegate to the opclass union proc */
2151 unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
2153 FunctionCall3Coll(unionFn,
2154 bdesc->bd_index->rd_indcollation[keyno],
2158 }
2159
2161}
2162
2163/*
2164 * brin_vacuum_scan
2165 * Do a complete scan of the index during VACUUM.
2166 *
2167 * This routine scans the complete index looking for uncataloged index pages,
2168 * i.e. those that might have been lost due to a crash after index extension
2169 * and such.
2170 */
2171static void
2173{
2175 ReadStream *stream;
2176 Buffer buf;
2177
2178 p.current_blocknum = 0;
2180
2181 /*
2182 * It is safe to use batchmode as block_range_read_stream_cb takes no
2183 * locks.
2184 */
2188 strategy,
2189 idxrel,
2192 &p,
2193 0);
2194
2195 /*
2196 * Scan the index in physical order, and clean up any possible mess in
2197 * each page.
2198 */
2199 while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
2200 {
 /* NOTE(review): per-page cleanup + buffer release presumably happen
  * here — the statements are not visible in this extract. */
2202
2204
2206 }
2207
2208 read_stream_end(stream);
2209
2210 /*
2211 * Update all upper pages in the index's FSM, as well. This ensures not
2212 * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
2213 * but also that any pre-existing damage or out-of-dateness is repaired.
2214 */
2216}
2217
/*
 * add_values_to_range
 *		Fold one heap tuple's column values (values[]/nulls[]) into the
 *		deformed summary tuple dtup.  Returns true iff dtup was modified
 *		and therefore must be written back to the index.
 */
2218static bool
2220 const Datum *values, const bool *nulls)
2221{
2222 int keyno;
2223
2224 /* If the range starts empty, we're certainly going to modify it. */
2225 bool modified = dtup->bt_empty_range;
2226
2227 /*
2228 * Compare the key values of the new tuple to the stored index values; our
2229 * deformed tuple will get updated if the new tuple doesn't fit the
2230 * original range (note this means we can't break out of the loop early).
2231 * Make a note of whether this happens, so that we know to insert the
2232 * modified tuple later.
2233 */
2234 for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2235 {
2236 Datum result;
2237 BrinValues *bval;
2239 bool has_nulls;
2240
2241 bval = &dtup->bt_columns[keyno];
2242
2243 /*
2244 * Does the range have actual NULL values? Either of the flags can be
2245 * set, but we ignore the state before adding first row.
2246 *
2247 * We have to remember this, because we'll modify the flags and we
2248 * need to know if the range started as empty.
2249 */
2250 has_nulls = ((!dtup->bt_empty_range) &&
2251 (bval->bv_hasnulls || bval->bv_allnulls));
2252
2253 /*
2254 * If the value we're adding is NULL, handle it locally. Otherwise
2255 * call the BRIN_PROCNUM_ADDVALUE procedure.
2256 */
2257 if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno])
2258 {
2259 /*
2260 * If the new value is null, we record that we saw it if it's the
2261 * first one; otherwise, there's nothing to do.
2262 */
2263 if (!bval->bv_hasnulls)
2264 {
2265 bval->bv_hasnulls = true;
2266 modified = true;
2267 }
2268
2269 continue;
2270 }
2271
2272 addValue = index_getprocinfo(idxRel, keyno + 1,
2274 result = FunctionCall4Coll(addValue,
2275 idxRel->rd_indcollation[keyno],
2277 PointerGetDatum(bval),
2278 values[keyno],
2279 BoolGetDatum(nulls[keyno]));
2280 /* if that returned true, we need to insert the updated tuple */
2281 modified |= DatumGetBool(result);
2282
2283 /*
2284 * If the range was had actual NULL values (i.e. did not start empty),
2285 * make sure we don't forget about the NULL values. Either the
2286 * allnulls flag is still set to true, or (if the opclass cleared it)
2287 * we need to set hasnulls=true.
2288 *
2289 * XXX This can only happen when the opclass modified the tuple, so
2290 * the modified flag should be set.
2291 */
2292 if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))
2293 {
2295 bval->bv_hasnulls = true;
2296 }
2297 }
2298
2299 /*
2300 * After updating summaries for all the keys, mark it as not empty.
2301 *
2302 * If we're actually changing the flag value (i.e. tuple started as
2303 * empty), we should have modified the tuple. So we should not see empty
2304 * range that was not modified.
2305 */
2306 Assert(!dtup->bt_empty_range || modified);
2307 dtup->bt_empty_range = false;
2308
2309 return modified;
2310}
2311
/*
 * check_null_keys
 *		Return false if the range summary bval provably cannot satisfy the
 *		IS NULL / IS NOT NULL scan keys in nullkeys[]; true if the range
 *		might contain matching tuples and must be visited.
 */
2312static bool
2314{
2315 int keyno;
2316
2317 /*
2318 * First check if there are any IS [NOT] NULL scan keys, and if we're
2319 * violating them.
2320 */
2321 for (keyno = 0; keyno < nnullkeys; keyno++)
2322 {
2323 ScanKey key = nullkeys[keyno];
2324
2325 Assert(key->sk_attno == bval->bv_attno);
2326
2327 /* Handle only IS NULL/IS NOT NULL tests */
2328 if (!(key->sk_flags & SK_ISNULL))
2329 continue;
2330
2331 if (key->sk_flags & SK_SEARCHNULL)
2332 {
2333 /* IS NULL scan key, but range has no NULLs */
2334 if (!bval->bv_allnulls && !bval->bv_hasnulls)
2335 return false;
2336 }
2337 else if (key->sk_flags & SK_SEARCHNOTNULL)
2338 {
2339 /*
2340 * For IS NOT NULL, we can only skip ranges that are known to have
2341 * only nulls.
2342 */
2343 if (bval->bv_allnulls)
2344 return false;
2345 }
2346 else
2347 {
2348 /*
2349 * Neither IS NULL nor IS NOT NULL was used; assume all indexable
2350 * operators are strict and thus return false with NULL value in
2351 * the scan key.
2352 */
2353 return false;
2354 }
2355 }
2356
2357 return true;
2358}
2359
2360/*
2361 * Create parallel context, and launch workers for leader.
2362 *
2363 * buildstate argument should be initialized (with the exception of the
2364 * tuplesort states, which may later be created based on shared
2365 * state initially set up here).
2366 *
2367 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
2368 *
2369 * request is the target number of parallel worker processes to launch.
2370 *
2371 * Sets buildstate's BrinLeader, which caller must use to shut down parallel
2372 * mode by passing it to _brin_end_parallel() at the very end of its index
2373 * build. If not even a single worker process can be launched, this is
2374 * never set, and caller should proceed with a serial index build.
2375 */
2376static void
2378 bool isconcurrent, int request)
2379{
2380 ParallelContext *pcxt;
2381 int scantuplesortstates;
2382 Snapshot snapshot;
2384 Size estsort;
2385 BrinShared *brinshared;
2386 Sharedsort *sharedsort;
2388 WalUsage *walusage;
2389 BufferUsage *bufferusage;
2390 bool leaderparticipates = true;
2391 int querylen;
2392
2393#ifdef DISABLE_LEADER_PARTICIPATION
2394 leaderparticipates = false;
2395#endif
2396
2397 /*
2398 * Enter parallel mode, and create context for parallel build of brin
2399 * index
2400 */
2402 Assert(request > 0);
2403 pcxt = CreateParallelContext("postgres", "_brin_parallel_build_main",
2404 request);
2405
2406 scantuplesortstates = leaderparticipates ? request + 1 : request;
2407
2408 /*
2409 * Prepare for scan of the base relation. In a normal index build, we use
2410 * SnapshotAny because we must retrieve all tuples and do our own time
2411 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
2412 * concurrent build, we take a regular MVCC snapshot and index whatever's
2413 * live according to that.
2414 */
2415 if (!isconcurrent)
2416 snapshot = SnapshotAny;
2417 else
2419
2420 /*
2421 * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace.
2422 */
2425 estsort = tuplesort_estimate_shared(scantuplesortstates);
2427
2429
2430 /*
2431 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
2432 * and PARALLEL_KEY_BUFFER_USAGE.
2433 *
2434 * If there are no extensions loaded that care, we could skip this. We
2435 * have no way of knowing whether anyone's looking at pgWalUsage or
2436 * pgBufferUsage, so do it unconditionally.
2437 */
2439 mul_size(sizeof(WalUsage), pcxt->nworkers));
2442 mul_size(sizeof(BufferUsage), pcxt->nworkers));
2444
2445 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
2447 {
2451 }
2452 else
2453 querylen = 0; /* keep compiler quiet */
2454
2455 /* Everyone's had a chance to ask for space, so now create the DSM */
2457
2458 /* If no DSM segment was available, back out (do serial build) */
2459 if (pcxt->seg == NULL)
2460 {
2461 if (IsMVCCSnapshot(snapshot))
2462 UnregisterSnapshot(snapshot);
2465 return;
2466 }
2467
2468 /* Store shared build state, for which we reserved space */
2469 brinshared = (BrinShared *) shm_toc_allocate(pcxt->toc, estbrinshared);
2470 /* Initialize immutable state */
2471 brinshared->heaprelid = RelationGetRelid(heap);
2472 brinshared->indexrelid = RelationGetRelid(index);
2473 brinshared->isconcurrent = isconcurrent;
2474 brinshared->scantuplesortstates = scantuplesortstates;
2475 brinshared->pagesPerRange = buildstate->bs_pagesPerRange;
2476 brinshared->queryid = pgstat_get_my_query_id();
2478 SpinLockInit(&brinshared->mutex);
2479
2480 /* Initialize mutable state */
2481 brinshared->nparticipantsdone = 0;
2482 brinshared->reltuples = 0.0;
2483 brinshared->indtuples = 0.0;
2484
2487 snapshot);
2488
2489 /*
2490 * Store shared tuplesort-private state, for which we reserved space.
2491 * Then, initialize opaque state using tuplesort routine.
2492 */
2493 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
2494 tuplesort_initialize_shared(sharedsort, scantuplesortstates,
2495 pcxt->seg);
2496
2497 /*
2498 * Store shared tuplesort-private state, for which we reserved space.
2499 * Then, initialize opaque state using tuplesort routine.
2500 */
2501 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BRIN_SHARED, brinshared);
2502 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
2503
2504 /* Store query string for workers */
2506 {
2507 char *sharedquery;
2508
2509 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
2512 }
2513
2514 /*
2515 * Allocate space for each worker's WalUsage and BufferUsage; no need to
2516 * initialize.
2517 */
2518 walusage = shm_toc_allocate(pcxt->toc,
2519 mul_size(sizeof(WalUsage), pcxt->nworkers));
2520 shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
2521 bufferusage = shm_toc_allocate(pcxt->toc,
2522 mul_size(sizeof(BufferUsage), pcxt->nworkers));
2523 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
2524
2525 /* Launch workers, saving status for leader/caller */
2527 brinleader->pcxt = pcxt;
2528 brinleader->nparticipanttuplesorts = pcxt->nworkers_launched;
2530 brinleader->nparticipanttuplesorts++;
2531 brinleader->brinshared = brinshared;
2532 brinleader->sharedsort = sharedsort;
2533 brinleader->snapshot = snapshot;
2534 brinleader->walusage = walusage;
2535 brinleader->bufferusage = bufferusage;
2536
2537 /* If no workers were successfully launched, back out (do serial build) */
2538 if (pcxt->nworkers_launched == 0)
2539 {
2541 return;
2542 }
2543
2544 /* Save leader state now that it's clear build will be parallel */
2545 buildstate->bs_leader = brinleader;
2546
2547 /* Join heap scan ourselves */
2550
2551 /*
2552 * Caller needs to wait for all launched workers when we return. Make
2553 * sure that the failure-to-start case will not hang forever.
2554 */
2556}
2557
2558/*
2559 * Shut down workers, destroy parallel context, and end parallel mode.
2560 */
2561static void
2563{
2564 int i;
2565
2566 /* Shutdown worker processes */
2568
2569 /*
2570 * Next, accumulate WAL usage. (This must wait for the workers to finish,
2571 * or we might get incomplete data.)
2572 */
2573 for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)
2574 InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);
2575
2576 /* Free last reference to MVCC snapshot, if one was used */
2577 if (IsMVCCSnapshot(brinleader->snapshot))
2578 UnregisterSnapshot(brinleader->snapshot);
2581}
2582
2583/*
2584 * Within leader, wait for end of heap scan.
2585 *
2586 * When called, parallel heap scan started by _brin_begin_parallel() will
2587 * already be underway within worker processes (when leader participates
2588 * as a worker, we should end up here just as workers are finishing).
2589 *
2590 * Returns the total number of heap tuples scanned.
2591 */
2592static double
2594{
2595 BrinShared *brinshared = state->bs_leader->brinshared;
2596 int nparticipanttuplesorts;
2597
2598 nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
2599 for (;;)
2600 {
2601 SpinLockAcquire(&brinshared->mutex);
2602 if (brinshared->nparticipantsdone == nparticipanttuplesorts)
2603 {
2604 /* copy the data into leader state */
2605 state->bs_reltuples = brinshared->reltuples;
2606 state->bs_numtuples = brinshared->indtuples;
2607
2608 SpinLockRelease(&brinshared->mutex);
2609 break;
2610 }
2611 SpinLockRelease(&brinshared->mutex);
2612
2615 }
2616
2618
2619 return state->bs_reltuples;
2620}
2621
2622/*
2623 * Within leader, wait for end of heap scan and merge per-worker results.
2624 *
2625 * After waiting for all workers to finish, merge the per-worker results into
2626 * the complete index. The results from each worker are sorted by block number
2627 * (start of the page range). While combining the per-worker results we merge
2628 * summaries for the same page range, and also fill-in empty summaries for
2629 * ranges without any tuples.
2630 *
2631 * Returns the total number of heap tuples scanned.
2632 */
2633static double
2635{
2636 BrinTuple *btup;
2638 Size tuplen;
2639 BlockNumber prevblkno = InvalidBlockNumber;
2641 oldCxt;
2642 double reltuples;
2643
2644 /* wait for workers to scan table and produce partial results */
2645 reltuples = _brin_parallel_heapscan(state);
2646
2647 /* do the actual sort in the leader */
2648 tuplesort_performsort(state->bs_sortstate);
2649
2650 /*
2651 * Initialize BrinMemTuple we'll use to union summaries from workers (in
2652 * case they happened to produce parts of the same page range).
2653 */
2654 memtuple = brin_new_memtuple(state->bs_bdesc);
2655
2656 /*
2657 * Create a memory context we'll reset to combine results for a single
2658 * page range (received from the workers). We don't expect huge number of
2659 * overlaps under regular circumstances, because for large tables the
2660 * chunk size is likely larger than the BRIN page range), but it can
2661 * happen, and the union functions may do all kinds of stuff. So we better
2662 * reset the context once in a while.
2663 */
2665 "brin union",
2668
2669 /*
2670 * Read the BRIN tuples from the shared tuplesort, sorted by block number.
2671 * That probably gives us an index that is cheaper to scan, thanks to
2672 * mostly getting data from the same index page as before.
2673 */
2674 while ((btup = tuplesort_getbrintuple(state->bs_sortstate, &tuplen, true)) != NULL)
2675 {
2676 /* Ranges should be multiples of pages_per_range for the index. */
2677 Assert(btup->bt_blkno % state->bs_leader->brinshared->pagesPerRange == 0);
2678
2679 /*
2680 * Do we need to union summaries for the same page range?
2681 *
2682 * If this is the first brin tuple we read, then just deform it into
2683 * the memtuple, and continue with the next one from tuplesort. We
2684 * however may need to insert empty summaries into the index.
2685 *
2686 * If it's the same block as the last we saw, we simply union the brin
2687 * tuple into it, and we're done - we don't even need to insert empty
2688 * ranges, because that was done earlier when we saw the first brin
2689 * tuple (for this range).
2690 *
2691 * Finally, if it's not the first brin tuple, and it's not the same
2692 * page range, we need to do the insert and then deform the tuple into
2693 * the memtuple. Then we'll insert empty ranges before the new brin
2694 * tuple, if needed.
2695 */
2696 if (prevblkno == InvalidBlockNumber)
2697 {
2698 /* First brin tuples, just deform into memtuple. */
2700
2701 /* continue to insert empty pages before thisblock */
2702 }
2703 else if (memtuple->bt_blkno == btup->bt_blkno)
2704 {
2705 /*
2706 * Not the first brin tuple, but same page range as the previous
2707 * one, so we can merge it into the memtuple.
2708 */
2709 union_tuples(state->bs_bdesc, memtuple, btup);
2710 continue;
2711 }
2712 else
2713 {
2714 BrinTuple *tmp;
2715 Size len;
2716
2717 /*
2718 * We got brin tuple for a different page range, so form a brin
2719 * tuple from the memtuple, insert it, and re-init the memtuple
2720 * from the new brin tuple.
2721 */
2722 tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2723 memtuple, &len);
2724
2725 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2726 &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2727
2728 /*
2729 * Reset the per-output-range context. This frees all the memory
2730 * possibly allocated by the union functions, and also the BRIN
2731 * tuple we just formed and inserted.
2732 */
2734
2736
2737 /* continue to insert empty pages before thisblock */
2738 }
2739
2740 /* Fill empty ranges for all ranges missing in the tuplesort. */
2741 brin_fill_empty_ranges(state, prevblkno, btup->bt_blkno);
2742
2743 prevblkno = btup->bt_blkno;
2744 }
2745
2746 tuplesort_end(state->bs_sortstate);
2747
2748 /* Fill the BRIN tuple for the last page range with data. */
2749 if (prevblkno != InvalidBlockNumber)
2750 {
2751 BrinTuple *tmp;
2752 Size len;
2753
2754 tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2755 memtuple, &len);
2756
2757 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2758 &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2759
2760 pfree(tmp);
2761 }
2762
2763 /* Fill empty ranges at the end, for all ranges missing in the tuplesort. */
2764 brin_fill_empty_ranges(state, prevblkno, state->bs_maxRangeStart);
2765
2766 /*
2767 * Switch back to the original memory context, and destroy the one we
2768 * created to isolate the union_tuple calls.
2769 */
2772
2773 return reltuples;
2774}
2775
2776/*
2777 * Returns size of shared memory required to store state for a parallel
2778 * brin index build based on the snapshot its parallel scan will use.
2779 */
2780static Size
2782{
2783 /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
2784 return add_size(BUFFERALIGN(sizeof(BrinShared)),
2785 table_parallelscan_estimate(heap, snapshot));
2786}
2787
2788/*
2789 * Within leader, participate as a parallel worker.
2790 */
2791static void
2793{
2794 BrinLeader *brinleader = buildstate->bs_leader;
2795 int sortmem;
2796
2797 /*
2798 * Might as well use reliable figure when doling out maintenance_work_mem
2799 * (when requested number of workers were not launched, this will be
2800 * somewhat higher than it is for other workers).
2801 */
2803
2804 /* Perform work common to all participants */
2806 brinleader->sharedsort, heap, index, sortmem, true);
2807}
2808
2809/*
2810 * Perform a worker's portion of a parallel sort.
2811 *
2812 * This generates a tuplesort for the worker portion of the table.
2813 *
2814 * sortmem is the amount of working memory to use within each worker,
2815 * expressed in KBs.
2816 *
2817 * When this returns, workers are done, and need only release resources.
2818 */
2819static void
2821 BrinShared *brinshared, Sharedsort *sharedsort,
2822 Relation heap, Relation index,
2823 int sortmem, bool progress)
2824{
2826 TableScanDesc scan;
2827 double reltuples;
2828 IndexInfo *indexInfo;
2829
2830 /* Initialize local tuplesort coordination state */
2832 coordinate->isWorker = true;
2833 coordinate->nParticipants = -1;
2834 coordinate->sharedsort = sharedsort;
2835
2836 /* Begin "partial" tuplesort */
2839
2840 /* Join parallel scan */
2841 indexInfo = BuildIndexInfo(index);
2842 indexInfo->ii_Concurrent = brinshared->isconcurrent;
2843
2844 scan = table_beginscan_parallel(heap,
2846
2847 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
2849
2850 /* insert the last item */
2852
2853 /* sort the BRIN ranges built by this worker */
2854 tuplesort_performsort(state->bs_sortstate);
2855
2856 state->bs_reltuples += reltuples;
2857
2858 /*
2859 * Done. Record ambuild statistics.
2860 */
2861 SpinLockAcquire(&brinshared->mutex);
2862 brinshared->nparticipantsdone++;
2863 brinshared->reltuples += state->bs_reltuples;
2864 brinshared->indtuples += state->bs_numtuples;
2865 SpinLockRelease(&brinshared->mutex);
2866
2867 /* Notify leader */
2869
2870 tuplesort_end(state->bs_sortstate);
2871}
2872
2873/*
2874 * Perform work within a launched parallel process.
2875 */
2876void
2878{
2879 char *sharedquery;
2880 BrinShared *brinshared;
2881 Sharedsort *sharedsort;
2883 Relation heapRel;
2884 Relation indexRel;
2887 WalUsage *walusage;
2888 BufferUsage *bufferusage;
2889 int sortmem;
2890
2891 /*
2892 * The only possible status flag that can be set to the parallel worker is
2893 * PROC_IN_SAFE_IC.
2894 */
2895 Assert((MyProc->statusFlags == 0) ||
2897
2898 /* Set debug_query_string for individual workers first */
2901
2902 /* Report the query string from leader */
2904
2905 /* Look up brin shared state */
2906 brinshared = shm_toc_lookup(toc, PARALLEL_KEY_BRIN_SHARED, false);
2907
2908 /* Open relations using lock modes known to be obtained by index.c */
2909 if (!brinshared->isconcurrent)
2910 {
2913 }
2914 else
2915 {
2918 }
2919
2920 /* Track query ID */
2921 pgstat_report_query_id(brinshared->queryid, false);
2922
2923 /* Open relations within worker */
2924 heapRel = table_open(brinshared->heaprelid, heapLockmode);
2925 indexRel = index_open(brinshared->indexrelid, indexLockmode);
2926
2928 brinshared->pagesPerRange,
2930
2931 /* Look up shared state private to tuplesort.c */
2932 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
2933 tuplesort_attach_shared(sharedsort, seg);
2934
2935 /* Prepare to track buffer usage during parallel execution */
2937
2938 /*
2939 * Might as well use reliable figure when doling out maintenance_work_mem
2940 * (when requested number of workers were not launched, this will be
2941 * somewhat higher than it is for other workers).
2942 */
2944
2945 _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort,
2946 heapRel, indexRel, sortmem, false);
2947
2948 /* Report WAL/buffer usage during parallel execution */
2949 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
2950 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
2952 &walusage[ParallelWorkerNumber]);
2953
2954 index_close(indexRel, indexLockmode);
2955 table_close(heapRel, heapLockmode);
2956}
2957
2958/*
2959 * brin_build_empty_tuple
2960 * Maybe initialize a BRIN tuple representing empty range.
2961 *
2962 * Returns a BRIN tuple representing an empty page range starting at the
2963 * specified block number. The empty tuple is initialized only once, when it's
2964 * needed for the first time, stored in the memory context bs_context to ensure
2965 * proper life span, and reused on following calls. All empty tuples are
2966 * exactly the same except for the bt_blkno field, which is set to the value
2967 * in blkno parameter.
2968 */
2969static void
2971{
2972 /* First time an empty tuple is requested? If yes, initialize it. */
2973 if (state->bs_emptyTuple == NULL)
2974 {
2977
2978 /* Allocate the tuple in context for the whole index build. */
2979 oldcxt = MemoryContextSwitchTo(state->bs_context);
2980
2981 state->bs_emptyTuple = brin_form_tuple(state->bs_bdesc, blkno, dtuple,
2982 &state->bs_emptyTupleLen);
2983
2985 }
2986 else
2987 {
2988 /* If we already have an empty tuple, just update the block. */
2989 state->bs_emptyTuple->bt_blkno = blkno;
2990 }
2991}
2992
2993/*
2994 * brin_fill_empty_ranges
2995 * Add BRIN index tuples representing empty page ranges.
2996 *
2997 * prevRange/nextRange determine for which page ranges to add empty summaries.
2998 * Both boundaries are exclusive, i.e. only ranges starting at blkno for which
2999 * (prevRange < blkno < nextRange) will be added to the index.
3000 *
3001 * If prevRange is InvalidBlockNumber, this means there was no previous page
3002 * range (i.e. the first empty range to add is for blkno=0).
3003 *
3004 * The empty tuple is built only once, and then reused for all future calls.
3005 */
3006static void
3009{
3010 BlockNumber blkno;
3011
3012 /*
3013 * If we already summarized some ranges, we need to start with the next
3014 * one. Otherwise start from the first range of the table.
3015 */
3016 blkno = (prevRange == InvalidBlockNumber) ? 0 : (prevRange + state->bs_pagesPerRange);
3017
3018 /* Generate empty ranges until we hit the next non-empty range. */
3019 while (blkno < nextRange)
3020 {
3021 /* Did we already build the empty tuple? If not, do it now. */
3023
3024 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
3025 &state->bs_currentInsertBuf,
3026 blkno, state->bs_emptyTuple, state->bs_emptyTupleLen);
3027
3028 /* try next page range */
3029 blkno += state->bs_pagesPerRange;
3030 }
3031}
@ ACLCHECK_NOT_OWNER
Definition acl.h:185
void aclcheck_error(AclResult aclerr, ObjectType objtype, const char *objectname)
Definition aclchk.c:2654
bool object_ownercheck(Oid classid, Oid objectid, Oid roleid)
Definition aclchk.c:4108
int16 AttrNumber
Definition attnum.h:21
static bool validate(Port *port, const char *auth)
Definition auth-oauth.c:638
bool AutoVacuumRequestWork(AutoVacuumWorkItemType type, Oid relationId, BlockNumber blkno)
@ AVW_BRINSummarizeRange
Definition autovacuum.h:25
int ParallelWorkerNumber
Definition parallel.c:117
void InitializeParallelDSM(ParallelContext *pcxt)
Definition parallel.c:213
void WaitForParallelWorkersToFinish(ParallelContext *pcxt)
Definition parallel.c:805
void LaunchParallelWorkers(ParallelContext *pcxt)
Definition parallel.c:583
void DestroyParallelContext(ParallelContext *pcxt)
Definition parallel.c:959
ParallelContext * CreateParallelContext(const char *library_name, const char *function_name, int nworkers)
Definition parallel.c:175
void WaitForParallelWorkersToAttach(ParallelContext *pcxt)
Definition parallel.c:702
void pgstat_report_query_id(int64 query_id, bool force)
int64 pgstat_get_my_query_id(void)
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
#define MaxBlockNumber
Definition block.h:35
static Datum values[MAXATTR]
Definition bootstrap.c:187
#define PARALLEL_KEY_BUFFER_USAGE
Definition brin.c:53
void brininsertcleanup(Relation index, IndexInfo *indexInfo)
Definition brin.c:515
static double _brin_parallel_merge(BrinBuildState *state)
Definition brin.c:2634
static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
Definition brin.c:2172
Datum brin_desummarize_range(PG_FUNCTION_ARGS)
Definition brin.c:1494
void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition brin.c:962
static void terminate_brin_buildstate(BrinBuildState *state)
Definition brin.c:1716
#define PARALLEL_KEY_BRIN_SHARED
Definition brin.c:49
Datum brin_summarize_range(PG_FUNCTION_ARGS)
Definition brin.c:1384
IndexBulkDeleteResult * brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition brin.c:1306
static void form_and_spill_tuple(BrinBuildState *state)
Definition brin.c:2006
#define BRIN_ALL_BLOCKRANGES
Definition brin.c:211
Datum brin_summarize_new_values(PG_FUNCTION_ARGS)
Definition brin.c:1369
IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys)
Definition brin.c:542
bytea * brinoptions(Datum reloptions, bool validate)
Definition brin.c:1351
int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition brin.c:570
static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, bool include_partial, double *numSummarized, double *numExisting)
Definition brin.c:1887
static void form_and_insert_tuple(BrinBuildState *state)
Definition brin.c:1985
void brinbuildempty(Relation index)
Definition brin.c:1277
void brin_free_desc(BrinDesc *bdesc)
Definition brin.c:1639
static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
Definition brin.c:2031
static void _brin_parallel_scan_and_build(BrinBuildState *state, BrinShared *brinshared, Sharedsort *sharedsort, Relation heap, Relation index, int sortmem, bool progress)
Definition brin.c:2820
static BrinBuildState * initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap, BlockNumber pagesPerRange, BlockNumber tablePages)
Definition brin.c:1672
static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request)
Definition brin.c:2377
void brinGetStats(Relation index, BrinStatsData *stats)
Definition brin.c:1651
static void _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)
Definition brin.c:2792
static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup, const Datum *values, const bool *nulls)
Definition brin.c:2219
static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
Definition brin.c:2562
static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
Definition brin.c:2781
static void brin_fill_empty_ranges(BrinBuildState *state, BlockNumber prevRange, BlockNumber nextRange)
Definition brin.c:3007
IndexBuildResult * brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition brin.c:1108
IndexBulkDeleteResult * brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition brin.c:1321
static void summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, BlockNumber heapBlk, BlockNumber heapNumBlks)
Definition brin.c:1761
#define ParallelTableScanFromBrinShared(shared)
Definition brin.c:118
#define PARALLEL_KEY_TUPLESORT
Definition brin.c:50
static void brinbuildCallbackParallel(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *brstate)
Definition brin.c:1049
bool brininsert(Relation idxRel, Datum *values, bool *nulls, ItemPointer heaptid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition brin.c:347
#define PARALLEL_KEY_QUERY_TEXT
Definition brin.c:51
Datum brinhandler(PG_FUNCTION_ARGS)
Definition brin.c:252
BrinDesc * brin_build_desc(Relation rel)
Definition brin.c:1584
void _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Definition brin.c:2877
static void brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
Definition brin.c:2970
#define PARALLEL_KEY_WAL_USAGE
Definition brin.c:52
static double _brin_parallel_heapscan(BrinBuildState *state)
Definition brin.c:2593
static BrinInsertState * initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
Definition brin.c:318
static void brinbuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *brstate)
Definition brin.c:998
void brinendscan(IndexScanDesc scan)
Definition brin.c:981
static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
Definition brin.c:2313
#define BrinGetPagesPerRange(relation)
Definition brin.h:41
#define BrinGetAutoSummarize(relation)
Definition brin.h:47
#define BRIN_LAST_OPTIONAL_PROCNUM
#define BRIN_PROCNUM_UNION
#define BRIN_PROCNUM_OPTIONS
#define BRIN_PROCNUM_OPCINFO
#define BRIN_PROCNUM_CONSISTENT
#define BRIN_elog(args)
#define BRIN_PROCNUM_ADDVALUE
#define BRIN_CURRENT_VERSION
Definition brin_page.h:72
#define BRIN_METAPAGE_BLKNO
Definition brin_page.h:75
bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, BlockNumber heapBlk, Buffer oldbuf, OffsetNumber oldoff, const BrinTuple *origtup, Size origsz, const BrinTuple *newtup, Size newsz, bool samepage)
void brin_page_cleanup(Relation idxrel, Buffer buf)
OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, const BrinTuple *tup, Size itemsz)
void brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
bool brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
bool brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk)
void brinRevmapTerminate(BrinRevmap *revmap)
BrinRevmap * brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange)
Definition brin_revmap.c:70
BrinTuple * brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, Buffer *buf, OffsetNumber *off, Size *size, int mode)
BrinTuple * brin_copy_tuple(BrinTuple *tuple, Size len, BrinTuple *dest, Size *destsz)
Definition brin_tuple.c:446
BrinTuple * brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, Size *size)
Definition brin_tuple.c:100
BrinMemTuple * brin_new_memtuple(BrinDesc *brdesc)
Definition brin_tuple.c:482
void brin_free_tuple(BrinTuple *tuple)
Definition brin_tuple.c:433
BrinTuple * brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size)
Definition brin_tuple.c:388
BrinMemTuple * brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
Definition brin_tuple.c:511
BrinMemTuple * brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple)
Definition brin_tuple.c:553
bool brinvalidate(Oid opclassoid)
#define SizeOfBrinCreateIdx
Definition brin_xlog.h:55
#define XLOG_BRIN_CREATE_INDEX
Definition brin_xlog.h:31
int Buffer
Definition buf.h:23
#define BufferIsInvalid(buffer)
Definition buf.h:31
#define InvalidBuffer
Definition buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4357
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition bufmgr.c:974
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5505
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5522
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3063
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:874
#define RelationGetNumberOfBlocks(reln)
Definition bufmgr.h:307
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:470
@ BUFFER_LOCK_SHARE
Definition bufmgr.h:210
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:205
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:332
@ EB_SKIP_EXTENSION_LOCK
Definition bufmgr.h:75
@ EB_LOCK_FIRST
Definition bufmgr.h:87
#define BMR_REL(p_rel)
Definition bufmgr.h:114
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:421
Size PageGetFreeSpace(const PageData *page)
Definition bufpage.c:906
static char * PageGetContents(Page page)
Definition bufpage.h:283
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:269
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:417
PageData * Page
Definition bufpage.h:81
#define Min(x, y)
Definition c.h:1075
#define MAXALIGN(LEN)
Definition c.h:880
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:243
#define BUFFERALIGN(LEN)
Definition c.h:882
#define Assert(condition)
Definition c.h:927
int64_t int64
Definition c.h:597
int32_t int32
Definition c.h:596
uint64_t uint64
Definition c.h:601
#define lengthof(array)
Definition c.h:857
#define OidIsValid(objectId)
Definition c.h:842
size_t Size
Definition c.h:673
bool ConditionVariableCancelSleep(void)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void ConditionVariableSignal(ConditionVariable *cv)
Datum datumCopy(Datum value, bool typByVal, int typLen)
Definition datum.c:132
int errcode(int sqlerrcode)
Definition elog.c:874
#define LOG
Definition elog.h:31
int errhint(const char *fmt,...) pg_attribute_printf(1
#define DEBUG2
Definition elog.h:29
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc_array(type, count)
Definition fe_memutils.h:76
#define palloc0_array(type, count)
Definition fe_memutils.h:77
#define palloc0_object(type)
Definition fe_memutils.h:75
Datum FunctionCall4Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3, Datum arg4)
Definition fmgr.c:1197
Datum FunctionCall3Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3)
Definition fmgr.c:1172
void fmgr_info_copy(FmgrInfo *dstinfo, FmgrInfo *srcinfo, MemoryContext destcxt)
Definition fmgr.c:581
#define PG_RETURN_VOID()
Definition fmgr.h:350
#define PG_GETARG_OID(n)
Definition fmgr.h:275
#define DirectFunctionCall2(func, arg1, arg2)
Definition fmgr.h:686
#define PG_GETARG_DATUM(n)
Definition fmgr.h:268
#define PG_GETARG_INT64(n)
Definition fmgr.h:284
#define FunctionCall1(flinfo, arg1)
Definition fmgr.h:702
#define PG_RETURN_INT32(x)
Definition fmgr.h:355
#define PG_RETURN_POINTER(x)
Definition fmgr.h:363
#define PG_FUNCTION_ARGS
Definition fmgr.h:193
void FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
Definition freespace.c:377
void FreeSpaceMapVacuum(Relation rel)
Definition freespace.c:358
void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
Definition freespace.c:194
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition genam.c:80
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition genam.h:93
IndexUniqueCheck
Definition genam.h:122
int maintenance_work_mem
Definition globals.c:133
int NewGUCNestLevel(void)
Definition guc.c:2110
void RestrictSearchPath(void)
Definition guc.c:2121
void AtEOXact_GUC(bool isCommit, int nestLevel)
Definition guc.c:2137
Oid IndexGetRelation(Oid indexId, bool missing_ok)
Definition index.c:3583
IndexInfo * BuildIndexInfo(Relation index)
Definition index.c:2428
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition indexam.c:917
void index_close(Relation relation, LOCKMODE lockmode)
Definition indexam.c:177
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition indexam.c:133
void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition instrument.c:219
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition instrument.c:209
void InstrStartParallelQuery(void)
Definition instrument.c:201
int b
Definition isn.c:74
int a
Definition isn.c:73
int i
Definition isn.c:77
#define ItemIdGetLength(itemId)
Definition itemid.h:59
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition itemptr.h:103
int LOCKMODE
Definition lockdefs.h:26
#define AccessExclusiveLock
Definition lockdefs.h:43
#define AccessShareLock
Definition lockdefs.h:36
#define ShareUpdateExclusiveLock
Definition lockdefs.h:39
#define ShareLock
Definition lockdefs.h:40
#define RowExclusiveLock
Definition lockdefs.h:38
void MemoryContextReset(MemoryContext context)
Definition mcxt.c:403
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
void MemoryContextDelete(MemoryContext context)
Definition mcxt.c:472
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
#define ALLOCSET_SMALL_SIZES
Definition memutils.h:170
#define SECURITY_RESTRICTED_OPERATION
Definition miscadmin.h:319
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
#define END_CRIT_SECTION()
Definition miscadmin.h:152
void GetUserIdAndSecContext(Oid *userid, int *sec_context)
Definition miscinit.c:613
Oid GetUserId(void)
Definition miscinit.c:470
void SetUserIdAndSecContext(Oid userid, int sec_context)
Definition miscinit.c:620
static char * errmsg
uint16 OffsetNumber
Definition off.h:24
#define FirstOffsetNumber
Definition off.h:27
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition palloc.h:124
@ OBJECT_INDEX
FormData_pg_attribute * Form_pg_attribute
const void size_t len
static char buf[DEFAULT_XLOG_SEG_SIZE]
static int progress
Definition pgbench.c:262
#define ERRCODE_UNDEFINED_TABLE
Definition pgbench.c:79
#define pgstat_count_index_scan(rel)
Definition pgstat.h:708
const char * debug_query_string
Definition postgres.c:90
static Datum Int64GetDatum(int64 X)
Definition postgres.h:413
static bool DatumGetBool(Datum X)
Definition postgres.h:100
static Datum PointerGetDatum(const void *X)
Definition postgres.h:342
static Datum BoolGetDatum(bool X)
Definition postgres.h:112
static Datum ObjectIdGetDatum(Oid X)
Definition postgres.h:252
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332
static Datum Int32GetDatum(int32 X)
Definition postgres.h:212
#define InvalidOid
unsigned int Oid
static int fb(int x)
#define PROC_IN_SAFE_IC
Definition proc.h:60
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_MAINTENANCE
Definition read_stream.h:28
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
static void addrange(struct cvec *cv, chr from, chr to)
Definition regc_cvec.c:90
#define RelationGetRelid(relation)
Definition rel.h:514
#define RelationGetDescr(relation)
Definition rel.h:540
#define RelationGetRelationName(relation)
Definition rel.h:548
#define RelationNeedsWAL(relation)
Definition rel.h:637
void * build_reloptions(Datum reloptions, bool validate, relopt_kind kind, Size relopt_struct_size, const relopt_parse_elt *relopt_elems, int num_relopt_elems)
@ RELOPT_KIND_BRIN
Definition reloptions.h:53
@ RELOPT_TYPE_INT
Definition reloptions.h:33
@ RELOPT_TYPE_BOOL
Definition reloptions.h:31
@ MAIN_FORKNUM
Definition relpath.h:58
@ INIT_FORKNUM
Definition relpath.h:61
void brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition selfuncs.c:8971
void * shm_toc_allocate(shm_toc *toc, Size nbytes)
Definition shm_toc.c:88
void shm_toc_insert(shm_toc *toc, uint64 key, void *address)
Definition shm_toc.c:171
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition shm_toc.c:232
#define shm_toc_estimate_chunk(e, sz)
Definition shm_toc.h:51
#define shm_toc_estimate_keys(e, cnt)
Definition shm_toc.h:53
Size add_size(Size s1, Size s2)
Definition shmem.c:482
Size mul_size(Size s1, Size s2)
Definition shmem.c:497
#define SK_SEARCHNOTNULL
Definition skey.h:122
#define SK_SEARCHNULL
Definition skey.h:121
#define SK_ISNULL
Definition skey.h:115
Snapshot GetTransactionSnapshot(void)
Definition snapmgr.c:272
void UnregisterSnapshot(Snapshot snapshot)
Definition snapmgr.c:866
Snapshot RegisterSnapshot(Snapshot snapshot)
Definition snapmgr.c:824
#define SnapshotAny
Definition snapmgr.h:33
#define IsMVCCSnapshot(snapshot)
Definition snapmgr.h:59
static void SpinLockRelease(volatile slock_t *lock)
Definition spin.h:62
static void SpinLockAcquire(volatile slock_t *lock)
Definition spin.h:56
static void SpinLockInit(volatile slock_t *lock)
Definition spin.h:50
PGPROC * MyProc
Definition proc.c:68
BlockNumber bs_maxRangeStart
Definition brin.c:165
Size bs_emptyTupleLen
Definition brin.c:171
MemoryContext bs_context
Definition brin.c:172
BrinMemTuple * bs_dtuple
Definition brin.c:168
Relation bs_irel
Definition brin.c:159
BlockNumber bs_pagesPerRange
Definition brin.c:163
double bs_numtuples
Definition brin.c:160
Buffer bs_currentInsertBuf
Definition brin.c:162
BrinRevmap * bs_rmAccess
Definition brin.c:166
Tuplesortstate * bs_sortstate
Definition brin.c:187
BrinLeader * bs_leader
Definition brin.c:179
int bs_worker_id
Definition brin.c:180
BlockNumber bs_currRangeStart
Definition brin.c:164
double bs_reltuples
Definition brin.c:161
BrinDesc * bs_bdesc
Definition brin.c:167
BrinTuple * bs_emptyTuple
Definition brin.c:170
BrinDesc * bis_desc
Definition brin.c:197
BrinRevmap * bis_rmAccess
Definition brin.c:196
BlockNumber bis_pages_per_range
Definition brin.c:198
int nparticipanttuplesorts
Definition brin.c:135
WalUsage * walusage
Definition brin.c:149
BrinShared * brinshared
Definition brin.c:146
BufferUsage * bufferusage
Definition brin.c:150
Snapshot snapshot
Definition brin.c:148
Sharedsort * sharedsort
Definition brin.c:147
ParallelContext * pcxt
Definition brin.c:127
BrinValues bt_columns[FLEXIBLE_ARRAY_MEMBER]
Definition brin_tuple.h:55
bool bt_empty_range
Definition brin_tuple.h:47
BlockNumber lastRevmapPage
Definition brin_page.h:69
BlockNumber pagesPerRange
Definition brin_page.h:68
BlockNumber bo_pagesPerRange
Definition brin.c:206
BrinDesc * bo_bdesc
Definition brin.c:208
BrinRevmap * bo_rmAccess
Definition brin.c:207
slock_t mutex
Definition brin.c:89
int scantuplesortstates
Definition brin.c:70
int nparticipantsdone
Definition brin.c:101
Oid heaprelid
Definition brin.c:66
BlockNumber pagesPerRange
Definition brin.c:69
ConditionVariable workersdonecv
Definition brin.c:81
Oid indexrelid
Definition brin.c:67
bool isconcurrent
Definition brin.c:68
double indtuples
Definition brin.c:103
int64 queryid
Definition brin.c:73
double reltuples
Definition brin.c:102
BlockNumber revmapNumPages
Definition brin.h:36
BlockNumber pagesPerRange
Definition brin.h:35
BlockNumber bt_blkno
Definition brin_tuple.h:66
bool bv_hasnulls
Definition brin_tuple.h:32
AttrNumber bv_attno
Definition brin_tuple.h:31
bool bv_allnulls
Definition brin_tuple.h:33
NodeTag type
Definition amapi.h:234
double heap_tuples
Definition genam.h:38
double index_tuples
Definition genam.h:39
BlockNumber num_pages
Definition genam.h:83
double num_index_tuples
Definition genam.h:85
void * ii_AmCache
Definition execnodes.h:225
int ii_ParallelWorkers
Definition execnodes.h:220
bool ii_Concurrent
Definition execnodes.h:212
MemoryContext ii_Context
Definition execnodes.h:228
struct ScanKeyData * keyData
Definition relscan.h:142
struct IndexScanInstrumentation * instrument
Definition relscan.h:160
Relation indexRelation
Definition relscan.h:138
Relation index
Definition genam.h:52
bool analyze_only
Definition genam.h:54
BufferAccessStrategy strategy
Definition genam.h:59
uint8 statusFlags
Definition proc.h:202
dsm_segment * seg
Definition parallel.h:44
shm_toc_estimator estimator
Definition parallel.h:43
shm_toc * toc
Definition parallel.h:46
int nworkers_launched
Definition parallel.h:39
Form_pg_index rd_index
Definition rel.h:192
Form_pg_class rd_rel
Definition rel.h:111
Oid sk_collation
Definition skey.h:70
Definition type.h:96
Definition c.h:760
void table_close(Relation relation, LOCKMODE lockmode)
Definition table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition table.c:40
TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
Definition tableam.c:166
Size table_parallelscan_estimate(Relation rel, Snapshot snapshot)
Definition tableam.c:131
void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, Snapshot snapshot)
Definition tableam.c:146
static double table_index_build_range_scan(Relation table_rel, Relation index_rel, IndexInfo *index_info, bool allow_sync, bool anyvisible, bool progress, BlockNumber start_blockno, BlockNumber numblocks, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition tableam.h:1798
static double table_index_build_scan(Relation table_rel, Relation index_rel, IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition tableam.h:1765
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
Definition tidbitmap.c:432
static FormData_pg_attribute * TupleDescAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:178
void tuplesort_performsort(Tuplesortstate *state)
Definition tuplesort.c:1259
void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
Definition tuplesort.c:3210
Size tuplesort_estimate_shared(int nWorkers)
Definition tuplesort.c:3189
void tuplesort_end(Tuplesortstate *state)
Definition tuplesort.c:847
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition tuplesort.c:3233
#define TUPLESORT_NONE
Definition tuplesort.h:67
Tuplesortstate * tuplesort_begin_index_brin(int workMem, SortCoordinate coordinate, int sortopt)
BrinTuple * tuplesort_getbrintuple(Tuplesortstate *state, Size *len, bool forward)
void tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size)
#define VACUUM_OPTION_PARALLEL_CLEANUP
Definition vacuum.h:63
void ExitParallelMode(void)
Definition xact.c:1066
void EnterParallelMode(void)
Definition xact.c:1053
bool RecoveryInProgress(void)
Definition xlog.c:6444
uint64 XLogRecPtr
Definition xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition xloginsert.c:479
void XLogRegisterData(const void *data, uint32 len)
Definition xloginsert.c:369
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition xloginsert.c:246
void XLogBeginInsert(void)
Definition xloginsert.c:153
#define REGBUF_STANDARD
Definition xloginsert.h:35
#define REGBUF_WILL_INIT
Definition xloginsert.h:34