PostgreSQL Source Code  git master
brin.c
1 /*
2  * brin.c
3  * Implementation of BRIN indexes for Postgres
4  *
5  * See src/backend/access/brin/README for details.
6  *
7  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/backend/access/brin/brin.c
12  *
13  * TODO
14  * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
15  */
16 #include "postgres.h"
17 
18 #include "access/brin.h"
19 #include "access/brin_page.h"
20 #include "access/brin_pageops.h"
21 #include "access/brin_xlog.h"
22 #include "access/relation.h"
23 #include "access/reloptions.h"
24 #include "access/relscan.h"
25 #include "access/table.h"
26 #include "access/tableam.h"
27 #include "access/xloginsert.h"
28 #include "catalog/index.h"
29 #include "catalog/pg_am.h"
30 #include "commands/vacuum.h"
31 #include "miscadmin.h"
32 #include "pgstat.h"
33 #include "postmaster/autovacuum.h"
34 #include "storage/bufmgr.h"
35 #include "storage/freespace.h"
36 #include "tcop/tcopprot.h" /* pgrminclude ignore */
37 #include "utils/acl.h"
38 #include "utils/datum.h"
39 #include "utils/fmgrprotos.h"
40 #include "utils/guc.h"
41 #include "utils/index_selfuncs.h"
42 #include "utils/memutils.h"
43 #include "utils/rel.h"
44 #include "utils/tuplesort.h"
45 
46 /* Magic numbers for parallel state sharing */
47 #define PARALLEL_KEY_BRIN_SHARED UINT64CONST(0xB000000000000001)
48 #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
49 #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
50 #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
51 #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
52 
53 /*
54  * Status for index builds performed in parallel. This is allocated in a
55  * dynamic shared memory segment.
56  */
57 typedef struct BrinShared
58 {
59  /*
60  * These fields are not modified during the build. They primarily exist
61  * for the benefit of worker processes that need to create state
62  * corresponding to that used by the leader.
63  */
64  Oid heaprelid;
65  Oid indexrelid;
66  bool isconcurrent;
67  BlockNumber pagesPerRange;
68  int scantuplesortstates;
69 
70  /* Query ID, for report in worker processes */
71  uint64 queryid;
72 
73  /*
74  * workersdonecv is used to monitor the progress of workers. All parallel
75  * participants must indicate that they are done before leader can use
76  * results built by the workers (and before leader can write the data into
77  * the index).
78  */
79  ConditionVariable workersdonecv;
80 
81  /*
82  * mutex protects all fields before heapdesc.
83  *
84  * These fields contain status information of interest to BRIN index
85  * builds that must work just the same when an index is built in parallel.
86  */
87  slock_t mutex;
88 
89  /*
90  * Mutable state that is maintained by workers, and reported back to
91  * leader at end of the scans.
92  *
93  * nparticipantsdone is number of worker processes finished.
94  *
95  * reltuples is the total number of input heap tuples.
96  *
97  * indtuples is the total number of tuples that made it into the index.
98  */
99  int nparticipantsdone;
100  double reltuples;
101  double indtuples;
102 
103  /*
104  * ParallelTableScanDescData data follows. Can't directly embed here, as
105  * implementations of the parallel table scan desc interface might need
106  * stronger alignment.
107  */
108 } BrinShared;
109 
110 /*
111  * Return pointer to a BrinShared's parallel table scan.
112  *
113  * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
114  * MAXALIGN.
115  */
116 #define ParallelTableScanFromBrinShared(shared) \
117  (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BrinShared)))
118 
119 /*
120  * Status for leader in parallel index build.
121  */
122 typedef struct BrinLeader
123 {
124  /* parallel context itself */
125  ParallelContext *pcxt;
126 
127  /*
128  * nparticipanttuplesorts is the exact number of worker processes
129  * successfully launched, plus one leader process if it participates as a
130  * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
131  * participating as a worker).
132  */
133  int nparticipanttuplesorts;
134 
135  /*
136  * Leader process convenience pointers to shared state (leader avoids TOC
137  * lookups).
138  *
139  * brinshared is the shared state for entire build. sharedsort is the
140  * shared, tuplesort-managed state passed to each process tuplesort.
141  * snapshot is the snapshot used by the scan iff an MVCC snapshot is
142  * required.
143  */
144  BrinShared *brinshared;
145  Sharedsort *sharedsort;
146  Snapshot snapshot;
147  WalUsage *walusage;
148  BufferUsage *bufferusage;
149 } BrinLeader;
150 
151 /*
152  * We use a BrinBuildState during initial construction of a BRIN index.
153  * The running state is kept in a BrinMemTuple.
154  */
155 typedef struct BrinBuildState
156 {
157  Relation bs_irel;
158  double bs_numtuples;
159  double bs_reltuples;
160  Buffer bs_currentInsertBuf;
161  BlockNumber bs_pagesPerRange;
162  BlockNumber bs_currRangeStart;
163  BlockNumber bs_maxRangeStart;
164  BrinRevmap *bs_rmAccess;
165  BrinDesc *bs_bdesc;
166  BrinMemTuple *bs_dtuple;
167 
168  BrinTuple *bs_emptyTuple;
169  Size bs_emptyTupleLen;
170  MemoryContext bs_context;
171 
172  /*
173  * bs_leader is only present when a parallel index build is performed, and
174  * only in the leader process. (Actually, only the leader process has a
175  * BrinBuildState.)
176  */
177  BrinLeader *bs_leader;
178  int bs_worker_id;
179 
180  /*
181  * The sortstate is used by workers (including the leader). It has to be
182  * part of the build state, because that's the only thing passed to the
183  * build callback etc.
184  */
185  Tuplesortstate *bs_sortstate;
186 } BrinBuildState;
187 
188 /*
189  * We use a BrinInsertState to capture running state spanning multiple
190  * brininsert invocations, within the same command.
191  */
192 typedef struct BrinInsertState
193 {
194  BrinRevmap *bis_rmAccess;
195  BrinDesc *bis_desc;
196  BlockNumber bis_pages_per_range;
197 } BrinInsertState;
198 
199 /*
200  * Struct used as "opaque" during index scans
201  */
202 typedef struct BrinOpaque
203 {
204  BlockNumber bo_pagesPerRange;
205  BrinRevmap *bo_rmAccess;
206  BrinDesc *bo_bdesc;
207 } BrinOpaque;
208 
209 #define BRIN_ALL_BLOCKRANGES InvalidBlockNumber
210 
211 static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
212  BrinRevmap *revmap,
213  BlockNumber pagesPerRange,
214  BlockNumber tablePages);
217 static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
218  bool include_partial, double *numSummarized, double *numExisting);
219 static void form_and_insert_tuple(BrinBuildState *state);
220 static void form_and_spill_tuple(BrinBuildState *state);
221 static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
222  BrinTuple *b);
223 static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
224 static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
225  BrinMemTuple *dtup, const Datum *values, const bool *nulls);
226 static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
227 static void brin_fill_empty_ranges(BrinBuildState *state,
228  BlockNumber prevRange, BlockNumber nextRange);
229 
230 /* parallel index builds */
231 static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
232  bool isconcurrent, int request);
233 static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state);
234 static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot);
235 static double _brin_parallel_heapscan(BrinBuildState *state);
236 static double _brin_parallel_merge(BrinBuildState *state);
237 static void _brin_leader_participate_as_worker(BrinBuildState *buildstate,
238  Relation heap, Relation index);
239 static void _brin_parallel_scan_and_build(BrinBuildState *state,
240  BrinShared *brinshared,
241  Sharedsort *sharedsort,
242  Relation heap, Relation index,
243  int sortmem, bool progress);
244 
245 /*
246  * BRIN handler function: return IndexAmRoutine with access method parameters
247  * and callbacks.
248  */
249 Datum
250 brinhandler(PG_FUNCTION_ARGS)
251 {
252  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
253 
254  amroutine->amstrategies = 0;
255  amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM;
256  amroutine->amoptsprocnum = BRIN_PROCNUM_OPTIONS;
257  amroutine->amcanorder = false;
258  amroutine->amcanorderbyop = false;
259  amroutine->amcanbackward = false;
260  amroutine->amcanunique = false;
261  amroutine->amcanmulticol = true;
262  amroutine->amoptionalkey = true;
263  amroutine->amsearcharray = false;
264  amroutine->amsearchnulls = true;
265  amroutine->amstorage = true;
266  amroutine->amclusterable = false;
267  amroutine->ampredlocks = false;
268  amroutine->amcanparallel = false;
269  amroutine->amcanbuildparallel = true;
270  amroutine->amcaninclude = false;
271  amroutine->amusemaintenanceworkmem = false;
272  amroutine->amsummarizing = true;
273  amroutine->amparallelvacuumoptions =
274  VACUUM_OPTION_PARALLEL_CLEANUP;
275  amroutine->amkeytype = InvalidOid;
276 
277  amroutine->ambuild = brinbuild;
278  amroutine->ambuildempty = brinbuildempty;
279  amroutine->aminsert = brininsert;
280  amroutine->aminsertcleanup = brininsertcleanup;
281  amroutine->ambulkdelete = brinbulkdelete;
282  amroutine->amvacuumcleanup = brinvacuumcleanup;
283  amroutine->amcanreturn = NULL;
284  amroutine->amcostestimate = brincostestimate;
285  amroutine->amgettreeheight = NULL;
286  amroutine->amoptions = brinoptions;
287  amroutine->amproperty = NULL;
288  amroutine->ambuildphasename = NULL;
289  amroutine->amvalidate = brinvalidate;
290  amroutine->amadjustmembers = NULL;
291  amroutine->ambeginscan = brinbeginscan;
292  amroutine->amrescan = brinrescan;
293  amroutine->amgettuple = NULL;
294  amroutine->amgetbitmap = bringetbitmap;
295  amroutine->amendscan = brinendscan;
296  amroutine->ammarkpos = NULL;
297  amroutine->amrestrpos = NULL;
298  amroutine->amestimateparallelscan = NULL;
299  amroutine->aminitparallelscan = NULL;
300  amroutine->amparallelrescan = NULL;
301 
302  PG_RETURN_POINTER(amroutine);
303 }
304 
305 /*
306  * Initialize a BrinInsertState to maintain state to be used across multiple
307  * tuple inserts, within the same command.
308  */
309 static BrinInsertState *
310 initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
311 {
312  BrinInsertState *bistate;
313  MemoryContext oldcxt;
314 
315  oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
316  bistate = palloc0(sizeof(BrinInsertState));
317  bistate->bis_desc = brin_build_desc(idxRel);
318  bistate->bis_rmAccess = brinRevmapInitialize(idxRel,
319  &bistate->bis_pages_per_range);
320  indexInfo->ii_AmCache = bistate;
321  MemoryContextSwitchTo(oldcxt);
322 
323  return bistate;
324 }
325 
326 /*
327  * A tuple in the heap is being inserted. To keep a brin index up to date,
328  * we need to obtain the relevant index tuple and compare its stored values
329  * with those of the new tuple. If the tuple values are not consistent with
330  * the summary tuple, we need to update the index tuple.
331  *
332  * If autosummarization is enabled, check if we need to summarize the previous
333  * page range.
334  *
335  * If the range is not currently summarized (i.e. the revmap returns NULL for
336  * it), there's nothing to do for this tuple.
337  */
338 bool
339 brininsert(Relation idxRel, Datum *values, bool *nulls,
340  ItemPointer heaptid, Relation heapRel,
341  IndexUniqueCheck checkUnique,
342  bool indexUnchanged,
343  IndexInfo *indexInfo)
344 {
345  BlockNumber pagesPerRange;
346  BlockNumber origHeapBlk;
347  BlockNumber heapBlk;
348  BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
349  BrinRevmap *revmap;
350  BrinDesc *bdesc;
351  Buffer buf = InvalidBuffer;
352  MemoryContext tupcxt = NULL;
353  MemoryContext oldcxt = CurrentMemoryContext;
354  bool autosummarize = BrinGetAutoSummarize(idxRel);
355 
356  /*
357  * If first time through in this statement, initialize the insert state
358  * that we keep for all the inserts in the command.
359  */
360  if (!bistate)
361  bistate = initialize_brin_insertstate(idxRel, indexInfo);
362 
363  revmap = bistate->bis_rmAccess;
364  bdesc = bistate->bis_desc;
365  pagesPerRange = bistate->bis_pages_per_range;
366 
367  /*
368  * origHeapBlk is the block number where the insertion occurred. heapBlk
369  * is the first block in the corresponding page range.
370  */
371  origHeapBlk = ItemPointerGetBlockNumber(heaptid);
372  heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
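 /*
  * Worked example (editorial illustration, not part of brin.c): with
  * pagesPerRange = 128, an insertion into heap block 300 gives
  * heapBlk = (300 / 128) * 128 = 256, i.e. the first block of the
  * range covering blocks 256..383.
  */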
373 
374  for (;;)
375  {
376  bool need_insert = false;
377  OffsetNumber off;
378  BrinTuple *brtup;
379  BrinMemTuple *dtup;
380 
381  CHECK_FOR_INTERRUPTS();
382 
383  /*
384  * If auto-summarization is enabled and we just inserted the first
385  * tuple into the first block of a new non-first page range, request a
386  * summarization run of the previous range.
387  */
388  if (autosummarize &&
389  heapBlk > 0 &&
390  heapBlk == origHeapBlk &&
391  ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)
392  {
393  BlockNumber lastPageRange = heapBlk - 1;
394  BrinTuple *lastPageTuple;
395 
396  lastPageTuple =
397  brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,
398  NULL, BUFFER_LOCK_SHARE);
399  if (!lastPageTuple)
400  {
401  bool recorded;
402 
403  recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange,
404  RelationGetRelid(idxRel),
405  lastPageRange);
406  if (!recorded)
407  ereport(LOG,
408  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
409  errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",
410  RelationGetRelationName(idxRel),
411  lastPageRange)));
412  }
413  else
414  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
415  }
416 
417  brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
418  NULL, BUFFER_LOCK_SHARE);
419 
420  /* if range is unsummarized, there's nothing to do */
421  if (!brtup)
422  break;
423 
424  /* First time through in this brininsert call? */
425  if (tupcxt == NULL)
426  {
427  tupcxt = AllocSetContextCreate(CurrentMemoryContext,
428  "brininsert cxt",
429  ALLOCSET_DEFAULT_SIZES);
430  MemoryContextSwitchTo(tupcxt);
431  }
432 
433  dtup = brin_deform_tuple(bdesc, brtup, NULL);
434 
435  need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls);
436 
437  if (!need_insert)
438  {
439  /*
440  * The tuple is consistent with the new values, so there's nothing
441  * to do.
442  */
443  break;
444  }
445  else
446  {
447  Page page = BufferGetPage(buf);
448  ItemId lp = PageGetItemId(page, off);
449  Size origsz;
450  BrinTuple *origtup;
451  Size newsz;
452  BrinTuple *newtup;
453  bool samepage;
454 
455  /*
456  * Make a copy of the old tuple, so that we can compare it after
457  * re-acquiring the lock.
458  */
459  origsz = ItemIdGetLength(lp);
460  origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);
461 
462  /*
463  * Before releasing the lock, check if we can attempt a same-page
464  * update. Another process could insert a tuple concurrently in
465  * the same page though, so downstream we must be prepared to cope
466  * if this turns out to not be possible after all.
467  */
468  newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
469  samepage = brin_can_do_samepage_update(buf, origsz, newsz);
470  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
471 
472  /*
473  * Try to update the tuple. If this doesn't work for whatever
474  * reason, we need to restart from the top; the revmap might be
475  * pointing at a different tuple for this block now, so we need to
476  * recompute to ensure both our new heap tuple and the other
477  * inserter's are covered by the combined tuple. It might be that
478  * we don't need to update at all.
479  */
480  if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
481  buf, off, origtup, origsz, newtup, newsz,
482  samepage))
483  {
484  /* no luck; start over */
485  MemoryContextReset(tupcxt);
486  continue;
487  }
488  }
489 
490  /* success! */
491  break;
492  }
493 
494  if (BufferIsValid(buf))
495  ReleaseBuffer(buf);
496  MemoryContextSwitchTo(oldcxt);
497  if (tupcxt != NULL)
498  MemoryContextDelete(tupcxt);
499 
500  return false;
501 }
502 
503 /*
504  * Callback to clean up the BrinInsertState once all tuple inserts are done.
505  */
506 void
508 {
509  BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
510 
511  /* bail out if cache not initialized */
512  if (indexInfo->ii_AmCache == NULL)
513  return;
514 
515  /*
516  * Clean up the revmap. Note that the brinDesc has already been cleaned up
517  * as part of its own memory context.
518  */
519  brinRevmapTerminate(bistate->bis_rmAccess);
520  bistate->bis_rmAccess = NULL;
521  bistate->bis_desc = NULL;
522 }
523 
524 /*
525  * Initialize state for a BRIN index scan.
526  *
527  * We read the metapage here to determine the pages-per-range number that this
528  * index was built with. Note that since this cannot be changed while we're
529  * holding lock on index, it's not necessary to recompute it during brinrescan.
530  */
531 IndexScanDesc
532 brinbeginscan(Relation r, int nkeys, int norderbys)
533 {
534  IndexScanDesc scan;
535  BrinOpaque *opaque;
536 
537  scan = RelationGetIndexScan(r, nkeys, norderbys);
538 
539  opaque = palloc_object(BrinOpaque);
540  opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange);
541  opaque->bo_bdesc = brin_build_desc(r);
542  scan->opaque = opaque;
543 
544  return scan;
545 }
546 
547 /*
548  * Execute the index scan.
549  *
550  * This works by reading index TIDs from the revmap, and obtaining the index
551  * tuples pointed to by them; the summary values in the index tuples are
552  * compared to the scan keys. We return into the TID bitmap all the pages in
553  * ranges corresponding to index tuples that match the scan keys.
554  *
555  * If a TID from the revmap is read as InvalidTID, we know that range is
556  * unsummarized. Pages in those ranges need to be returned regardless of scan
557  * keys.
558  */
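/*
 * Illustrative context (editorial addition, not part of brin.c): this
 * function is reached through the amgetbitmap interface when the planner
 * chooses a Bitmap Heap Scan driven by a BRIN index, e.g. for a query such
 * as
 *
 *   SELECT * FROM events
 *    WHERE created_at BETWEEN '2024-01-01' AND '2024-01-31';
 *
 * assuming a BRIN index on events(created_at); the table and column names
 * here are hypothetical.
 */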
559 int64
560 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
561 {
562  Relation idxRel = scan->indexRelation;
563  Buffer buf = InvalidBuffer;
564  BrinDesc *bdesc;
565  Oid heapOid;
566  Relation heapRel;
567  BrinOpaque *opaque;
568  BlockNumber nblocks;
569  BlockNumber heapBlk;
570  int totalpages = 0;
571  FmgrInfo *consistentFn;
572  MemoryContext oldcxt;
573  MemoryContext perRangeCxt;
574  BrinMemTuple *dtup;
575  BrinTuple *btup = NULL;
576  Size btupsz = 0;
577  ScanKey **keys,
578  **nullkeys;
579  int *nkeys,
580  *nnullkeys;
581  char *ptr;
582  Size len;
583  char *tmp PG_USED_FOR_ASSERTS_ONLY;
584 
585  opaque = (BrinOpaque *) scan->opaque;
586  bdesc = opaque->bo_bdesc;
587  pgstat_count_index_scan(idxRel);
588 
589  /*
590  * We need to know the size of the table so that we know how long to
591  * iterate on the revmap.
592  */
593  heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
594  heapRel = table_open(heapOid, AccessShareLock);
595  nblocks = RelationGetNumberOfBlocks(heapRel);
596  table_close(heapRel, AccessShareLock);
597 
598  /*
599  * Make room for the consistent support procedures of indexed columns. We
600  * don't look them up here; we do that lazily the first time we see a scan
601  * key reference each of them. We rely on zeroing fn_oid to InvalidOid.
602  */
603  consistentFn = palloc0_array(FmgrInfo, bdesc->bd_tupdesc->natts);
604 
605  /*
606  * Make room for per-attribute lists of scan keys that we'll pass to the
607  * consistent support procedure. We don't know which attributes have scan
608  * keys, so we allocate space for all attributes. That may use more memory
609  * but it's probably cheaper than determining which attributes are used.
610  *
611  * We keep null and regular keys separate, so that we can pass just the
612  * regular keys to the consistent function easily.
613  *
614  * To reduce the allocation overhead, we allocate one big chunk and then
615  * carve it into smaller arrays ourselves. All the pieces have exactly the
616  * same lifetime, so that's OK.
617  *
618  * XXX The widest index can have 32 attributes, so the amount of wasted
619  * memory is negligible. We could invent a more compact approach (with
620  * just space for used attributes) but that would make the matching more
621  * complex so it's not a good trade-off.
622  */
623  len =
624  MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* regular keys */
625  MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
626  MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) +
627  MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* NULL keys */
628  MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
629  MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
630 
631  ptr = palloc(len);
632  tmp = ptr;
633 
634  keys = (ScanKey **) ptr;
635  ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
636 
637  nullkeys = (ScanKey **) ptr;
638  ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
639 
640  nkeys = (int *) ptr;
641  ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
642 
643  nnullkeys = (int *) ptr;
644  ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
645 
646  for (int i = 0; i < bdesc->bd_tupdesc->natts; i++)
647  {
648  keys[i] = (ScanKey *) ptr;
649  ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
650 
651  nullkeys[i] = (ScanKey *) ptr;
652  ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
653  }
654 
655  Assert(tmp + len == ptr);
656 
657  /* zero the number of keys */
658  memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
659  memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
660 
661  /* Preprocess the scan keys - split them into per-attribute arrays. */
662  for (int keyno = 0; keyno < scan->numberOfKeys; keyno++)
663  {
664  ScanKey key = &scan->keyData[keyno];
665  AttrNumber keyattno = key->sk_attno;
666 
667  /*
668  * The collation of the scan key must match the collation used in the
669  * index column (but only if the search is not IS NULL/ IS NOT NULL).
670  * Otherwise we shouldn't be using this index ...
671  */
672  Assert((key->sk_flags & SK_ISNULL) ||
673  (key->sk_collation ==
674  TupleDescAttr(bdesc->bd_tupdesc,
675  keyattno - 1)->attcollation));
676 
677  /*
678  * First time we see this index attribute, so init as needed.
679  *
680  * This is a bit of an overkill - we don't know how many scan keys are
681  * there for this attribute, so we simply allocate the largest number
682  * possible (as if all keys were for this attribute). This may waste a
683  * bit of memory, but we only expect small number of scan keys in
684  * general, so this should be negligible, and repeated repalloc calls
685  * are not free either.
686  */
687  if (consistentFn[keyattno - 1].fn_oid == InvalidOid)
688  {
689  FmgrInfo *tmp;
690 
691  /* First time we see this attribute, so no key/null keys. */
692  Assert(nkeys[keyattno - 1] == 0);
693  Assert(nnullkeys[keyattno - 1] == 0);
694 
695  tmp = index_getprocinfo(idxRel, keyattno,
696  BRIN_PROCNUM_CONSISTENT);
697  fmgr_info_copy(&consistentFn[keyattno - 1], tmp,
698  CurrentMemoryContext);
699  }
700 
701  /* Add key to the proper per-attribute array. */
702  if (key->sk_flags & SK_ISNULL)
703  {
704  nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key;
705  nnullkeys[keyattno - 1]++;
706  }
707  else
708  {
709  keys[keyattno - 1][nkeys[keyattno - 1]] = key;
710  nkeys[keyattno - 1]++;
711  }
712  }
713 
714  /* allocate an initial in-memory tuple, out of the per-range memcxt */
715  dtup = brin_new_memtuple(bdesc);
716 
717  /*
718  * Setup and use a per-range memory context, which is reset every time we
719  * loop below. This avoids having to free the tuples within the loop.
720  */
721  perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
722  "bringetbitmap cxt",
723  ALLOCSET_DEFAULT_SIZES);
724  oldcxt = MemoryContextSwitchTo(perRangeCxt);
725 
726  /*
727  * Now scan the revmap. We start by querying for heap page 0,
728  * incrementing by the number of pages per range; this gives us a full
729  * view of the table.
730  */
731  for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
732  {
733  bool addrange;
734  bool gottuple = false;
735  BrinTuple *tup;
736  OffsetNumber off;
737  Size size;
738 
739  CHECK_FOR_INTERRUPTS();
740 
741  MemoryContextReset(perRangeCxt);
742 
743  tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
744  &off, &size, BUFFER_LOCK_SHARE);
745  if (tup)
746  {
747  gottuple = true;
748  btup = brin_copy_tuple(tup, size, btup, &btupsz);
749  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
750  }
751 
752  /*
753  * For page ranges with no indexed tuple, we must return the whole
754  * range; otherwise, compare it to the scan keys.
755  */
756  if (!gottuple)
757  {
758  addrange = true;
759  }
760  else
761  {
762  dtup = brin_deform_tuple(bdesc, btup, dtup);
763  if (dtup->bt_placeholder)
764  {
765  /*
766  * Placeholder tuples are always returned, regardless of the
767  * values stored in them.
768  */
769  addrange = true;
770  }
771  else
772  {
773  int attno;
774 
775  /*
776  * Compare scan keys with summary values stored for the range.
777  * If scan keys are matched, the page range must be added to
778  * the bitmap. We initially assume the range needs to be
779  * added; in particular this serves the case where there are
780  * no keys.
781  */
782  addrange = true;
783  for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++)
784  {
785  BrinValues *bval;
786  Datum add;
787  Oid collation;
788 
789  /*
790  * skip attributes without any scan keys (both regular and
791  * IS [NOT] NULL)
792  */
793  if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0)
794  continue;
795 
796  bval = &dtup->bt_columns[attno - 1];
797 
798  /*
799  * If the BRIN tuple indicates that this range is empty,
800  * we can skip it: there's nothing to match. We don't
801  * need to examine the next columns.
802  */
803  if (dtup->bt_empty_range)
804  {
805  addrange = false;
806  break;
807  }
808 
809  /*
810  * First check if there are any IS [NOT] NULL scan keys,
811  * and if we're violating them. In that case we can
812  * terminate early, without invoking the support function.
813  *
814  * As there may be more keys, we can only determine
815  * mismatch within this loop.
816  */
817  if (bdesc->bd_info[attno - 1]->oi_regular_nulls &&
818  !check_null_keys(bval, nullkeys[attno - 1],
819  nnullkeys[attno - 1]))
820  {
821  /*
822  * If any of the IS [NOT] NULL keys failed, the page
823  * range as a whole can't pass. So terminate the loop.
824  */
825  addrange = false;
826  break;
827  }
828 
829  /*
830  * So either there are no IS [NOT] NULL keys, or all
831  * passed. If there are no regular scan keys, we're done -
832  * the page range matches. If there are regular keys, but
833  * the page range is marked as 'all nulls' it can't
834  * possibly pass (we're assuming the operators are
835  * strict).
836  */
837 
838  /* No regular scan keys - page range as a whole passes. */
839  if (!nkeys[attno - 1])
840  continue;
841 
842  Assert((nkeys[attno - 1] > 0) &&
843  (nkeys[attno - 1] <= scan->numberOfKeys));
844 
845  /* If it is all nulls, it cannot possibly be consistent. */
846  if (bval->bv_allnulls)
847  {
848  addrange = false;
849  break;
850  }
851 
852  /*
853  * Collation from the first key (has to be the same for
854  * all keys for the same attribute).
855  */
856  collation = keys[attno - 1][0]->sk_collation;
857 
858  /*
859  * Check whether the scan key is consistent with the page
860  * range values; if so, have the pages in the range added
861  * to the output bitmap.
862  *
863  * The opclass may or may not support processing of
864  * multiple scan keys. We can determine that based on the
865  * number of arguments - functions with extra parameter
866  * (number of scan keys) do support this, otherwise we
867  * have to simply pass the scan keys one by one.
868  */
869  if (consistentFn[attno - 1].fn_nargs >= 4)
870  {
871  /* Check all keys at once */
872  add = FunctionCall4Coll(&consistentFn[attno - 1],
873  collation,
874  PointerGetDatum(bdesc),
875  PointerGetDatum(bval),
876  PointerGetDatum(keys[attno - 1]),
877  Int32GetDatum(nkeys[attno - 1]));
878  addrange = DatumGetBool(add);
879  }
880  else
881  {
882  /*
883  * Check keys one by one
884  *
885  * When there are multiple scan keys, failure to meet
886  * the criteria for a single one of them is enough to
887  * discard the range as a whole, so break out of the
888  * loop as soon as a false return value is obtained.
889  */
890  int keyno;
891 
892  for (keyno = 0; keyno < nkeys[attno - 1]; keyno++)
893  {
894  add = FunctionCall3Coll(&consistentFn[attno - 1],
895  keys[attno - 1][keyno]->sk_collation,
896  PointerGetDatum(bdesc),
897  PointerGetDatum(bval),
898  PointerGetDatum(keys[attno - 1][keyno]));
899  addrange = DatumGetBool(add);
900  if (!addrange)
901  break;
902  }
903  }
904 
905  /*
906  * If we found a scan key eliminating the range, no need
907  * to check additional ones.
908  */
909  if (!addrange)
910  break;
911  }
912  }
913  }
914 
915  /* add the pages in the range to the output bitmap, if needed */
916  if (addrange)
917  {
918  BlockNumber pageno;
919 
920  for (pageno = heapBlk;
921  pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1;
922  pageno++)
923  {
924  MemoryContextSwitchTo(oldcxt);
925  tbm_add_page(tbm, pageno);
926  totalpages++;
927  MemoryContextSwitchTo(perRangeCxt);
928  }
929  }
930  }
931 
932  MemoryContextSwitchTo(oldcxt);
933  MemoryContextDelete(perRangeCxt);
934 
935  if (buf != InvalidBuffer)
936  ReleaseBuffer(buf);
937 
938  /*
939  * XXX We have an approximation of the number of *pages* that our scan
940  * returns, but we don't have a precise idea of the number of heap tuples
941  * involved.
942  */
943  return totalpages * 10;
944 }
945 
946 /*
947  * Re-initialize state for a BRIN index scan
948  */
949 void
950 brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
951  ScanKey orderbys, int norderbys)
952 {
953  /*
954  * Other index AMs preprocess the scan keys at this point, or sometime
955  * early during the scan; this lets them optimize by removing redundant
956  * keys, or doing early returns when they are impossible to satisfy; see
957  * _bt_preprocess_keys for an example. Something like that could be added
958  * here someday, too.
959  */
960 
961  if (scankey && scan->numberOfKeys > 0)
962  memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
963 }
964 
965 /*
966  * Close down a BRIN index scan
967  */
968 void
969 brinendscan(IndexScanDesc scan)
970 {
971  BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
972 
974  brin_free_desc(opaque->bo_bdesc);
975  pfree(opaque);
976 }
977 
978 /*
979  * Per-heap-tuple callback for table_index_build_scan.
980  *
981  * Note we don't worry about the page range at the end of the table here; it is
982  * present in the build state struct after we're called the last time, but not
983  * inserted into the index. Caller must ensure to do so, if appropriate.
984  */
985 static void
986 brinbuildCallback(Relation index,
987  ItemPointer tid,
988  Datum *values,
989  bool *isnull,
990  bool tupleIsAlive,
991  void *brstate)
992 {
993  BrinBuildState *state = (BrinBuildState *) brstate;
994  BlockNumber thisblock;
995 
996  thisblock = ItemPointerGetBlockNumber(tid);
997 
998  /*
999  * If we're in a block that belongs to a future range, summarize what
1000  * we've got and start afresh. Note the scan might have skipped many
1001  * pages, if they were devoid of live tuples; make sure to insert index
1002  * tuples for those too.
1003  */
1004  while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
1005  {
1006 
1007  BRIN_elog((DEBUG2,
1008  "brinbuildCallback: completed a range: %u--%u",
1009  state->bs_currRangeStart,
1010  state->bs_currRangeStart + state->bs_pagesPerRange));
1011 
1012  /* create the index tuple and insert it */
1013  form_and_insert_tuple(state);
1014 
1015  /* set state to correspond to the next range */
1016  state->bs_currRangeStart += state->bs_pagesPerRange;
1017 
1018  /* re-initialize state for it */
1019  brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1020  }
1021 
1022  /* Accumulate the current tuple into the running state */
1023  (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1024  values, isnull);
1025 }
1026 
1027 /*
1028  * Per-heap-tuple callback for table_index_build_scan with parallelism.
1029  *
1030  * A version of the callback used by parallel index builds. The main difference
1031  * is that instead of writing the BRIN tuples into the index, we write them
1032  * into a shared tuplesort, and leave the insertion up to the leader (which may
1033  * reorder them a bit etc.). The callback also does not generate empty ranges,
1034  * those will be added by the leader when merging results from workers.
1035  */
1036 static void
1037 brinbuildCallbackParallel(Relation index,
1038  ItemPointer tid,
1039  Datum *values,
1040  bool *isnull,
1041  bool tupleIsAlive,
1042  void *brstate)
1043 {
1044  BrinBuildState *state = (BrinBuildState *) brstate;
1045  BlockNumber thisblock;
1046 
1047  thisblock = ItemPointerGetBlockNumber(tid);
1048 
1049  /*
1050  * If we're in a block that belongs to a different range, summarize what
1051  * we've got and start afresh. Note the scan might have skipped many
1052  * pages, if they were devoid of live tuples; we do not create empty BRIN
1053  * ranges here - the leader is responsible for filling them in.
1054  *
1055  * Unlike serial builds, parallel index builds allow synchronized seqscans
1056  * (because that's what parallel scans do). This means the block may wrap
1057  * around to the beginning of the relation, so the condition needs to
1058  * check for both future and past ranges.
1059  */
1060  if ((thisblock < state->bs_currRangeStart) ||
1061  (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1))
1062  {
1063 
1064  BRIN_elog((DEBUG2,
1065  "brinbuildCallbackParallel: completed a range: %u--%u",
1066  state->bs_currRangeStart,
1067  state->bs_currRangeStart + state->bs_pagesPerRange));
1068 
1069  /* create the index tuple and write it into the tuplesort */
1070  form_and_spill_tuple(state);
1071 
1072  /*
1073  * Set state to correspond to the next range (for this block).
1074  *
1075  * This skips ranges that are either empty (and so we don't get any
1076  * tuples to summarize), or processed by other workers. We can't
1077  * differentiate those cases here easily, so we leave it up to the
1078  * leader to fill empty ranges where needed.
1079  */
1080  state->bs_currRangeStart
1081  = state->bs_pagesPerRange * (thisblock / state->bs_pagesPerRange);
1082 
1083  /* re-initialize state for it */
1084  brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1085  }
1086 
1087  /* Accumulate the current tuple into the running state */
1088  (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1089  values, isnull);
1090 }
1091 
1092 /*
1093  * brinbuild() -- build a new BRIN index.
1094  */
1095 IndexBuildResult *
1096 brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
1097 {
1098  IndexBuildResult *result;
1099  double reltuples;
1100  double idxtuples;
1101  BrinRevmap *revmap;
1102  BrinBuildState *state;
1103  Buffer meta;
1104  BlockNumber pagesPerRange;
1105 
1106  /*
1107  * We expect to be called exactly once for any index relation.
1108  */
1109  if (RelationGetNumberOfBlocks(index) != 0)
1110  elog(ERROR, "index \"%s\" already contains data",
1111  RelationGetRelationName(index));
1112 
1113  /*
1114  * Critical section not required, because on error the creation of the
1115  * whole relation will be rolled back.
1116  */
1117 
1118  meta = ExtendBufferedRel(BMR_REL(index), MAIN_FORKNUM, NULL,
1119  EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
1120  Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
1121 
1122  brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
1123  BRIN_CURRENT_VERSION);
1124  MarkBufferDirty(meta);
1125 
1126  if (RelationNeedsWAL(index))
1127  {
1128  xl_brin_createidx xlrec;
1129  XLogRecPtr recptr;
1130  Page page;
1131 
1132  xlrec.version = BRIN_CURRENT_VERSION;
1133  xlrec.pagesPerRange = BrinGetPagesPerRange(index);
1134 
1135  XLogBeginInsert();
1136  XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx);
1137  XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD);
1138 
1139  recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);
1140 
1141  page = BufferGetPage(meta);
1142  PageSetLSN(page, recptr);
1143  }
1144 
1145  UnlockReleaseBuffer(meta);
1146 
1147  /*
1148  * Initialize our state, including the deformed tuple state.
1149  */
1150  revmap = brinRevmapInitialize(index, &pagesPerRange);
1151  state = initialize_brin_buildstate(index, revmap, pagesPerRange,
1152  RelationGetNumberOfBlocks(heap));
1153 
1154  /*
1155  * Attempt to launch parallel worker scan when required
1156  *
1157  * XXX plan_create_index_workers makes the number of workers dependent on
1158  * maintenance_work_mem, requiring 32MB for each worker. That makes sense
1159  * for btree, but not for BRIN, which can do with much less memory. So
1160  * maybe make that somehow less strict, optionally?
1161  */
1162  if (indexInfo->ii_ParallelWorkers > 0)
1163  _brin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
1164  indexInfo->ii_ParallelWorkers);
1165 
1166  /*
1167  * If parallel build requested and at least one worker process was
1168  * successfully launched, set up coordination state, wait for workers to
1169  * complete. Then read all tuples from the shared tuplesort and insert
1170  * them into the index.
1171  *
1172  * In serial mode, simply scan the table and build the index one index
1173  * tuple at a time.
1174  */
1175  if (state->bs_leader)
1176  {
1177  SortCoordinate coordinate;
1178 
1179  coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
1180  coordinate->isWorker = false;
1181  coordinate->nParticipants =
1182  state->bs_leader->nparticipanttuplesorts;
1183  coordinate->sharedsort = state->bs_leader->sharedsort;
1184 
1185  /*
1186  * Begin leader tuplesort.
1187  *
1188  * In cases where parallelism is involved, the leader receives the
1189  * same share of maintenance_work_mem as a serial sort (it is
1190  * generally treated in the same way as a serial sort once we return).
1191  * Parallel worker Tuplesortstates will have received only a fraction
1192  * of maintenance_work_mem, though.
1193  *
1194  * We rely on the lifetime of the Leader Tuplesortstate almost not
1195  * overlapping with any worker Tuplesortstate's lifetime. There may
1196  * be some small overlap, but that's okay because we rely on leader
1197  * Tuplesortstate only allocating a small, fixed amount of memory
1198  * here. When its tuplesort_performsort() is called (by our caller),
1199  * and significant amounts of memory are likely to be used, all
1200  * workers must have already freed almost all memory held by their
1201  * Tuplesortstates (they are about to go away completely, too). The
1202  * overall effect is that maintenance_work_mem always represents an
1203  * absolute high watermark on the amount of memory used by a CREATE
1204  * INDEX operation, regardless of the use of parallelism or any other
1205  * factor.
1206  */
1207  state->bs_sortstate =
1208  tuplesort_begin_index_brin(maintenance_work_mem, coordinate,
1209  TUPLESORT_NONE);
1210 
1211  /* scan the relation and merge per-worker results */
1212  reltuples = _brin_parallel_merge(state);
1213 
1214  _brin_end_parallel(state->bs_leader, state);
1215  }
1216  else /* no parallel index build */
1217  {
1218  /*
1219  * Now scan the relation. No syncscan allowed here because we want
1220  * the heap blocks in physical order (we want to produce the ranges
1221  * starting from block 0, and the callback also relies on this to not
1222  * generate summary for the same range twice).
1223  */
1224  reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
1225  brinbuildCallback, (void *) state, NULL);
1226 
1227  /*
1228  * process the final batch
1229  *
1230  * XXX Note this does not update state->bs_currRangeStart, i.e. it
1231  * stays set to the last range added to the index. This is OK, because
1232  * that's what brin_fill_empty_ranges expects.
1233  */
1234  form_and_insert_tuple(state);
1235 
1236  /*
1237  * Backfill the final ranges with empty data.
1238  *
1239  * This saves us from doing what amounts to full table scans when the
1240  * index with a predicate like WHERE (nonnull_column IS NULL), or
1241  * other very selective predicates.
1242  */
1243  brin_fill_empty_ranges(state,
1244  state->bs_currRangeStart,
1245  state->bs_maxRangeStart);
1246  }
1247 
1248  /* release resources */
1249  idxtuples = state->bs_numtuples;
1250  brinRevmapTerminate(state->bs_rmAccess);
1251  terminate_brin_buildstate(state);
1252 
1253  /*
1254  * Return statistics
1255  */
1256  result = palloc_object(IndexBuildResult);
1257 
1258  result->heap_tuples = reltuples;
1259  result->index_tuples = idxtuples;
1260 
1261  return result;
1262 }
1263 
1264 void
1265 brinbuildempty(Relation index)
1266 {
1267  Buffer metabuf;
1268 
1269  /* An empty BRIN index has a metapage only. */
1270  metabuf = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
1271  EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
1272 
1273  /* Initialize and xlog metabuffer. */
1274  START_CRIT_SECTION();
1275  brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
1276  BRIN_CURRENT_VERSION);
1277  MarkBufferDirty(metabuf);
1278  log_newpage_buffer(metabuf, true);
1279  END_CRIT_SECTION();
1280 
1281  UnlockReleaseBuffer(metabuf);
1282 }
1283 
1284 /*
1285  * brinbulkdelete
1286  * Since there are no per-heap-tuple index tuples in BRIN indexes,
1287  * there's not a lot we can do here.
1288  *
1289  * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
1290  * tuple is deleted), meaning the need to re-run summarization on the affected
1291  * range. Would need to add an extra flag in brintuples for that.
1292  */
1293 IndexBulkDeleteResult *
1294 brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1295  IndexBulkDeleteCallback callback, void *callback_state)
1296 {
1297  /* allocate stats if first time through, else re-use existing struct */
1298  if (stats == NULL)
1299  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1300 
1301  return stats;
1302 }
1303 
1304 /*
1305  * This routine is in charge of "vacuuming" a BRIN index: we just summarize
1306  * ranges that are currently unsummarized.
1307  */
1308 IndexBulkDeleteResult *
1309 brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1310 {
1311  Relation heapRel;
1312 
1313  /* No-op in ANALYZE ONLY mode */
1314  if (info->analyze_only)
1315  return stats;
1316 
1317  if (!stats)
1318  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1319  stats->num_pages = RelationGetNumberOfBlocks(info->index);
1320  /* rest of stats is initialized by zeroing */
1321 
1322  heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false),
1323  AccessShareLock);
1324 
1325  brin_vacuum_scan(info->index, info->strategy);
1326 
1327  brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,
1328  &stats->num_index_tuples, &stats->num_index_tuples);
1329 
1330  table_close(heapRel, AccessShareLock);
1331 
1332  return stats;
1333 }
1334 
1335 /*
1336  * reloptions processor for BRIN indexes
1337  */
1338 bytea *
1339 brinoptions(Datum reloptions, bool validate)
1340 {
1341  static const relopt_parse_elt tab[] = {
1342  {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
1343  {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
1344  };
1345 
1346  return (bytea *) build_reloptions(reloptions, validate,
1347  RELOPT_KIND_BRIN,
1348  sizeof(BrinOptions),
1349  tab, lengthof(tab));
1350 }
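/*
 * Illustrative usage (editorial addition, not part of brin.c): the two
 * reloptions parsed above are set at index creation time, e.g.
 *
 *   CREATE INDEX events_brin_idx ON events USING brin (created_at)
 *       WITH (pages_per_range = 32, autosummarize = on);
 *
 * Table, column, and index names here are hypothetical; pages_per_range
 * defaults to 128 when not specified.
 */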
1351 
1352 /*
1353  * SQL-callable function to scan through an index and summarize all ranges
1354  * that are not currently summarized.
1355  */
1356 Datum
1357 brin_summarize_new_values(PG_FUNCTION_ARGS)
1358 {
1359  Datum relation = PG_GETARG_DATUM(0);
1360 
1361  return DirectFunctionCall2(brin_summarize_range,
1362  relation,
1363  Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));
1364 }
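/*
 * Illustrative usage (editorial addition, not part of brin.c): this function
 * is exposed at the SQL level, e.g.
 *
 *   SELECT brin_summarize_new_values('events_brin_idx'::regclass);
 *
 * which summarizes all currently unsummarized ranges and returns the number
 * of ranges processed. The index name is hypothetical.
 */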
1365 
1366 /*
1367  * SQL-callable function to summarize the indicated page range, if not already
1368  * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all
1369  * unsummarized ranges are summarized.
1370  */
1371 Datum
1372 brin_summarize_range(PG_FUNCTION_ARGS)
1373 {
1374  Oid indexoid = PG_GETARG_OID(0);
1375  int64 heapBlk64 = PG_GETARG_INT64(1);
1376  BlockNumber heapBlk;
1377  Oid heapoid;
1378  Relation indexRel;
1379  Relation heapRel;
1380  Oid save_userid;
1381  int save_sec_context;
1382  int save_nestlevel;
1383  double numSummarized = 0;
1384 
1385  if (RecoveryInProgress())
1386  ereport(ERROR,
1387  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1388  errmsg("recovery is in progress"),
1389  errhint("BRIN control functions cannot be executed during recovery.")));
1390 
1391  if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)
1392  ereport(ERROR,
1393  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1394  errmsg("block number out of range: %lld",
1395  (long long) heapBlk64)));
1396  heapBlk = (BlockNumber) heapBlk64;
1397 
1398  /*
1399  * We must lock table before index to avoid deadlocks. However, if the
1400  * passed indexoid isn't an index then IndexGetRelation() will fail.
1401  * Rather than emitting a not-very-helpful error message, postpone
1402  * complaining, expecting that the is-it-an-index test below will fail.
1403  */
1404  heapoid = IndexGetRelation(indexoid, true);
1405  if (OidIsValid(heapoid))
1406  {
1407  heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
1408 
1409  /*
1410  * Autovacuum calls us. For its benefit, switch to the table owner's
1411  * userid, so that any index functions are run as that user. Also
1412  * lock down security-restricted operations and arrange to make GUC
1413  * variable changes local to this command. This is harmless, albeit
1414  * unnecessary, when called from SQL, because we fail shortly if the
1415  * user does not own the index.
1416  */
1417  GetUserIdAndSecContext(&save_userid, &save_sec_context);
1418  SetUserIdAndSecContext(heapRel->rd_rel->relowner,
1419  save_sec_context | SECURITY_RESTRICTED_OPERATION);
1420  save_nestlevel = NewGUCNestLevel();
1421  RestrictSearchPath();
1422  }
1423  else
1424  {
1425  heapRel = NULL;
1426  /* Set these just to suppress "uninitialized variable" warnings */
1427  save_userid = InvalidOid;
1428  save_sec_context = -1;
1429  save_nestlevel = -1;
1430  }
1431 
1432  indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
1433 
1434  /* Must be a BRIN index */
1435  if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
1436  indexRel->rd_rel->relam != BRIN_AM_OID)
1437  ereport(ERROR,
1438  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1439  errmsg("\"%s\" is not a BRIN index",
1440  RelationGetRelationName(indexRel))));
1441 
1442  /* User must own the index (comparable to privileges needed for VACUUM) */
1443  if (heapRel != NULL && !object_ownercheck(RelationRelationId, indexoid, save_userid))
1444  aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
1445  RelationGetRelationName(indexRel));
1446 
1447  /*
1448  * Since we did the IndexGetRelation call above without any lock, it's
1449  * barely possible that a race against an index drop/recreation could have
1450  * netted us the wrong table. Recheck.
1451  */
1452  if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
1453  ereport(ERROR,
1454  (errcode(ERRCODE_UNDEFINED_TABLE),
1455  errmsg("could not open parent table of index \"%s\"",
1456  RelationGetRelationName(indexRel))));
1457 
1458  /* see gin_clean_pending_list() */
1459  if (indexRel->rd_index->indisvalid)
1460  brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL);
1461  else
1462  ereport(DEBUG1,
1463  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1464  errmsg("index \"%s\" is not valid",
1465  RelationGetRelationName(indexRel))));
1466 
1467  /* Roll back any GUC changes executed by index functions */
1468  AtEOXact_GUC(false, save_nestlevel);
1469 
1470  /* Restore userid and security context */
1471  SetUserIdAndSecContext(save_userid, save_sec_context);
1472 
1473  relation_close(indexRel, ShareUpdateExclusiveLock);
1474  relation_close(heapRel, ShareUpdateExclusiveLock);
1475 
1476  PG_RETURN_INT32((int32) numSummarized);
1477 }
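/*
 * Illustrative usage (editorial addition, not part of brin.c): summarize
 * only the range containing heap block 0 of the table, e.g.
 *
 *   SELECT brin_summarize_range('events_brin_idx'::regclass, 0);
 *
 * The index name is hypothetical; summarizing every unsummarized range is
 * available through brin_summarize_new_values() instead.
 */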
1478 
1479 /*
1480  * SQL-callable interface to mark a range as no longer summarized
1481  */
1482 Datum
1483 brin_desummarize_range(PG_FUNCTION_ARGS)
1484 {
1485  Oid indexoid = PG_GETARG_OID(0);
1486  int64 heapBlk64 = PG_GETARG_INT64(1);
1487  BlockNumber heapBlk;
1488  Oid heapoid;
1489  Relation heapRel;
1490  Relation indexRel;
1491  bool done;
1492 
1493  if (RecoveryInProgress())
1494  ereport(ERROR,
1495  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1496  errmsg("recovery is in progress"),
1497  errhint("BRIN control functions cannot be executed during recovery.")));
1498 
1499  if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)
1500  ereport(ERROR,
1501  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1502  errmsg("block number out of range: %lld",
1503  (long long) heapBlk64)));
1504  heapBlk = (BlockNumber) heapBlk64;
1505 
1506  /*
1507  * We must lock table before index to avoid deadlocks. However, if the
1508  * passed indexoid isn't an index then IndexGetRelation() will fail.
1509  * Rather than emitting a not-very-helpful error message, postpone
1510  * complaining, expecting that the is-it-an-index test below will fail.
1511  *
1512  * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we
1513  * don't switch userid.
1514  */
1515  heapoid = IndexGetRelation(indexoid, true);
1516  if (OidIsValid(heapoid))
1517  heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
1518  else
1519  heapRel = NULL;
1520 
1521  indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
1522 
1523  /* Must be a BRIN index */
1524  if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
1525  indexRel->rd_rel->relam != BRIN_AM_OID)
1526  ereport(ERROR,
1527  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1528  errmsg("\"%s\" is not a BRIN index",
1529  RelationGetRelationName(indexRel))));
1530 
1531  /* User must own the index (comparable to privileges needed for VACUUM) */
1532  if (!object_ownercheck(RelationRelationId, indexoid, GetUserId()))
1533  aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
1534  RelationGetRelationName(indexRel));
1535 
1536  /*
1537  * Since we did the IndexGetRelation call above without any lock, it's
1538  * barely possible that a race against an index drop/recreation could have
1539  * netted us the wrong table. Recheck.
1540  */
1541  if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
1542  ereport(ERROR,
1544  errmsg("could not open parent table of index \"%s\"",
1545  RelationGetRelationName(indexRel))));
1546 
1547  /* see gin_clean_pending_list() */
1548  if (indexRel->rd_index->indisvalid)
1549  {
1550  /* the revmap does the hard work */
1551  do
1552  {
1553  done = brinRevmapDesummarizeRange(indexRel, heapBlk);
1554  }
1555  while (!done);
1556  }
1557  else
1558  ereport(DEBUG1,
1559  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1560  errmsg("index \"%s\" is not valid",
1561  RelationGetRelationName(indexRel))));
1562 
1563  relation_close(indexRel, ShareUpdateExclusiveLock);
1564  relation_close(heapRel, ShareUpdateExclusiveLock);
1565 
1566  PG_RETURN_VOID();
1567 }
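/*
 * Illustrative usage (editorial addition, not part of brin.c): remove the
 * summary for the range containing heap block 0, so that a later
 * summarization run rebuilds it, e.g.
 *
 *   SELECT brin_desummarize_range('events_brin_idx'::regclass, 0);
 *
 * The index name is hypothetical.
 */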
1568 
1569 /*
1570  * Build a BrinDesc used to create or scan a BRIN index
1571  */
1572 BrinDesc *
1573 brin_build_desc(Relation rel)
1574 {
1575  BrinOpcInfo **opcinfo;
1576  BrinDesc *bdesc;
1577  TupleDesc tupdesc;
1578  int totalstored = 0;
1579  int keyno;
1580  long totalsize;
1581  MemoryContext cxt;
1582  MemoryContext oldcxt;
1583 
1584  cxt = AllocSetContextCreate(CurrentMemoryContext,
1585  "brin desc cxt",
1586  ALLOCSET_SMALL_SIZES);
1587  oldcxt = MemoryContextSwitchTo(cxt);
1588  tupdesc = RelationGetDescr(rel);
1589 
1590  /*
1591  * Obtain BrinOpcInfo for each indexed column. While at it, accumulate
1592  * the number of columns stored, since the number is opclass-defined.
1593  */
1594  opcinfo = palloc_array(BrinOpcInfo *, tupdesc->natts);
1595  for (keyno = 0; keyno < tupdesc->natts; keyno++)
1596  {
1597  FmgrInfo *opcInfoFn;
1598  Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno);
1599 
1600  opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
1601 
1602  opcinfo[keyno] = (BrinOpcInfo *)
1603  DatumGetPointer(FunctionCall1(opcInfoFn, attr->atttypid));
1604  totalstored += opcinfo[keyno]->oi_nstored;
1605  }
1606 
1607  /* Allocate our result struct and fill it in */
1608  totalsize = offsetof(BrinDesc, bd_info) +
1609  sizeof(BrinOpcInfo *) * tupdesc->natts;
1610 
1611  bdesc = palloc(totalsize);
1612  bdesc->bd_context = cxt;
1613  bdesc->bd_index = rel;
1614  bdesc->bd_tupdesc = tupdesc;
1615  bdesc->bd_disktdesc = NULL; /* generated lazily */
1616  bdesc->bd_totalstored = totalstored;
1617 
1618  for (keyno = 0; keyno < tupdesc->natts; keyno++)
1619  bdesc->bd_info[keyno] = opcinfo[keyno];
1620  pfree(opcinfo);
1621 
1622  MemoryContextSwitchTo(oldcxt);
1623 
1624  return bdesc;
1625 }
1626 
1627 void
1628 brin_free_desc(BrinDesc *bdesc)
1629 {
1630  /* make sure the tupdesc is still valid */
1631  Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
1632  /* no need for retail pfree */
1633  MemoryContextDelete(bdesc->bd_context);
1634 }
1635 
1636 /*
1637  * Fetch index's statistical data into *stats
1638  */
1639 void
1640 brinGetStats(Relation index, BrinStatsData *stats)
1641 {
1642  Buffer metabuffer;
1643  Page metapage;
1644  BrinMetaPageData *metadata;
1645 
1646  metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
1647  LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
1648  metapage = BufferGetPage(metabuffer);
1649  metadata = (BrinMetaPageData *) PageGetContents(metapage);
1650 
1651  stats->pagesPerRange = metadata->pagesPerRange;
1652  stats->revmapNumPages = metadata->lastRevmapPage - 1;
1653 
1654  UnlockReleaseBuffer(metabuffer);
1655 }
1656 
1657 /*
1658  * Initialize a BrinBuildState appropriate to create tuples on the given index.
1659  */
1660 static BrinBuildState *
1661 initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
1662  BlockNumber pagesPerRange, BlockNumber tablePages)
1663 {
1664  BrinBuildState *state;
1665  BlockNumber lastRange = 0;
1666 
1667  state = palloc_object(BrinBuildState);
1668 
1669  state->bs_irel = idxRel;
1670  state->bs_numtuples = 0;
1671  state->bs_reltuples = 0;
1672  state->bs_currentInsertBuf = InvalidBuffer;
1673  state->bs_pagesPerRange = pagesPerRange;
1674  state->bs_currRangeStart = 0;
1675  state->bs_rmAccess = revmap;
1676  state->bs_bdesc = brin_build_desc(idxRel);
1677  state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
1678  state->bs_leader = NULL;
1679  state->bs_worker_id = 0;
1680  state->bs_sortstate = NULL;
1681  state->bs_context = CurrentMemoryContext;
1682  state->bs_emptyTuple = NULL;
1683  state->bs_emptyTupleLen = 0;
1684 
1685  /* Remember the memory context to use for an empty tuple, if needed. */
1686  state->bs_context = CurrentMemoryContext;
1687  state->bs_emptyTuple = NULL;
1688  state->bs_emptyTupleLen = 0;
1689 
1690  /*
1691  * Calculate the start of the last page range. Page numbers are 0-based,
1692  * so to calculate the index we need to subtract one. The integer division
1693  * gives us the index of the page range.
1694  */
1695  if (tablePages > 0)
1696  lastRange = ((tablePages - 1) / pagesPerRange) * pagesPerRange;
1697 
1698  /* Now calculate the start of the next range. */
1699  state->bs_maxRangeStart = lastRange + state->bs_pagesPerRange;
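 /*
  * Worked example (editorial illustration, not part of brin.c): with
  * pagesPerRange = 128 and tablePages = 1000, lastRange =
  * ((1000 - 1) / 128) * 128 = 896, so bs_maxRangeStart becomes
  * 896 + 128 = 1024.
  */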
1700 
1701  return state;
1702 }
1703 
1704 /*
1705  * Release resources associated with a BrinBuildState.
1706  */
1707 static void
1708 terminate_brin_buildstate(BrinBuildState *state)
1709 {
1710  /*
1711  * Release the last index buffer used. We might as well ensure that
1712  * whatever free space remains in that page is available in FSM, too.
1713  */
1714  if (!BufferIsInvalid(state->bs_currentInsertBuf))
1715  {
1716  Page page;
1717  Size freespace;
1718  BlockNumber blk;
1719 
1720  page = BufferGetPage(state->bs_currentInsertBuf);
1721  freespace = PageGetFreeSpace(page);
1722  blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
1723  ReleaseBuffer(state->bs_currentInsertBuf);
1724  RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
1725  FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
1726  }
1727 
1728  brin_free_desc(state->bs_bdesc);
1729  pfree(state->bs_dtuple);
1730  pfree(state);
1731 }
1732 
1733 /*
1734  * On the given BRIN index, summarize the heap page range that corresponds
1735  * to the heap block number given.
1736  *
1737  * This routine can run in parallel with insertions into the heap. To avoid
1738  * missing those values from the summary tuple, we first insert a placeholder
1739  * index tuple into the index, then execute the heap scan; transactions
1740  * concurrent with the scan update the placeholder tuple. After the scan, we
1741  * union the placeholder tuple with the one computed by this routine. The
1742  * update of the index value happens in a loop, so that if somebody updates
1743  * the placeholder tuple after we read it, we detect the case and try again.
1744  * This ensures that the concurrently inserted tuples are not lost.
1745  *
1746  * A further corner case is this routine being asked to summarize the partial
1747  * range at the end of the table. heapNumBlocks is the (possibly outdated)
1748  * table size; if we notice that the requested range lies beyond that size,
1749  * we re-compute the table size after inserting the placeholder tuple, to
1750  * avoid missing pages that were appended recently.
1751  */
1752 static void
1753 summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
1754  BlockNumber heapBlk, BlockNumber heapNumBlks)
1755 {
1756  Buffer phbuf;
1757  BrinTuple *phtup;
1758  Size phsz;
1759  OffsetNumber offset;
1760  BlockNumber scanNumBlks;
1761 
1762  /*
1763  * Insert the placeholder tuple
1764  */
1765  phbuf = InvalidBuffer;
1766  phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
1767  offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
1768  state->bs_rmAccess, &phbuf,
1769  heapBlk, phtup, phsz);
1770 
1771  /*
1772  * Compute range end. We hold ShareUpdateExclusive lock on table, so it
1773  * cannot shrink concurrently (but it can grow).
1774  */
1775  Assert(heapBlk % state->bs_pagesPerRange == 0);
1776  if (heapBlk + state->bs_pagesPerRange > heapNumBlks)
1777  {
1778  /*
1779  * If we're asked to scan what we believe to be the final range on the
1780  * table (i.e. a range that might be partial) we need to recompute our
1781  * idea of what the latest page is after inserting the placeholder
1782  * tuple. Anyone that grows the table later will update the
1783  * placeholder tuple, so it doesn't matter that we won't scan these
1784  * pages ourselves. Careful: the table might have been extended
1785  * beyond the current range, so clamp our result.
1786  *
1787  * Fortunately, this should occur infrequently.
1788  */
1789  scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,
1790  state->bs_pagesPerRange);
1791  }
1792  else
1793  {
1794  /* Easy case: range is known to be complete */
1795  scanNumBlks = state->bs_pagesPerRange;
1796  }
1797 
1798  /*
1799  * Execute the partial heap scan covering the heap blocks in the specified
1800  * page range, summarizing the heap tuples in it. This scan stops just
1801  * short of brinbuildCallback creating the new index entry.
1802  *
1803  * Note that it is critical we use the "any visible" mode of
1804  * table_index_build_range_scan here: otherwise, we would miss tuples
1805  * inserted by transactions that are still in progress, among other corner
1806  * cases.
1807  */
1808  state->bs_currRangeStart = heapBlk;
1809  table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false,
1810  heapBlk, scanNumBlks,
1811  brinbuildCallback, (void *) state, NULL);
1812 
1813  /*
1814  * Now we update the values obtained by the scan with the placeholder
1815  * tuple. We do this in a loop which only terminates if we're able to
1816  * update the placeholder tuple successfully; if we are not, this means
1817  * somebody else modified the placeholder tuple after we read it.
1818  */
1819  for (;;)
1820  {
1821  BrinTuple *newtup;
1822  Size newsize;
1823  bool didupdate;
1824  bool samepage;
1825 
1826  CHECK_FOR_INTERRUPTS();
1827 
1828  /*
1829  * Update the summary tuple and try to update.
1830  */
1831  newtup = brin_form_tuple(state->bs_bdesc,
1832  heapBlk, state->bs_dtuple, &newsize);
1833  samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
1834  didupdate =
1835  brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
1836  state->bs_rmAccess, heapBlk, phbuf, offset,
1837  phtup, phsz, newtup, newsize, samepage);
1838  brin_free_tuple(phtup);
1839  brin_free_tuple(newtup);
1840 
1841  /* If the update succeeded, we're done. */
1842  if (didupdate)
1843  break;
1844 
1845  /*
1846  * If the update didn't work, it might be because somebody updated the
1847  * placeholder tuple concurrently. Extract the new version, union it
1848  * with the values we have from the scan, and start over. (There are
1849  * other reasons for the update to fail, but it's simple to treat them
1850  * the same.)
1851  */
1852  phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
1853  &offset, &phsz, BUFFER_LOCK_SHARE);
1854  /* the placeholder tuple must exist */
1855  if (phtup == NULL)
1856  elog(ERROR, "missing placeholder tuple");
1857  phtup = brin_copy_tuple(phtup, phsz, NULL, NULL);
1858  LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
1859 
1860  /* merge it into the tuple from the heap scan */
1861  union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
1862  }
1863 
1864  ReleaseBuffer(phbuf);
1865 }
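An aside on the clamping above: for the final, possibly-partial range the scan length is simply the remaining block count capped at pagesPerRange. A minimal standalone sketch of that arithmetic (not part of brin.c; BlockNumber and Min() are stand-ins for the PostgreSQL definitions, and heapNumBlks plays the role of the freshly recomputed table size):

/* Standalone illustration only -- not PostgreSQL code. */
#include <stdio.h>

typedef unsigned int BlockNumber;
#define Min(x, y) ((x) < (y) ? (x) : (y))

/* How many blocks should a summarization scan cover for the range
 * starting at heapBlk, given pagesPerRange and the current table size? */
static BlockNumber
range_scan_length(BlockNumber heapBlk, BlockNumber pagesPerRange,
                  BlockNumber heapNumBlks)
{
    if (heapBlk + pagesPerRange > heapNumBlks)
        return Min(heapNumBlks - heapBlk, pagesPerRange);   /* partial range */
    return pagesPerRange;                                    /* complete range */
}

int
main(void)
{
    /* 1000-block table, 128 pages per range: the last range starts at 896 */
    printf("%u\n", range_scan_length(896, 128, 1000));  /* prints 104 */
    printf("%u\n", range_scan_length(768, 128, 1000));  /* prints 128 */
    return 0;
}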
1866 
1867 /*
1868  * Summarize page ranges that are not already summarized. If pageRange is
1869  * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
1870  * page range containing the given heap page number is scanned.
1871  * If include_partial is true, then the partial range at the end of the table
1872  * is summarized, otherwise not.
1873  *
1874  * For each new index tuple inserted, *numSummarized (if not NULL) is
1875  * incremented; for each existing tuple, *numExisting (if not NULL) is
1876  * incremented.
1877  */
1878 static void
1879 brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
1880  bool include_partial, double *numSummarized, double *numExisting)
1881 {
1882  BrinRevmap *revmap;
1883  BrinBuildState *state = NULL;
1884  IndexInfo *indexInfo = NULL;
1885  BlockNumber heapNumBlocks;
1886  BlockNumber pagesPerRange;
1887  Buffer buf;
1888  BlockNumber startBlk;
1889 
1890  revmap = brinRevmapInitialize(index, &pagesPerRange);
1891 
1892  /* determine range of pages to process */
1893  heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
1894  if (pageRange == BRIN_ALL_BLOCKRANGES)
1895  startBlk = 0;
1896  else
1897  {
1898  startBlk = (pageRange / pagesPerRange) * pagesPerRange;
1899  heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange);
1900  }
1901  if (startBlk > heapNumBlocks)
1902  {
1903  /* Nothing to do if start point is beyond end of table */
1904  brinRevmapTerminate(revmap);
1905  return;
1906  }
1907 
1908  /*
1909  * Scan the revmap to find unsummarized items.
1910  */
1911  buf = InvalidBuffer;
1912  for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)
1913  {
1914  BrinTuple *tup;
1915  OffsetNumber off;
1916 
1917  /*
1918  * Unless requested to summarize even a partial range, go away now if
1919  * we think the next range is partial. The caller passes true when the
1920  * call is typically made once bulk data loading is done
1921  * (brin_summarize_new_values), and false when it is the result of an
1922  * arbitrarily-scheduled maintenance command (vacuuming).
1923  */
1924  if (!include_partial &&
1925  (startBlk + pagesPerRange > heapNumBlocks))
1926  break;
1927 
1927 
1928  CHECK_FOR_INTERRUPTS();
1929 
1930  tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL,
1931  BUFFER_LOCK_SHARE);
1932  if (tup == NULL)
1933  {
1934  /* no revmap entry for this heap range. Summarize it. */
1935  if (state == NULL)
1936  {
1937  /* first time through */
1938  Assert(!indexInfo);
1939  state = initialize_brin_buildstate(index, revmap,
1940  pagesPerRange,
1941  InvalidBlockNumber);
1942  indexInfo = BuildIndexInfo(index);
1943  }
1944  summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);
1945 
1946  /* and re-initialize state for the next range */
1947  brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1948 
1949  if (numSummarized)
1950  *numSummarized += 1.0;
1951  }
1952  else
1953  {
1954  if (numExisting)
1955  *numExisting += 1.0;
1956  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1957  }
1958  }
1959 
1960  if (BufferIsValid(buf))
1961  ReleaseBuffer(buf);
1962 
1963  /* free resources */
1964  brinRevmapTerminate(revmap);
1965  if (state)
1966  {
1967  terminate_brin_buildstate(state);
1968  pfree(indexInfo);
1969  }
1970 }
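The arithmetic brinsummarize() uses to find a range start, (pageRange / pagesPerRange) * pagesPerRange, and the walk over successive range starts can be shown in isolation. A standalone sketch (not PostgreSQL code; range_start_for_block is a hypothetical helper):

/* Standalone illustration only -- not PostgreSQL code. */
#include <stdio.h>

typedef unsigned int BlockNumber;

/* Map an arbitrary heap block to the first block of its page range. */
static BlockNumber
range_start_for_block(BlockNumber blk, BlockNumber pagesPerRange)
{
    return (blk / pagesPerRange) * pagesPerRange;
}

int
main(void)
{
    BlockNumber pagesPerRange = 128;
    BlockNumber nblocks = 500;

    /* block 300 belongs to the range starting at block 256 */
    printf("range start: %u\n", range_start_for_block(300, pagesPerRange));

    /* walk all range starts, as the summarization loop does */
    for (BlockNumber startBlk = 0; startBlk < nblocks; startBlk += pagesPerRange)
        printf("range at %u\n", startBlk);
    return 0;
}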
1971 
1972 /*
1973  * Given a deformed tuple in the build state, convert it into the on-disk
1974  * format and insert it into the index, making the revmap point to it.
1975  */
1976 static void
1977 form_and_insert_tuple(BrinBuildState *state)
1978 {
1979  BrinTuple *tup;
1980  Size size;
1981 
1982  tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
1983  state->bs_dtuple, &size);
1984  brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
1985  &state->bs_currentInsertBuf, state->bs_currRangeStart,
1986  tup, size);
1987  state->bs_numtuples++;
1988 
1989  pfree(tup);
1990 }
1991 
1992 /*
1993  * Given a deformed tuple in the build state, convert it into the on-disk
1994  * format and write it to a (shared) tuplesort (the leader will insert it
1995  * into the index later).
1996  */
1997 static void
1998 form_and_spill_tuple(BrinBuildState *state)
1999 {
2000  BrinTuple *tup;
2001  Size size;
2002 
2003  /* don't insert empty tuples in parallel build */
2004  if (state->bs_dtuple->bt_empty_range)
2005  return;
2006 
2007  tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
2008  state->bs_dtuple, &size);
2009 
2010  /* write the BRIN tuple to the tuplesort */
2011  tuplesort_putbrintuple(state->bs_sortstate, tup, size);
2012 
2013  state->bs_numtuples++;
2014 
2015  pfree(tup);
2016 }
2017 
2018 /*
2019  * Given two deformed tuples, adjust the first one so that it's consistent
2020  * with the summary values in both.
2021  */
2022 static void
2023 union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
2024 {
2025  int keyno;
2026  BrinMemTuple *db;
2027  MemoryContext cxt;
2028  MemoryContext oldcxt;
2029 
2030  /* Use our own memory context to avoid retail pfree */
2032  "brin union",
2034  oldcxt = MemoryContextSwitchTo(cxt);
2035  db = brin_deform_tuple(bdesc, b, NULL);
2036  MemoryContextSwitchTo(oldcxt);
2037 
2038  /*
2039  * Check if the ranges are empty.
2040  *
2041  * If at least one of them is empty, we don't need to call per-key union
2042  * functions at all. If "b" is empty, we just use "a" as the result (it
2043  * might be empty as well, but that's fine). If "a" is empty but "b" is not,
2044  * we use "b" as the result (but we have to copy the data into "a" first).
2045  *
2046  * Only when both ranges are non-empty do we actually do the per-key merge.
2047  */
2048 
2049  /* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */
2050  if (db->bt_empty_range)
2051  {
2052  /* skip the per-key merge */
2053  MemoryContextDelete(cxt);
2054  return;
2055  }
2056 
2057  /*
2058  * Now we know "b" is not empty. If "a" is empty, then "b" is the result.
2059  * But we need to copy the data from "b" to "a" first, because that's how
2060  * we pass result out.
2061  *
2062  * We have to copy all the global/per-key flags etc. too.
2063  */
2064  if (a->bt_empty_range)
2065  {
2066  for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2067  {
2068  int i;
2069  BrinValues *col_a = &a->bt_columns[keyno];
2070  BrinValues *col_b = &db->bt_columns[keyno];
2071  BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2072 
2073  col_a->bv_allnulls = col_b->bv_allnulls;
2074  col_a->bv_hasnulls = col_b->bv_hasnulls;
2075 
2076  /* If "b" has no data, we're done. */
2077  if (col_b->bv_allnulls)
2078  continue;
2079 
2080  for (i = 0; i < opcinfo->oi_nstored; i++)
2081  col_a->bv_values[i] =
2082  datumCopy(col_b->bv_values[i],
2083  opcinfo->oi_typcache[i]->typbyval,
2084  opcinfo->oi_typcache[i]->typlen);
2085  }
2086 
2087  /* "a" started empty, but "b" was not empty, so remember that */
2088  a->bt_empty_range = false;
2089 
2090  /* skip the per-key merge */
2091  MemoryContextDelete(cxt);
2092  return;
2093  }
2094 
2095  /* Now we know neither range is empty. */
2096  for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2097  {
2098  FmgrInfo *unionFn;
2099  BrinValues *col_a = &a->bt_columns[keyno];
2100  BrinValues *col_b = &db->bt_columns[keyno];
2101  BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2102 
2103  if (opcinfo->oi_regular_nulls)
2104  {
2105  /* Does the "b" summary represent any NULL values? */
2106  bool b_has_nulls = (col_b->bv_hasnulls || col_b->bv_allnulls);
2107 
2108  /* Adjust "hasnulls". */
2109  if (!col_a->bv_allnulls && b_has_nulls)
2110  col_a->bv_hasnulls = true;
2111 
2112  /* If there are no values in B, there's nothing left to do. */
2113  if (col_b->bv_allnulls)
2114  continue;
2115 
2116  /*
2117  * Adjust "allnulls". If A doesn't have values, just copy the
2118  * values from B into A, and we're done. We cannot run the
2119  * operators in this case, because values in A might contain
2120  * garbage. Note we already established that B contains values.
2121  *
2122  * Also adjust "hasnulls" in order not to forget the summary
2123  * represents NULL values. This is not redundant with the earlier
2124  * update, because that only happens when allnulls=false.
2125  */
2126  if (col_a->bv_allnulls)
2127  {
2128  int i;
2129 
2130  col_a->bv_allnulls = false;
2131  col_a->bv_hasnulls = true;
2132 
2133  for (i = 0; i < opcinfo->oi_nstored; i++)
2134  col_a->bv_values[i] =
2135  datumCopy(col_b->bv_values[i],
2136  opcinfo->oi_typcache[i]->typbyval,
2137  opcinfo->oi_typcache[i]->typlen);
2138 
2139  continue;
2140  }
2141  }
2142 
2143  unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
2144  BRIN_PROCNUM_UNION);
2145  FunctionCall3Coll(unionFn,
2146  bdesc->bd_index->rd_indcollation[keyno],
2147  PointerGetDatum(bdesc),
2148  PointerGetDatum(col_a),
2149  PointerGetDatum(col_b));
2150  }
2151 
2152  MemoryContextDelete(cxt);
2153 }
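The empty/allnulls/hasnulls handling above is easier to follow against a concrete summary type. A standalone minmax-style sketch (not PostgreSQL code; ColSummary and union_col_summaries are hypothetical stand-ins for BrinValues and an opclass union support function):

/* Standalone illustration only -- not PostgreSQL code. */
#include <stdbool.h>
#include <stdio.h>

typedef struct ColSummary
{
    bool allnulls;   /* no non-NULL values summarized yet */
    bool hasnulls;   /* at least one NULL summarized */
    int  min;        /* valid only when !allnulls */
    int  max;
} ColSummary;

static void
union_col_summaries(ColSummary *a, const ColSummary *b)
{
    /* remember NULLs seen by either side */
    if (!a->allnulls && (b->hasnulls || b->allnulls))
        a->hasnulls = true;

    if (b->allnulls)
        return;                 /* nothing more to merge from b */

    if (a->allnulls)
    {
        /* a has no values yet: take b's interval wholesale */
        a->allnulls = false;
        a->hasnulls = true;     /* a previously summarized only NULLs */
        a->min = b->min;
        a->max = b->max;
        return;
    }

    /* both sides have values: widen the interval */
    if (b->min < a->min)
        a->min = b->min;
    if (b->max > a->max)
        a->max = b->max;
}

int
main(void)
{
    ColSummary a = {false, false, 10, 20};
    ColSummary b = {false, true, 5, 15};

    union_col_summaries(&a, &b);
    printf("[%d, %d] hasnulls=%d\n", a.min, a.max, a.hasnulls);  /* [5, 20] hasnulls=1 */
    return 0;
}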
2154 
2155 /*
2156  * brin_vacuum_scan
2157  * Do a complete scan of the index during VACUUM.
2158  *
2159  * This routine scans the complete index looking for uncataloged index pages,
2160  * i.e. those that might have been lost due to a crash after index extension
2161  * and such.
2162  */
2163 static void
2164 brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
2165 {
2166  BlockNumber nblocks;
2167  BlockNumber blkno;
2168 
2169  /*
2170  * Scan the index in physical order, and clean up any possible mess in
2171  * each page.
2172  */
2173  nblocks = RelationGetNumberOfBlocks(idxrel);
2174  for (blkno = 0; blkno < nblocks; blkno++)
2175  {
2176  Buffer buf;
2177 
2178  CHECK_FOR_INTERRUPTS();
2179 
2180  buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
2181  RBM_NORMAL, strategy);
2182 
2183  brin_page_cleanup(idxrel, buf);
2184 
2185  ReleaseBuffer(buf);
2186  }
2187 
2188  /*
2189  * Update all upper pages in the index's FSM, as well. This ensures not
2190  * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
2191  * but also that any pre-existing damage or out-of-dateness is repaired.
2192  */
2193  FreeSpaceMapVacuum(idxrel);
2194 }
2195 
2196 static bool
2197 add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
2198  const Datum *values, const bool *nulls)
2199 {
2200  int keyno;
2201 
2202  /* If the range starts empty, we're certainly going to modify it. */
2203  bool modified = dtup->bt_empty_range;
2204 
2205  /*
2206  * Compare the key values of the new tuple to the stored index values; our
2207  * deformed tuple will get updated if the new tuple doesn't fit the
2208  * original range (note this means we can't break out of the loop early).
2209  * Make a note of whether this happens, so that we know to insert the
2210  * modified tuple later.
2211  */
2212  for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2213  {
2214  Datum result;
2215  BrinValues *bval;
2216  FmgrInfo *addValue;
2217  bool has_nulls;
2218 
2219  bval = &dtup->bt_columns[keyno];
2220 
2221  /*
2222  * Does the range have actual NULL values? Either of the flags can be
2223  * set, but we ignore the state before adding the first row.
2224  *
2225  * We have to remember this, because we'll modify the flags and we
2226  * need to know if the range started as empty.
2227  */
2228  has_nulls = ((!dtup->bt_empty_range) &&
2229  (bval->bv_hasnulls || bval->bv_allnulls));
2230 
2231  /*
2232  * If the value we're adding is NULL, handle it locally. Otherwise
2233  * call the BRIN_PROCNUM_ADDVALUE procedure.
2234  */
2235  if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno])
2236  {
2237  /*
2238  * If the new value is null, we record that we saw it if it's the
2239  * first one; otherwise, there's nothing to do.
2240  */
2241  if (!bval->bv_hasnulls)
2242  {
2243  bval->bv_hasnulls = true;
2244  modified = true;
2245  }
2246 
2247  continue;
2248  }
2249 
2250  addValue = index_getprocinfo(idxRel, keyno + 1,
2251  BRIN_PROCNUM_ADDVALUE);
2252  result = FunctionCall4Coll(addValue,
2253  idxRel->rd_indcollation[keyno],
2254  PointerGetDatum(bdesc),
2255  PointerGetDatum(bval),
2256  values[keyno],
2257  nulls[keyno]);
2258  /* if that returned true, we need to insert the updated tuple */
2259  modified |= DatumGetBool(result);
2260 
2261  /*
2262  * If the range was had actual NULL values (i.e. did not start empty),
2263  * make sure we don't forget about the NULL values. Either the
2264  * allnulls flag is still set to true, or (if the opclass cleared it)
2265  * we need to set hasnulls=true.
2266  *
2267  * XXX This can only happen when the opclass modified the tuple, so
2268  * the modified flag should be set.
2269  */
2270  if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))
2271  {
2272  Assert(modified);
2273  bval->bv_hasnulls = true;
2274  }
2275  }
2276 
2277  /*
2278  * After updating summaries for all the keys, mark it as not empty.
2279  *
2280  * If we're actually changing the flag value (i.e. tuple started as
2281  * empty), we should have modified the tuple. So we should not see an
2282  * empty range that was not modified.
2283  */
2284  Assert(!dtup->bt_empty_range || modified);
2285  dtup->bt_empty_range = false;
2286 
2287  return modified;
2288 }
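For intuition about the "modified" flag, a hypothetical minmax-style add-value helper is sketched below; it is a standalone illustration, not the actual BRIN_PROCNUM_ADDVALUE support function of any opclass:

/* Standalone illustration only -- not PostgreSQL code. */
#include <stdbool.h>
#include <stdio.h>

typedef struct ColSummary
{
    bool empty;      /* no rows summarized yet */
    int  min;
    int  max;
} ColSummary;

/* Add one value; return whether the summary had to change. */
static bool
add_value(ColSummary *s, int value)
{
    bool modified = false;

    if (s->empty)
    {
        s->empty = false;
        s->min = s->max = value;
        return true;
    }
    if (value < s->min)
    {
        s->min = value;
        modified = true;
    }
    if (value > s->max)
    {
        s->max = value;
        modified = true;
    }
    return modified;
}

int
main(void)
{
    ColSummary s = {true, 0, 0};

    printf("%d\n", add_value(&s, 7));   /* 1: first value always modifies */
    printf("%d\n", add_value(&s, 5));   /* 1: widens the minimum */
    printf("%d\n", add_value(&s, 6));   /* 0: already inside [5, 7] */
    return 0;
}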
2289 
2290 static bool
2291 check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
2292 {
2293  int keyno;
2294 
2295  /*
2296  * First check if there are any IS [NOT] NULL scan keys, and if we're
2297  * violating them.
2298  */
2299  for (keyno = 0; keyno < nnullkeys; keyno++)
2300  {
2301  ScanKey key = nullkeys[keyno];
2302 
2303  Assert(key->sk_attno == bval->bv_attno);
2304 
2305  /* Handle only IS NULL/IS NOT NULL tests */
2306  if (!(key->sk_flags & SK_ISNULL))
2307  continue;
2308 
2309  if (key->sk_flags & SK_SEARCHNULL)
2310  {
2311  /* IS NULL scan key, but range has no NULLs */
2312  if (!bval->bv_allnulls && !bval->bv_hasnulls)
2313  return false;
2314  }
2315  else if (key->sk_flags & SK_SEARCHNOTNULL)
2316  {
2317  /*
2318  * For IS NOT NULL, we can only skip ranges that are known to have
2319  * only nulls.
2320  */
2321  if (bval->bv_allnulls)
2322  return false;
2323  }
2324  else
2325  {
2326  /*
2327  * Neither IS NULL nor IS NOT NULL was used; assume all indexable
2328  * operators are strict and thus return false with NULL value in
2329  * the scan key.
2330  */
2331  return false;
2332  }
2333  }
2334 
2335  return true;
2336 }
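The tests above reduce to a small decision table over the allnulls/hasnulls flags. A standalone sketch of the same rules (not PostgreSQL code; range_may_match is a hypothetical helper):

/* Standalone illustration only -- not PostgreSQL code. */
#include <stdbool.h>
#include <stdio.h>

typedef enum { KEY_IS_NULL, KEY_IS_NOT_NULL } NullKeyKind;

/* Can a range with these summary flags possibly satisfy the key? */
static bool
range_may_match(NullKeyKind kind, bool allnulls, bool hasnulls)
{
    if (kind == KEY_IS_NULL)
        return allnulls || hasnulls;    /* need at least one NULL in the range */

    /* IS NOT NULL: only a range known to contain nothing but NULLs is skipped */
    return !allnulls;
}

int
main(void)
{
    printf("%d\n", range_may_match(KEY_IS_NULL, false, false));     /* 0 */
    printf("%d\n", range_may_match(KEY_IS_NOT_NULL, true, false));  /* 0 */
    printf("%d\n", range_may_match(KEY_IS_NOT_NULL, false, true));  /* 1 */
    return 0;
}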
2337 
2338 /*
2339  * Create parallel context, and launch workers for leader.
2340  *
2341  * buildstate argument should be initialized (with the exception of the
2342  * tuplesort states, which may later be created based on shared
2343  * state initially set up here).
2344  *
2345  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
2346  *
2347  * request is the target number of parallel worker processes to launch.
2348  *
2349  * Sets buildstate's BrinLeader, which caller must use to shut down parallel
2350  * mode by passing it to _brin_end_parallel() at the very end of its index
2351  * build. If not even a single worker process can be launched, this is
2352  * never set, and caller should proceed with a serial index build.
2353  */
2354 static void
2355 _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
2356  bool isconcurrent, int request)
2357 {
2358  ParallelContext *pcxt;
2359  int scantuplesortstates;
2360  Snapshot snapshot;
2361  Size estbrinshared;
2362  Size estsort;
2363  BrinShared *brinshared;
2364  Sharedsort *sharedsort;
2365  BrinLeader *brinleader = (BrinLeader *) palloc0(sizeof(BrinLeader));
2366  WalUsage *walusage;
2367  BufferUsage *bufferusage;
2368  bool leaderparticipates = true;
2369  int querylen;
2370 
2371 #ifdef DISABLE_LEADER_PARTICIPATION
2372  leaderparticipates = false;
2373 #endif
2374 
2375  /*
2376  * Enter parallel mode, and create context for parallel build of brin
2377  * index
2378  */
2379  EnterParallelMode();
2380  Assert(request > 0);
2381  pcxt = CreateParallelContext("postgres", "_brin_parallel_build_main",
2382  request);
2383 
2384  scantuplesortstates = leaderparticipates ? request + 1 : request;
2385 
2386  /*
2387  * Prepare for scan of the base relation. In a normal index build, we use
2388  * SnapshotAny because we must retrieve all tuples and do our own time
2389  * qual checks (because we have to index RECENTLY_DEAD tuples). In a
2390  * concurrent build, we take a regular MVCC snapshot and index whatever's
2391  * live according to that.
2392  */
2393  if (!isconcurrent)
2394  snapshot = SnapshotAny;
2395  else
2396  snapshot = RegisterSnapshot(GetTransactionSnapshot());
2397 
2398  /*
2399  * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace.
2400  */
2401  estbrinshared = _brin_parallel_estimate_shared(heap, snapshot);
2402  shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared);
2403  estsort = tuplesort_estimate_shared(scantuplesortstates);
2404  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
2405 
2406  shm_toc_estimate_keys(&pcxt->estimator, 2);
2407 
2408  /*
2409  * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
2410  * and PARALLEL_KEY_BUFFER_USAGE.
2411  *
2412  * If there are no extensions loaded that care, we could skip this. We
2413  * have no way of knowing whether anyone's looking at pgWalUsage or
2414  * pgBufferUsage, so do it unconditionally.
2415  */
2416  shm_toc_estimate_chunk(&pcxt->estimator,
2417  mul_size(sizeof(WalUsage), pcxt->nworkers));
2418  shm_toc_estimate_keys(&pcxt->estimator, 1);
2419  shm_toc_estimate_chunk(&pcxt->estimator,
2420  mul_size(sizeof(BufferUsage), pcxt->nworkers));
2421  shm_toc_estimate_keys(&pcxt->estimator, 1);
2422 
2423  /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
2424  if (debug_query_string)
2425  {
2426  querylen = strlen(debug_query_string);
2427  shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
2428  shm_toc_estimate_keys(&pcxt->estimator, 1);
2429  }
2430  else
2431  querylen = 0; /* keep compiler quiet */
2432 
2433  /* Everyone's had a chance to ask for space, so now create the DSM */
2434  InitializeParallelDSM(pcxt);
2435 
2436  /* If no DSM segment was available, back out (do serial build) */
2437  if (pcxt->seg == NULL)
2438  {
2439  if (IsMVCCSnapshot(snapshot))
2440  UnregisterSnapshot(snapshot);
2441  DestroyParallelContext(pcxt);
2442  ExitParallelMode();
2443  return;
2444  }
2445 
2446  /* Store shared build state, for which we reserved space */
2447  brinshared = (BrinShared *) shm_toc_allocate(pcxt->toc, estbrinshared);
2448  /* Initialize immutable state */
2449  brinshared->heaprelid = RelationGetRelid(heap);
2450  brinshared->indexrelid = RelationGetRelid(index);
2451  brinshared->isconcurrent = isconcurrent;
2452  brinshared->scantuplesortstates = scantuplesortstates;
2453  brinshared->pagesPerRange = buildstate->bs_pagesPerRange;
2454  brinshared->queryid = pgstat_get_my_query_id();
2455  ConditionVariableInit(&brinshared->workersdonecv);
2456  SpinLockInit(&brinshared->mutex);
2457 
2458  /* Initialize mutable state */
2459  brinshared->nparticipantsdone = 0;
2460  brinshared->reltuples = 0.0;
2461  brinshared->indtuples = 0.0;
2462 
2463  table_parallelscan_initialize(heap,
2464  ParallelTableScanFromBrinShared(brinshared),
2465  snapshot);
2466 
2467  /*
2468  * Store shared tuplesort-private state, for which we reserved space.
2469  * Then, initialize opaque state using tuplesort routine.
2470  */
2471  sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
2472  tuplesort_initialize_shared(sharedsort, scantuplesortstates,
2473  pcxt->seg);
2474 
2475  /*
2476  * Store both shared states in the DSM segment's table of contents, so
2477  * that worker processes can look them up.
2478  */
2479  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BRIN_SHARED, brinshared);
2480  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
2481 
2482  /* Store query string for workers */
2483  if (debug_query_string)
2484  {
2485  char *sharedquery;
2486 
2487  sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
2488  memcpy(sharedquery, debug_query_string, querylen + 1);
2489  shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
2490  }
2491 
2492  /*
2493  * Allocate space for each worker's WalUsage and BufferUsage; no need to
2494  * initialize.
2495  */
2496  walusage = shm_toc_allocate(pcxt->toc,
2497  mul_size(sizeof(WalUsage), pcxt->nworkers));
2498  shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
2499  bufferusage = shm_toc_allocate(pcxt->toc,
2500  mul_size(sizeof(BufferUsage), pcxt->nworkers));
2501  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
2502 
2503  /* Launch workers, saving status for leader/caller */
2504  LaunchParallelWorkers(pcxt);
2505  brinleader->pcxt = pcxt;
2506  brinleader->nparticipanttuplesorts = pcxt->nworkers_launched;
2507  if (leaderparticipates)
2508  brinleader->nparticipanttuplesorts++;
2509  brinleader->brinshared = brinshared;
2510  brinleader->sharedsort = sharedsort;
2511  brinleader->snapshot = snapshot;
2512  brinleader->walusage = walusage;
2513  brinleader->bufferusage = bufferusage;
2514 
2515  /* If no workers were successfully launched, back out (do serial build) */
2516  if (pcxt->nworkers_launched == 0)
2517  {
2518  _brin_end_parallel(brinleader, NULL);
2519  return;
2520  }
2521 
2522  /* Save leader state now that it's clear build will be parallel */
2523  buildstate->bs_leader = brinleader;
2524 
2525  /* Join heap scan ourselves */
2526  if (leaderparticipates)
2527  _brin_leader_participate_as_worker(buildstate, heap, index);
2528 
2529  /*
2530  * Caller needs to wait for all launched workers when we return. Make
2531  * sure that the failure-to-start case will not hang forever.
2532  */
2533  WaitForParallelWorkersToAttach(pcxt);
2534 }
2535 
2536 /*
2537  * Shut down workers, destroy parallel context, and end parallel mode.
2538  */
2539 static void
2540 _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
2541 {
2542  int i;
2543 
2544  /* Shutdown worker processes */
2545  WaitForParallelWorkersToFinish(brinleader->pcxt);
2546 
2547  /*
2548  * Next, accumulate WAL usage. (This must wait for the workers to finish,
2549  * or we might get incomplete data.)
2550  */
2551  for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)
2552  InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);
2553 
2554  /* Free last reference to MVCC snapshot, if one was used */
2555  if (IsMVCCSnapshot(brinleader->snapshot))
2556  UnregisterSnapshot(brinleader->snapshot);
2557  DestroyParallelContext(brinleader->pcxt);
2558  ExitParallelMode();
2559 }
2560 
2561 /*
2562  * Within leader, wait for end of heap scan.
2563  *
2564  * When called, parallel heap scan started by _brin_begin_parallel() will
2565  * already be underway within worker processes (when leader participates
2566  * as a worker, we should end up here just as workers are finishing).
2567  *
2568  * Returns the total number of heap tuples scanned.
2569  */
2570 static double
2571 _brin_parallel_heapscan(BrinBuildState *state)
2572 {
2573  BrinShared *brinshared = state->bs_leader->brinshared;
2574  int nparticipanttuplesorts;
2575 
2576  nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
2577  for (;;)
2578  {
2579  SpinLockAcquire(&brinshared->mutex);
2580  if (brinshared->nparticipantsdone == nparticipanttuplesorts)
2581  {
2582  /* copy the data into leader state */
2583  state->bs_reltuples = brinshared->reltuples;
2584  state->bs_numtuples = brinshared->indtuples;
2585 
2586  SpinLockRelease(&brinshared->mutex);
2587  break;
2588  }
2589  SpinLockRelease(&brinshared->mutex);
2590 
2591  ConditionVariableSleep(&brinshared->workersdonecv,
2592  WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
2593  }
2594 
2595  ConditionVariableCancelSleep();
2596 
2597  return state->bs_reltuples;
2598 }
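The loop above combines a spinlock-protected counter with a ConditionVariable sleep. The same leader-waits-for-all-participants pattern can be sketched with plain POSIX threads (standalone illustration, not PostgreSQL code; compile with -pthread):

/* Standalone illustration only -- not PostgreSQL code. */
#include <pthread.h>
#include <stdio.h>

typedef struct Shared
{
    pthread_mutex_t mutex;
    pthread_cond_t  donecv;
    int             nparticipantsdone;
} Shared;

static Shared shared = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
};

static void *
worker(void *arg)
{
    (void) arg;
    /* ... scan a chunk of the table here ... */
    pthread_mutex_lock(&shared.mutex);
    shared.nparticipantsdone++;
    pthread_cond_signal(&shared.donecv);
    pthread_mutex_unlock(&shared.mutex);
    return NULL;
}

int
main(void)
{
    enum { NWORKERS = 4 };
    pthread_t tid[NWORKERS];

    for (int i = 0; i < NWORKERS; i++)
        pthread_create(&tid[i], NULL, worker, NULL);

    /* leader: sleep until every participant has checked in */
    pthread_mutex_lock(&shared.mutex);
    while (shared.nparticipantsdone < NWORKERS)
        pthread_cond_wait(&shared.donecv, &shared.mutex);
    pthread_mutex_unlock(&shared.mutex);

    for (int i = 0; i < NWORKERS; i++)
        pthread_join(tid[i], NULL);

    printf("all %d participants done\n", NWORKERS);
    return 0;
}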
2599 
2600 /*
2601  * Within leader, wait for end of heap scan and merge per-worker results.
2602  *
2603  * After waiting for all workers to finish, merge the per-worker results into
2604  * the complete index. The results from each worker are sorted by block number
2605  * (start of the page range). While combining the per-worker results we merge
2606  * summaries for the same page range, and also fill in empty summaries for
2607  * ranges without any tuples.
2608  *
2609  * Returns the total number of heap tuples scanned.
2610  */
2611 static double
2612 _brin_parallel_merge(BrinBuildState *state)
2613 {
2614  BrinTuple *btup;
2615  BrinMemTuple *memtuple = NULL;
2616  Size tuplen;
2617  BlockNumber prevblkno = InvalidBlockNumber;
2618  MemoryContext rangeCxt,
2619  oldCxt;
2620  double reltuples;
2621 
2622  /* wait for workers to scan table and produce partial results */
2623  reltuples = _brin_parallel_heapscan(state);
2624 
2625  /* do the actual sort in the leader */
2626  tuplesort_performsort(state->bs_sortstate);
2627 
2628  /*
2629  * Initialize BrinMemTuple we'll use to union summaries from workers (in
2630  * case they happened to produce parts of the same page range).
2631  */
2632  memtuple = brin_new_memtuple(state->bs_bdesc);
2633 
2634  /*
2635  * Create a memory context we'll reset to combine results for a single
2636  * page range (received from the workers). We don't expect a huge number
2637  * of overlaps under regular circumstances (because for large tables the
2638  * chunk size is likely larger than the BRIN page range), but it can
2639  * happen, and the union functions may do all kinds of stuff. So we had
2640  * better reset the context once in a while.
2641  */
2643  "brin union",
2645  oldCxt = MemoryContextSwitchTo(rangeCxt);
2646 
2647  /*
2648  * Read the BRIN tuples from the shared tuplesort, sorted by block number.
2649  * That probably gives us an index that is cheaper to scan, thanks to
2650  * mostly getting data from the same index page as before.
2651  */
2652  while ((btup = tuplesort_getbrintuple(state->bs_sortstate, &tuplen, true)) != NULL)
2653  {
2654  /* Ranges should be multiples of pages_per_range for the index. */
2655  Assert(btup->bt_blkno % state->bs_leader->brinshared->pagesPerRange == 0);
2656 
2657  /*
2658  * Do we need to union summaries for the same page range?
2659  *
2660  * If this is the first brin tuple we read, then just deform it into
2661  * the memtuple, and continue with the next one from tuplesort. We
2662  * however may need to insert empty summaries into the index.
2663  *
2664  * If it's the same block as the last we saw, we simply union the brin
2665  * tuple into it, and we're done - we don't even need to insert empty
2666  * ranges, because that was done earlier when we saw the first brin
2667  * tuple (for this range).
2668  *
2669  * Finally, if it's not the first brin tuple, and it's not the same
2670  * page range, we need to do the insert and then deform the tuple into
2671  * the memtuple. Then we'll insert empty ranges before the new brin
2672  * tuple, if needed.
2673  */
2674  if (prevblkno == InvalidBlockNumber)
2675  {
2676  /* First brin tuple, just deform into memtuple. */
2677  memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2678 
2679  /* continue to insert empty pages before this block */
2680  }
2681  else if (memtuple->bt_blkno == btup->bt_blkno)
2682  {
2683  /*
2684  * Not the first brin tuple, but same page range as the previous
2685  * one, so we can merge it into the memtuple.
2686  */
2687  union_tuples(state->bs_bdesc, memtuple, btup);
2688  continue;
2689  }
2690  else
2691  {
2692  BrinTuple *tmp;
2693  Size len;
2694 
2695  /*
2696  * We got brin tuple for a different page range, so form a brin
2697  * tuple from the memtuple, insert it, and re-init the memtuple
2698  * from the new brin tuple.
2699  */
2700  tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2701  memtuple, &len);
2702 
2703  brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2704  &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2705 
2706  /*
2707  * Reset the per-output-range context. This frees all the memory
2708  * possibly allocated by the union functions, and also the BRIN
2709  * tuple we just formed and inserted.
2710  */
2711  MemoryContextReset(rangeCxt);
2712 
2713  memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2714 
2715  /* continue to insert empty pages before this block */
2716  }
2717 
2718  /* Fill empty ranges for all ranges missing in the tuplesort. */
2719  brin_fill_empty_ranges(state, prevblkno, btup->bt_blkno);
2720 
2721  prevblkno = btup->bt_blkno;
2722  }
2723 
2724  tuplesort_end(state->bs_sortstate);
2725 
2726  /* Fill the BRIN tuple for the last page range with data. */
2727  if (prevblkno != InvalidBlockNumber)
2728  {
2729  BrinTuple *tmp;
2730  Size len;
2731 
2732  tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2733  memtuple, &len);
2734 
2735  brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2736  &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2737 
2738  pfree(tmp);
2739  }
2740 
2741  /* Fill empty ranges at the end, for all ranges missing in the tuplesort. */
2742  brin_fill_empty_ranges(state, prevblkno, state->bs_maxRangeStart);
2743 
2744  /*
2745  * Switch back to the original memory context, and destroy the one we
2746  * created to isolate the union_tuple calls.
2747  */
2748  MemoryContextSwitchTo(oldCxt);
2749  MemoryContextDelete(rangeCxt);
2750 
2751  return reltuples;
2752 }
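Stripped of the tuplesort and index machinery, the merge loop above has three moves: union duplicate summaries for the same range, insert the previous range's merged summary when a new range start appears, and fill the gaps with empty summaries. A standalone sketch of that skeleton (not PostgreSQL code; printf stands in for the tuplesort reads and the index inserts):

/* Standalone illustration only -- not PostgreSQL code. */
#include <stdio.h>

typedef unsigned int BlockNumber;
#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)

int
main(void)
{
    BlockNumber pagesPerRange = 128;
    BlockNumber maxRangeStart = 128 * 6;          /* table has 6 ranges */
    /* sorted input: two workers both produced a summary for range 256 */
    BlockNumber input[] = {0, 256, 256, 512};
    int         ninput = 4;
    BlockNumber prevblkno = InvalidBlockNumber;

    for (int i = 0; i < ninput; i++)
    {
        BlockNumber blkno = input[i];

        if (blkno == prevblkno)
        {
            printf("union another summary into range %u\n", blkno);
            continue;
        }

        /* a new range start: finish the previous range first */
        if (prevblkno != InvalidBlockNumber)
            printf("insert merged summary for range %u\n", prevblkno);

        /* emit empty summaries for ranges missing from the sorted input */
        for (BlockNumber b = (prevblkno == InvalidBlockNumber)
                 ? 0 : prevblkno + pagesPerRange;
             b < blkno; b += pagesPerRange)
            printf("insert empty summary for range %u\n", b);

        prevblkno = blkno;
    }

    /* finish the last range, then fill trailing ranges with no input at all */
    if (prevblkno != InvalidBlockNumber)
        printf("insert merged summary for range %u\n", prevblkno);

    for (BlockNumber b = (prevblkno == InvalidBlockNumber)
             ? 0 : prevblkno + pagesPerRange;
         b < maxRangeStart; b += pagesPerRange)
        printf("insert empty summary for range %u\n", b);
    return 0;
}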
2753 
2754 /*
2755  * Returns size of shared memory required to store state for a parallel
2756  * brin index build based on the snapshot its parallel scan will use.
2757  */
2758 static Size
2759 _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
2760 {
2761  /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
2762  return add_size(BUFFERALIGN(sizeof(BrinShared)),
2763  table_parallelscan_estimate(heap, snapshot));
2764 }
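The BUFFERALIGN() applied here is ordinary round-up-to-alignment arithmetic. A standalone sketch (not the actual c.h macros; the 64-byte boundary below is just an example value):

/* Standalone illustration only -- not PostgreSQL code. */
#include <stddef.h>
#include <stdio.h>

/* Round len up to the next multiple of the (power-of-two) alignment. */
#define ALIGN_UP(alignval, len) \
    (((size_t) (len) + ((alignval) - 1)) & ~((size_t) ((alignval) - 1)))

int
main(void)
{
    /* e.g. a 92-byte header rounded up to a 64-byte boundary becomes 128 */
    printf("%zu\n", ALIGN_UP(64, 92));    /* prints 128 */
    printf("%zu\n", ALIGN_UP(64, 128));   /* prints 128 (already aligned) */
    return 0;
}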
2765 
2766 /*
2767  * Within leader, participate as a parallel worker.
2768  */
2769 static void
2770 _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)
2771 {
2772  BrinLeader *brinleader = buildstate->bs_leader;
2773  int sortmem;
2774 
2775  /*
2776  * Might as well use a reliable figure when doling out maintenance_work_mem
2777  * (when the requested number of workers was not launched, this will be
2778  * somewhat higher than it is for other workers).
2779  */
2780  sortmem = maintenance_work_mem / brinleader->nparticipanttuplesorts;
2781 
2782  /* Perform work common to all participants */
2783  _brin_parallel_scan_and_build(buildstate, brinleader->brinshared,
2784  brinleader->sharedsort, heap, index, sortmem, true);
2785 }
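The sortmem computation here, and the matching one in the worker path, is plain integer division of maintenance_work_mem (expressed in kB) over the participant count, with the leader counted when it joins the scan itself. A standalone sketch with example numbers (not PostgreSQL code):

/* Standalone illustration only -- not PostgreSQL code. */
#include <stdio.h>

int
main(void)
{
    int maintenance_work_mem = 65536;   /* 64MB, expressed in kB */
    int nworkers_launched = 3;
    int leader_participates = 1;
    int nparticipants = nworkers_launched + (leader_participates ? 1 : 0);
    int sortmem = maintenance_work_mem / nparticipants;

    printf("each participant sorts with %d kB\n", sortmem);  /* 16384 kB */
    return 0;
}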
2786 
2787 /*
2788  * Perform a worker's portion of a parallel sort.
2789  *
2790  * This generates a tuplesort for the worker portion of the table.
2791  *
2792  * sortmem is the amount of working memory to use within each worker,
2793  * expressed in KBs.
2794  *
2795  * When this returns, workers are done, and need only release resources.
2796  */
2797 static void
2798 _brin_parallel_scan_and_build(BrinBuildState *state,
2799  BrinShared *brinshared, Sharedsort *sharedsort,
2800  Relation heap, Relation index,
2801  int sortmem, bool progress)
2802 {
2803  SortCoordinate coordinate;
2804  TableScanDesc scan;
2805  double reltuples;
2806  IndexInfo *indexInfo;
2807 
2808  /* Initialize local tuplesort coordination state */
2809  coordinate = palloc0(sizeof(SortCoordinateData));
2810  coordinate->isWorker = true;
2811  coordinate->nParticipants = -1;
2812  coordinate->sharedsort = sharedsort;
2813 
2814  /* Begin "partial" tuplesort */
2815  state->bs_sortstate = tuplesort_begin_index_brin(sortmem, coordinate,
2816  TUPLESORT_NONE);
2817 
2818  /* Join parallel scan */
2819  indexInfo = BuildIndexInfo(index);
2820  indexInfo->ii_Concurrent = brinshared->isconcurrent;
2821 
2822  scan = table_beginscan_parallel(heap,
2823  ParallelTableScanFromBrinShared(brinshared));
2824 
2825  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
2826  brinbuildCallbackParallel, state, scan);
2827 
2828  /* insert the last item */
2829  form_and_spill_tuple(state);
2830 
2831  /* sort the BRIN ranges built by this worker */
2832  tuplesort_performsort(state->bs_sortstate);
2833 
2834  state->bs_reltuples += reltuples;
2835 
2836  /*
2837  * Done. Record ambuild statistics.
2838  */
2839  SpinLockAcquire(&brinshared->mutex);
2840  brinshared->nparticipantsdone++;
2841  brinshared->reltuples += state->bs_reltuples;
2842  brinshared->indtuples += state->bs_numtuples;
2843  SpinLockRelease(&brinshared->mutex);
2844 
2845  /* Notify leader */
2846  ConditionVariableSignal(&brinshared->workersdonecv);
2847 
2848  tuplesort_end(state->bs_sortstate);
2849 }
2850 
2851 /*
2852  * Perform work within a launched parallel process.
2853  */
2854 void
2855 _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
2856 {
2857  char *sharedquery;
2858  BrinShared *brinshared;
2859  Sharedsort *sharedsort;
2860  BrinBuildState *buildstate;
2861  Relation heapRel;
2862  Relation indexRel;
2863  LOCKMODE heapLockmode;
2864  LOCKMODE indexLockmode;
2865  WalUsage *walusage;
2866  BufferUsage *bufferusage;
2867  int sortmem;
2868 
2869  /*
2870  * The only possible status flag that can be set for the parallel worker is
2871  * PROC_IN_SAFE_IC.
2872  */
2873  Assert((MyProc->statusFlags == 0) ||
2874  (MyProc->statusFlags == PROC_IN_SAFE_IC));
2875 
2876  /* Set debug_query_string for individual workers first */
2877  sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
2878  debug_query_string = sharedquery;
2879 
2880  /* Report the query string from leader */
2881  pgstat_report_activity(STATE_RUNNING, debug_query_string);
2882 
2883  /* Look up brin shared state */
2884  brinshared = shm_toc_lookup(toc, PARALLEL_KEY_BRIN_SHARED, false);
2885 
2886  /* Open relations using lock modes known to be obtained by index.c */
2887  if (!brinshared->isconcurrent)
2888  {
2889  heapLockmode = ShareLock;
2890  indexLockmode = AccessExclusiveLock;
2891  }
2892  else
2893  {
2894  heapLockmode = ShareUpdateExclusiveLock;
2895  indexLockmode = RowExclusiveLock;
2896  }
2897 
2898  /* Track query ID */
2899  pgstat_report_query_id(brinshared->queryid, false);
2900 
2901  /* Open relations within worker */
2902  heapRel = table_open(brinshared->heaprelid, heapLockmode);
2903  indexRel = index_open(brinshared->indexrelid, indexLockmode);
2904 
2905  buildstate = initialize_brin_buildstate(indexRel, NULL,
2906  brinshared->pagesPerRange,
2907  InvalidBlockNumber);
2908 
2909  /* Look up shared state private to tuplesort.c */
2910  sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
2911  tuplesort_attach_shared(sharedsort, seg);
2912 
2913  /* Prepare to track buffer usage during parallel execution */
2914  InstrStartParallelQuery();
2915 
2916  /*
2917  * Might as well use a reliable figure when doling out maintenance_work_mem
2918  * (when the requested number of workers was not launched, this will be
2919  * somewhat higher than it is for other workers).
2920  */
2921  sortmem = maintenance_work_mem / brinshared->scantuplesortstates;
2922 
2923  _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort,
2924  heapRel, indexRel, sortmem, false);
2925 
2926  /* Report WAL/buffer usage during parallel execution */
2927  bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
2928  walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
2929  InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
2930  &walusage[ParallelWorkerNumber]);
2931 
2932  index_close(indexRel, indexLockmode);
2933  table_close(heapRel, heapLockmode);
2934 }
2935 
2936 /*
2937  * brin_build_empty_tuple
2938  * Maybe initialize a BRIN tuple representing empty range.
2939  *
2940  * Returns a BRIN tuple representing an empty page range starting at the
2941  * specified block number. The empty tuple is initialized only once, when it's
2942  * needed for the first time, stored in the memory context bs_context to ensure
2943  * proper life span, and reused on following calls. All empty tuples are
2944  * exactly the same except for the bt_blkno field, which is set to the value
2945  * in blkno parameter.
2946  */
2947 static void
2948 brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
2949 {
2950  /* First time an empty tuple is requested? If yes, initialize it. */
2951  if (state->bs_emptyTuple == NULL)
2952  {
2953  MemoryContext oldcxt;
2954  BrinMemTuple *dtuple = brin_new_memtuple(state->bs_bdesc);
2955 
2956  /* Allocate the tuple in context for the whole index build. */
2957  oldcxt = MemoryContextSwitchTo(state->bs_context);
2958 
2959  state->bs_emptyTuple = brin_form_tuple(state->bs_bdesc, blkno, dtuple,
2960  &state->bs_emptyTupleLen);
2961 
2962  MemoryContextSwitchTo(oldcxt);
2963  }
2964  else
2965  {
2966  /* If we already have an empty tuple, just update the block. */
2967  state->bs_emptyTuple->bt_blkno = blkno;
2968  }
2969 }
2970 
2971 /*
2972  * brin_fill_empty_ranges
2973  * Add BRIN index tuples representing empty page ranges.
2974  *
2975  * prevRange/nextRange determine for which page ranges to add empty summaries.
2976  * Both boundaries are exclusive, i.e. only ranges starting at blkno for which
2977  * (prevRange < blkno < nextRange) will be added to the index.
2978  *
2979  * If prevRange is InvalidBlockNumber, this means there was no previous page
2980  * range (i.e. the first empty range to add is for blkno=0).
2981  *
2982  * The empty tuple is built only once, and then reused for all future calls.
2983  */
2984 static void
2985 brin_fill_empty_ranges(BrinBuildState *state,
2986  BlockNumber prevRange, BlockNumber nextRange)
2987 {
2988  BlockNumber blkno;
2989 
2990  /*
2991  * If we already summarized some ranges, we need to start with the next
2992  * one. Otherwise start from the first range of the table.
2993  */
2994  blkno = (prevRange == InvalidBlockNumber) ? 0 : (prevRange + state->bs_pagesPerRange);
2995 
2996  /* Generate empty ranges until we hit the next non-empty range. */
2997  while (blkno < nextRange)
2998  {
2999  /* Did we already build the empty tuple? If not, do it now. */
3000  brin_build_empty_tuple(state, blkno);
3001 
3002  brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
3003  &state->bs_currentInsertBuf,
3004  blkno, state->bs_emptyTuple, state->bs_emptyTupleLen);
3005 
3006  /* try next page range */
3007  blkno += state->bs_pagesPerRange;
3008  }
3009 }
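The exclusive-bounds enumeration above, with InvalidBlockNumber meaning "no previous range", can be shown on its own. A standalone sketch (not PostgreSQL code) that prints the range starts which would receive empty summaries:

/* Standalone illustration only -- not PostgreSQL code. */
#include <stdio.h>

typedef unsigned int BlockNumber;
#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)

/* Print the range starts strictly between prevRange and nextRange. */
static void
fill_empty_ranges(BlockNumber prevRange, BlockNumber nextRange,
                  BlockNumber pagesPerRange)
{
    BlockNumber blkno = (prevRange == InvalidBlockNumber)
        ? 0 : prevRange + pagesPerRange;

    for (; blkno < nextRange; blkno += pagesPerRange)
        printf("empty summary at %u\n", blkno);
}

int
main(void)
{
    /* previous summary at 128, next real summary at 640: fills 256, 384, 512 */
    fill_empty_ranges(128, 640, 128);
    /* no previous summary, next at 256: fills 0 and 128 */
    fill_empty_ranges(InvalidBlockNumber, 256, 128);
    return 0;
}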
@ ACLCHECK_NOT_OWNER
Definition: acl.h:185
void aclcheck_error(AclResult aclerr, ObjectType objtype, const char *objectname)
Definition: aclchk.c:2703
bool object_ownercheck(Oid classid, Oid objectid, Oid roleid)
Definition: aclchk.c:4145
int16 AttrNumber
Definition: attnum.h:21
bool AutoVacuumRequestWork(AutoVacuumWorkItemType type, Oid relationId, BlockNumber blkno)
Definition: autovacuum.c:3212
@ AVW_BRINSummarizeRange
Definition: autovacuum.h:25
int ParallelWorkerNumber
Definition: parallel.c:112
void InitializeParallelDSM(ParallelContext *pcxt)
Definition: parallel.c:205
void WaitForParallelWorkersToFinish(ParallelContext *pcxt)
Definition: parallel.c:775
void LaunchParallelWorkers(ParallelContext *pcxt)
Definition: parallel.c:552
void DestroyParallelContext(ParallelContext *pcxt)
Definition: parallel.c:929
ParallelContext * CreateParallelContext(const char *library_name, const char *function_name, int nworkers)
Definition: parallel.c:167
void WaitForParallelWorkersToAttach(ParallelContext *pcxt)
Definition: parallel.c:672
uint64 pgstat_get_my_query_id(void)
void pgstat_report_query_id(uint64 query_id, bool force)
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define MaxBlockNumber
Definition: block.h:35
static Datum values[MAXATTR]
Definition: bootstrap.c:150
#define PARALLEL_KEY_BUFFER_USAGE
Definition: brin.c:51
IndexBulkDeleteResult * brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition: brin.c:1309
void brininsertcleanup(Relation index, IndexInfo *indexInfo)
Definition: brin.c:507
static double _brin_parallel_merge(BrinBuildState *state)
Definition: brin.c:2612
static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
Definition: brin.c:2164
Datum brin_desummarize_range(PG_FUNCTION_ARGS)
Definition: brin.c:1483
void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition: brin.c:950
static void terminate_brin_buildstate(BrinBuildState *state)
Definition: brin.c:1708
#define PARALLEL_KEY_BRIN_SHARED
Definition: brin.c:47
Datum brin_summarize_range(PG_FUNCTION_ARGS)
Definition: brin.c:1372
static void form_and_spill_tuple(BrinBuildState *state)
Definition: brin.c:1998
#define BRIN_ALL_BLOCKRANGES
Definition: brin.c:209
struct BrinShared BrinShared
Datum brin_summarize_new_values(PG_FUNCTION_ARGS)
Definition: brin.c:1357
IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys)
Definition: brin.c:532
IndexBuildResult * brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: brin.c:1096
int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition: brin.c:560
static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, bool include_partial, double *numSummarized, double *numExisting)
Definition: brin.c:1879
static void form_and_insert_tuple(BrinBuildState *state)
Definition: brin.c:1977
void brinbuildempty(Relation index)
Definition: brin.c:1265
void brin_free_desc(BrinDesc *bdesc)
Definition: brin.c:1628
struct BrinInsertState BrinInsertState
static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
Definition: brin.c:2023
static void _brin_parallel_scan_and_build(BrinBuildState *state, BrinShared *brinshared, Sharedsort *sharedsort, Relation heap, Relation index, int sortmem, bool progress)
Definition: brin.c:2798
static BrinBuildState * initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap, BlockNumber pagesPerRange, BlockNumber tablePages)
Definition: brin.c:1661
static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request)
Definition: brin.c:2355
void brinGetStats(Relation index, BrinStatsData *stats)
Definition: brin.c:1640
BrinDesc * brin_build_desc(Relation rel)
Definition: brin.c:1573
static void _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)
Definition: brin.c:2770
static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup, const Datum *values, const bool *nulls)
Definition: brin.c:2197
static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
Definition: brin.c:2540
static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
Definition: brin.c:2759
struct BrinBuildState BrinBuildState
static void brin_fill_empty_ranges(BrinBuildState *state, BlockNumber prevRange, BlockNumber nextRange)
Definition: brin.c:2985
struct BrinLeader BrinLeader
struct BrinOpaque BrinOpaque
static void summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, BlockNumber heapBlk, BlockNumber heapNumBlks)
Definition: brin.c:1753
#define ParallelTableScanFromBrinShared(shared)
Definition: brin.c:116
#define PARALLEL_KEY_TUPLESORT
Definition: brin.c:48
static void brinbuildCallbackParallel(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *brstate)
Definition: brin.c:1037
bool brininsert(Relation idxRel, Datum *values, bool *nulls, ItemPointer heaptid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition: brin.c:339
#define PARALLEL_KEY_QUERY_TEXT
Definition: brin.c:49
Datum brinhandler(PG_FUNCTION_ARGS)
Definition: brin.c:250
void _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Definition: brin.c:2855
static void brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
Definition: brin.c:2948
#define PARALLEL_KEY_WAL_USAGE
Definition: brin.c:50
bytea * brinoptions(Datum reloptions, bool validate)
Definition: brin.c:1339
static double _brin_parallel_heapscan(BrinBuildState *state)
Definition: brin.c:2571
IndexBulkDeleteResult * brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition: brin.c:1294
static BrinInsertState * initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
Definition: brin.c:310
static void brinbuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *brstate)
Definition: brin.c:986
void brinendscan(IndexScanDesc scan)
Definition: brin.c:969
static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
Definition: brin.c:2291
#define BrinGetPagesPerRange(relation)
Definition: brin.h:40
#define BrinGetAutoSummarize(relation)
Definition: brin.h:46
#define BRIN_LAST_OPTIONAL_PROCNUM
Definition: brin_internal.h:78
#define BRIN_PROCNUM_UNION
Definition: brin_internal.h:73
#define BRIN_PROCNUM_OPTIONS
Definition: brin_internal.h:75
#define BRIN_PROCNUM_OPCINFO
Definition: brin_internal.h:70
#define BRIN_PROCNUM_CONSISTENT
Definition: brin_internal.h:72
#define BRIN_elog(args)
Definition: brin_internal.h:85
#define BRIN_PROCNUM_ADDVALUE
Definition: brin_internal.h:71
#define BRIN_CURRENT_VERSION
Definition: brin_page.h:72
#define BRIN_METAPAGE_BLKNO
Definition: brin_page.h:75
bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, BlockNumber heapBlk, Buffer oldbuf, OffsetNumber oldoff, const BrinTuple *origtup, Size origsz, const BrinTuple *newtup, Size newsz, bool samepage)
Definition: brin_pageops.c:53
void brin_page_cleanup(Relation idxrel, Buffer buf)
Definition: brin_pageops.c:624
OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, BrinTuple *tup, Size itemsz)
Definition: brin_pageops.c:342
void brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
Definition: brin_pageops.c:486
bool brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
Definition: brin_pageops.c:323
bool brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk)
Definition: brin_revmap.c:323
void brinRevmapTerminate(BrinRevmap *revmap)
Definition: brin_revmap.c:100
BrinTuple * brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, Buffer *buf, OffsetNumber *off, Size *size, int mode)
Definition: brin_revmap.c:194
BrinRevmap * brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange)
Definition: brin_revmap.c:70
BrinTuple * brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, Size *size)
Definition: brin_tuple.c:99
BrinMemTuple * brin_new_memtuple(BrinDesc *brdesc)
Definition: brin_tuple.c:482
BrinMemTuple * brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple)
Definition: brin_tuple.c:553
BrinMemTuple * brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
Definition: brin_tuple.c:511
BrinTuple * brin_copy_tuple(BrinTuple *tuple, Size len, BrinTuple *dest, Size *destsz)
Definition: brin_tuple.c:446
void brin_free_tuple(BrinTuple *tuple)
Definition: brin_tuple.c:433
BrinTuple * brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size)
Definition: brin_tuple.c:388
bool brinvalidate(Oid opclassoid)
Definition: brin_validate.c:37
#define SizeOfBrinCreateIdx
Definition: brin_xlog.h:55
#define XLOG_BRIN_CREATE_INDEX
Definition: brin_xlog.h:31
int Buffer
Definition: buf.h:23
#define BufferIsInvalid(buffer)
Definition: buf.h:31
#define InvalidBuffer
Definition: buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3706
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:846
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4906
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4923
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2514
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5140
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:793
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:746
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:189
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:190
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:273
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:400
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:74
@ EB_LOCK_FIRST
Definition: bufmgr.h:86
@ RBM_NORMAL
Definition: bufmgr.h:45
#define BMR_REL(p_rel)
Definition: bufmgr.h:107
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:351
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:907
static char * PageGetContents(Page page)
Definition: bufpage.h:257
Pointer Page
Definition: bufpage.h:81
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:243
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
#define Min(x, y)
Definition: c.h:1007
#define MAXALIGN(LEN)
Definition: c.h:814
signed int int32
Definition: c.h:497
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:185
#define BUFFERALIGN(LEN)
Definition: c.h:816
#define Assert(condition)
Definition: c.h:861
#define lengthof(array)
Definition: c.h:791
#define OidIsValid(objectId)
Definition: c.h:778
size_t Size
Definition: c.h:608
bool ConditionVariableCancelSleep(void)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void ConditionVariableSignal(ConditionVariable *cv)
Datum datumCopy(Datum value, bool typByVal, int typLen)
Definition: datum.c:132
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define LOG
Definition: elog.h:31
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
#define palloc_object(type)
Definition: fe_memutils.h:62
#define palloc_array(type, count)
Definition: fe_memutils.h:64
#define palloc0_array(type, count)
Definition: fe_memutils.h:65
#define palloc0_object(type)
Definition: fe_memutils.h:63
Datum FunctionCall4Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3, Datum arg4)
Definition: fmgr.c:1196
Datum Int64GetDatum(int64 X)
Definition: fmgr.c:1807
Datum FunctionCall3Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3)
Definition: fmgr.c:1171
void fmgr_info_copy(FmgrInfo *dstinfo, FmgrInfo *srcinfo, MemoryContext destcxt)
Definition: fmgr.c:580
#define PG_RETURN_VOID()
Definition: fmgr.h:349
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define DirectFunctionCall2(func, arg1, arg2)
Definition: fmgr.h:643
#define PG_GETARG_DATUM(n)
Definition: fmgr.h:268
#define PG_GETARG_INT64(n)
Definition: fmgr.h:283
#define FunctionCall1(flinfo, arg1)
Definition: fmgr.h:659
#define PG_RETURN_INT32(x)
Definition: fmgr.h:354
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
void FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
Definition: freespace.c:377
void FreeSpaceMapVacuum(Relation rel)
Definition: freespace.c:358
void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
Definition: freespace.c:194
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:80
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition: genam.h:87
IndexUniqueCheck
Definition: genam.h:116
int maintenance_work_mem
Definition: globals.c:132
int NewGUCNestLevel(void)
Definition: guc.c:2234
void RestrictSearchPath(void)
Definition: guc.c:2245
void AtEOXact_GUC(bool isCommit, int nestLevel)
Definition: guc.c:2261
Oid IndexGetRelation(Oid indexId, bool missing_ok)
Definition: index.c:3534
IndexInfo * BuildIndexInfo(Relation index)
Definition: index.c:2430
void index_close(Relation relation, LOCKMODE lockmode)
Definition: indexam.c:177
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:862
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition: indexam.c:133
void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:218
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:208
void InstrStartParallelQuery(void)
Definition: instrument.c:200
int b
Definition: isn.c:70
int a
Definition: isn.c:69
int i
Definition: isn.c:73
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
int LOCKMODE
Definition: lockdefs.h:26
#define AccessExclusiveLock
Definition: lockdefs.h:43
#define AccessShareLock
Definition: lockdefs.h:36
#define ShareUpdateExclusiveLock
Definition: lockdefs.h:39
#define ShareLock
Definition: lockdefs.h:40
#define RowExclusiveLock
Definition: lockdefs.h:38
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:383
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454
void * palloc(Size size)
Definition: mcxt.c:1317
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
#define ALLOCSET_SMALL_SIZES
Definition: memutils.h:170
#define SECURITY_RESTRICTED_OPERATION
Definition: miscadmin.h:312
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
void GetUserIdAndSecContext(Oid *userid, int *sec_context)
Definition: miscinit.c:635
Oid GetUserId(void)
Definition: miscinit.c:514
void SetUserIdAndSecContext(Oid userid, int sec_context)