PostgreSQL source code (git master): src/backend/access/brin/brin.c
1 /*
2  * brin.c
3  * Implementation of BRIN indexes for Postgres
4  *
5  * See src/backend/access/brin/README for details.
6  *
7  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/backend/access/brin/brin.c
12  *
13  * TODO
14  * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
15  */
16 #include "postgres.h"
17 
18 #include "access/brin.h"
19 #include "access/brin_page.h"
20 #include "access/brin_pageops.h"
21 #include "access/brin_xlog.h"
22 #include "access/relation.h"
23 #include "access/reloptions.h"
24 #include "access/relscan.h"
25 #include "access/table.h"
26 #include "access/tableam.h"
27 #include "access/xloginsert.h"
28 #include "catalog/index.h"
29 #include "catalog/pg_am.h"
30 #include "commands/vacuum.h"
31 #include "miscadmin.h"
32 #include "pgstat.h"
33 #include "postmaster/autovacuum.h"
34 #include "storage/bufmgr.h"
35 #include "storage/freespace.h"
36 #include "tcop/tcopprot.h" /* pgrminclude ignore */
37 #include "utils/acl.h"
38 #include "utils/datum.h"
39 #include "utils/fmgrprotos.h"
40 #include "utils/guc.h"
41 #include "utils/index_selfuncs.h"
42 #include "utils/memutils.h"
43 #include "utils/rel.h"
44 #include "utils/tuplesort.h"
45 
46 /* Magic numbers for parallel state sharing */
47 #define PARALLEL_KEY_BRIN_SHARED UINT64CONST(0xB000000000000001)
48 #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
49 #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
50 #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
51 #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
52 
53 /*
54  * Status for index builds performed in parallel. This is allocated in a
55  * dynamic shared memory segment.
56  */
57 typedef struct BrinShared
58 {
59  /*
60  * These fields are not modified during the build. They primarily exist
61  * for the benefit of worker processes that need to create state
62  * corresponding to that used by the leader.
63  */
69 
70  /*
71  * workersdonecv is used to monitor the progress of workers. All parallel
72  * participants must indicate that they are done before leader can use
73  * results built by the workers (and before leader can write the data into
74  * the index).
75  */
77 
78  /*
79  * mutex protects all fields before heapdesc.
80  *
81  * These fields contain status information of interest to BRIN index
82  * builds that must work just the same when an index is built in parallel.
83  */
85 
86  /*
87  * Mutable state that is maintained by workers, and reported back to
88  * leader at end of the scans.
89  *
90  * nparticipantsdone is number of worker processes finished.
91  *
92  * reltuples is the total number of input heap tuples.
93  *
94  * indtuples is the total number of tuples that made it into the index.
95  */
97  double reltuples;
98  double indtuples;
99 
100  /*
101  * ParallelTableScanDescData data follows. Can't directly embed here, as
102  * implementations of the parallel table scan desc interface might need
103  * stronger alignment.
104  */
105 } BrinShared;
106 
107 /*
108  * Return pointer to a BrinShared's parallel table scan.
109  *
110  * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
111  * MAXALIGN.
112  */
113 #define ParallelTableScanFromBrinShared(shared) \
114  (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BrinShared)))
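/*
 * Illustrative sketch (editorial example, names follow the keys defined
 * above): both the leader and the workers are expected to reach the parallel
 * scan descriptor through this macro rather than via a separate shm_toc
 * entry, roughly as in:
 *
 *     BrinShared *brinshared = shm_toc_lookup(toc, PARALLEL_KEY_BRIN_SHARED, false);
 *     ParallelTableScanDesc pscan = ParallelTableScanFromBrinShared(brinshared);
 *     TableScanDesc scan = table_beginscan_parallel(heapRel, pscan);
 */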
115 
116 /*
117  * Status for leader in parallel index build.
118  */
119 typedef struct BrinLeader
120 {
121  /* parallel context itself */
123 
124  /*
125  * nparticipanttuplesorts is the exact number of worker processes
126  * successfully launched, plus one leader process if it participates as a
127  * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
128  * participating as a worker).
129  */
131 
132  /*
133  * Leader process convenience pointers to shared state (leader avoids TOC
134  * lookups).
135  *
136  * brinshared is the shared state for entire build. sharedsort is the
137  * shared, tuplesort-managed state passed to each process tuplesort.
138  * snapshot is the snapshot used by the scan iff an MVCC snapshot is
139  * required.
140  */
141  BrinShared *brinshared;
142  Sharedsort *sharedsort;
143  Snapshot snapshot;
144  WalUsage *walusage;
145  BufferUsage *bufferusage;
146 } BrinLeader;
147 
148 /*
149  * We use a BrinBuildState during initial construction of a BRIN index.
150  * The running state is kept in a BrinMemTuple.
151  */
152 typedef struct BrinBuildState
153 {
155  double bs_numtuples;
156  double bs_reltuples;
164 
168 
169  /*
170  * bs_leader is only present when a parallel index build is performed, and
171  * only in the leader process. (Actually, only the leader process has a
172  * BrinBuildState.)
173  */
174  BrinLeader *bs_leader;
175  int bs_worker_id;
176 
177  /*
178  * The sortstate is used by workers (including the leader). It has to be
179  * part of the build state, because that's the only thing passed to the
180  * build callback etc.
181  */
182  Tuplesortstate *bs_sortstate;
183 } BrinBuildState;
184 
185 /*
186  * We use a BrinInsertState to capture running state spanning multiple
187  * brininsert invocations, within the same command.
188  */
189 typedef struct BrinInsertState
190 {
191  BrinRevmap *bis_rmAccess;
192  BrinDesc *bis_desc;
193  BlockNumber bis_pages_per_range;
194 } BrinInsertState;
195 
196 /*
197  * Struct used as "opaque" during index scans
198  */
199 typedef struct BrinOpaque
200 {
201  BlockNumber bo_pagesPerRange;
202  BrinRevmap *bo_rmAccess;
203  BrinDesc *bo_bdesc;
204 } BrinOpaque;
205 
206 #define BRIN_ALL_BLOCKRANGES InvalidBlockNumber
207 
208 static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
209  BrinRevmap *revmap,
210  BlockNumber pagesPerRange,
211  BlockNumber tablePages);
214 static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
215  bool include_partial, double *numSummarized, double *numExisting);
218 static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
219  BrinTuple *b);
220 static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
221 static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
222  BrinMemTuple *dtup, const Datum *values, const bool *nulls);
223 static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
224 static void brin_fill_empty_ranges(BrinBuildState *state,
225  BlockNumber prevRange, BlockNumber maxRange);
226 
227 /* parallel index builds */
228 static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
229  bool isconcurrent, int request);
230 static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state);
232 static void _brin_leader_participate_as_worker(BrinBuildState *buildstate,
233  Relation heap, Relation index);
234 static void _brin_parallel_scan_and_build(BrinBuildState *buildstate,
235  BrinShared *brinshared,
236  Sharedsort *sharedsort,
237  Relation heap, Relation index,
238  int sortmem, bool progress);
239 
240 /*
241  * BRIN handler function: return IndexAmRoutine with access method parameters
242  * and callbacks.
243  */
244 Datum
245 brinhandler(PG_FUNCTION_ARGS)
246 {
247  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
248 
249  amroutine->amstrategies = 0;
251  amroutine->amoptsprocnum = BRIN_PROCNUM_OPTIONS;
252  amroutine->amcanorder = false;
253  amroutine->amcanorderbyop = false;
254  amroutine->amcanbackward = false;
255  amroutine->amcanunique = false;
256  amroutine->amcanmulticol = true;
257  amroutine->amoptionalkey = true;
258  amroutine->amsearcharray = false;
259  amroutine->amsearchnulls = true;
260  amroutine->amstorage = true;
261  amroutine->amclusterable = false;
262  amroutine->ampredlocks = false;
263  amroutine->amcanparallel = false;
264  amroutine->amcanbuildparallel = true;
265  amroutine->amcaninclude = false;
266  amroutine->amusemaintenanceworkmem = false;
267  amroutine->amsummarizing = true;
268  amroutine->amparallelvacuumoptions =
270  amroutine->amkeytype = InvalidOid;
271 
272  amroutine->ambuild = brinbuild;
273  amroutine->ambuildempty = brinbuildempty;
274  amroutine->aminsert = brininsert;
275  amroutine->aminsertcleanup = brininsertcleanup;
276  amroutine->ambulkdelete = brinbulkdelete;
277  amroutine->amvacuumcleanup = brinvacuumcleanup;
278  amroutine->amcanreturn = NULL;
279  amroutine->amcostestimate = brincostestimate;
280  amroutine->amoptions = brinoptions;
281  amroutine->amproperty = NULL;
282  amroutine->ambuildphasename = NULL;
283  amroutine->amvalidate = brinvalidate;
284  amroutine->amadjustmembers = NULL;
285  amroutine->ambeginscan = brinbeginscan;
286  amroutine->amrescan = brinrescan;
287  amroutine->amgettuple = NULL;
288  amroutine->amgetbitmap = bringetbitmap;
289  amroutine->amendscan = brinendscan;
290  amroutine->ammarkpos = NULL;
291  amroutine->amrestrpos = NULL;
292  amroutine->amestimateparallelscan = NULL;
293  amroutine->aminitparallelscan = NULL;
294  amroutine->amparallelrescan = NULL;
295 
296  PG_RETURN_POINTER(amroutine);
297 }
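/*
 * Illustrative only (editorial example): callers obtain this routine through
 * the pg_am entry for BRIN (amhandler = brinhandler), typically via amapi.c,
 * e.g.:
 *
 *     IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(BRIN_AM_OID, false);
 *     Assert(amroutine->amgetbitmap == bringetbitmap);
 */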
298 
299 /*
300  * Initialize a BrinInsertState to maintain state to be used across multiple
301  * tuple inserts, within the same command.
302  */
303 static BrinInsertState *
304 initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
305 {
306  BrinInsertState *bistate;
307  MemoryContext oldcxt;
308 
309  oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
310  bistate = palloc0(sizeof(BrinInsertState));
311  bistate->bis_desc = brin_build_desc(idxRel);
312  bistate->bis_rmAccess = brinRevmapInitialize(idxRel,
313  &bistate->bis_pages_per_range);
314  indexInfo->ii_AmCache = bistate;
315  MemoryContextSwitchTo(oldcxt);
316 
317  return bistate;
318 }
319 
320 /*
321  * A tuple in the heap is being inserted. To keep a brin index up to date,
322  * we need to obtain the relevant index tuple and compare its stored values
323  * with those of the new tuple. If the tuple values are not consistent with
324  * the summary tuple, we need to update the index tuple.
325  *
326  * If autosummarization is enabled, check if we need to summarize the previous
327  * page range.
328  *
329  * If the range is not currently summarized (i.e. the revmap returns NULL for
330  * it), there's nothing to do for this tuple.
331  */
332 bool
333 brininsert(Relation idxRel, Datum *values, bool *nulls,
334  ItemPointer heaptid, Relation heapRel,
335  IndexUniqueCheck checkUnique,
336  bool indexUnchanged,
337  IndexInfo *indexInfo)
338 {
339  BlockNumber pagesPerRange;
340  BlockNumber origHeapBlk;
341  BlockNumber heapBlk;
342  BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
343  BrinRevmap *revmap;
344  BrinDesc *bdesc;
345  Buffer buf = InvalidBuffer;
346  MemoryContext tupcxt = NULL;
347  MemoryContext oldcxt = CurrentMemoryContext;
348  bool autosummarize = BrinGetAutoSummarize(idxRel);
349 
350  /*
351  * If first time through in this statement, initialize the insert state
352  * that we keep for all the inserts in the command.
353  */
354  if (!bistate)
355  bistate = initialize_brin_insertstate(idxRel, indexInfo);
356 
357  revmap = bistate->bis_rmAccess;
358  bdesc = bistate->bis_desc;
359  pagesPerRange = bistate->bis_pages_per_range;
360 
361  /*
362  * origHeapBlk is the block number where the insertion occurred. heapBlk
363  * is the first block in the corresponding page range.
364  */
365  origHeapBlk = ItemPointerGetBlockNumber(heaptid);
366  heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
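 /*
  * Worked example (illustrative): with pagesPerRange = 128, an insertion
  * into heap block 300 gives heapBlk = (300 / 128) * 128 = 256, i.e. the
  * range covering blocks 256..383.
  */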
367 
368  for (;;)
369  {
370  bool need_insert = false;
371  OffsetNumber off;
372  BrinTuple *brtup;
373  BrinMemTuple *dtup;
374 
376 
377  /*
378  * If auto-summarization is enabled and we just inserted the first
379  * tuple into the first block of a new non-first page range, request a
380  * summarization run of the previous range.
381  */
382  if (autosummarize &&
383  heapBlk > 0 &&
384  heapBlk == origHeapBlk &&
385  ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)
386  {
387  BlockNumber lastPageRange = heapBlk - 1;
388  BrinTuple *lastPageTuple;
389 
390  lastPageTuple =
391  brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,
392  NULL, BUFFER_LOCK_SHARE);
393  if (!lastPageTuple)
394  {
395  bool recorded;
396 
396 
397  recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange,
398  RelationGetRelid(idxRel),
399  lastPageRange);
400  if (!recorded)
401  ereport(LOG,
402  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
403  errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",
404  RelationGetRelationName(idxRel),
405  lastPageRange)));
406  }
407  else
408  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
409  }
410 
411  brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
412  NULL, BUFFER_LOCK_SHARE);
413 
414  /* if range is unsummarized, there's nothing to do */
415  if (!brtup)
416  break;
417 
418  /* First time through in this brininsert call? */
419  if (tupcxt == NULL)
420  {
422  "brininsert cxt",
424  MemoryContextSwitchTo(tupcxt);
425  }
426 
427  dtup = brin_deform_tuple(bdesc, brtup, NULL);
428 
429  need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls);
430 
431  if (!need_insert)
432  {
433  /*
434  * The tuple is consistent with the new values, so there's nothing
435  * to do.
436  */
437  break;
438  }
439  else
440  {
441  Page page = BufferGetPage(buf);
442  ItemId lp = PageGetItemId(page, off);
443  Size origsz;
444  BrinTuple *origtup;
445  Size newsz;
446  BrinTuple *newtup;
447  bool samepage;
448 
449  /*
450  * Make a copy of the old tuple, so that we can compare it after
451  * re-acquiring the lock.
452  */
453  origsz = ItemIdGetLength(lp);
454  origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);
455 
456  /*
457  * Before releasing the lock, check if we can attempt a same-page
458  * update. Another process could insert a tuple concurrently in
459  * the same page though, so downstream we must be prepared to cope
460  * if this turns out to not be possible after all.
461  */
462  newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
463  samepage = brin_can_do_samepage_update(buf, origsz, newsz);
464  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
465 
466  /*
467  * Try to update the tuple. If this doesn't work for whatever
468  * reason, we need to restart from the top; the revmap might be
469  * pointing at a different tuple for this block now, so we need to
470  * recompute to ensure both our new heap tuple and the other
471  * inserter's are covered by the combined tuple. It might be that
472  * we don't need to update at all.
473  */
474  if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
475  buf, off, origtup, origsz, newtup, newsz,
476  samepage))
477  {
478  /* no luck; start over */
479  MemoryContextReset(tupcxt);
480  continue;
481  }
482  }
483 
484  /* success! */
485  break;
486  }
487 
488  if (BufferIsValid(buf))
489  ReleaseBuffer(buf);
490  MemoryContextSwitchTo(oldcxt);
491  if (tupcxt != NULL)
492  MemoryContextDelete(tupcxt);
493 
494  return false;
495 }
496 
497 /*
498  * Callback to clean up the BrinInsertState once all tuple inserts are done.
499  */
500 void
502 {
503  BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
504 
505  Assert(bistate);
506 
507  /*
508  * Clean up the revmap. Note that the brinDesc has already been cleaned up
509  * as part of its own memory context.
510  */
511  brinRevmapTerminate(bistate->bis_rmAccess);
512  bistate->bis_rmAccess = NULL;
513  bistate->bis_desc = NULL;
514 }
515 
516 /*
517  * Initialize state for a BRIN index scan.
518  *
519  * We read the metapage here to determine the pages-per-range number that this
520  * index was built with. Note that since this cannot be changed while we're
521  * holding lock on index, it's not necessary to recompute it during brinrescan.
522  */
523 IndexScanDesc
524 brinbeginscan(Relation r, int nkeys, int norderbys)
525 {
526  IndexScanDesc scan;
527  BrinOpaque *opaque;
528 
529  scan = RelationGetIndexScan(r, nkeys, norderbys);
530 
531  opaque = palloc_object(BrinOpaque);
532  opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange);
533  opaque->bo_bdesc = brin_build_desc(r);
534  scan->opaque = opaque;
535 
536  return scan;
537 }
538 
539 /*
540  * Execute the index scan.
541  *
542  * This works by reading index TIDs from the revmap, and obtaining the index
543  * tuples pointed to by them; the summary values in the index tuples are
544  * compared to the scan keys. We return into the TID bitmap all the pages in
545  * ranges corresponding to index tuples that match the scan keys.
546  *
547  * If a TID from the revmap is read as InvalidTID, we know that range is
548  * unsummarized. Pages in those ranges need to be returned regardless of scan
549  * keys.
550  */
551 int64
552 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
553 {
554  Relation idxRel = scan->indexRelation;
555  Buffer buf = InvalidBuffer;
556  BrinDesc *bdesc;
557  Oid heapOid;
558  Relation heapRel;
559  BrinOpaque *opaque;
560  BlockNumber nblocks;
561  BlockNumber heapBlk;
562  int totalpages = 0;
563  FmgrInfo *consistentFn;
564  MemoryContext oldcxt;
565  MemoryContext perRangeCxt;
566  BrinMemTuple *dtup;
567  BrinTuple *btup = NULL;
568  Size btupsz = 0;
569  ScanKey **keys,
570  **nullkeys;
571  int *nkeys,
572  *nnullkeys;
573  char *ptr;
574  Size len;
575  char *tmp PG_USED_FOR_ASSERTS_ONLY;
576 
577  opaque = (BrinOpaque *) scan->opaque;
578  bdesc = opaque->bo_bdesc;
579  pgstat_count_index_scan(idxRel);
580 
581  /*
582  * We need to know the size of the table so that we know how long to
583  * iterate on the revmap.
584  */
585  heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
586  heapRel = table_open(heapOid, AccessShareLock);
587  nblocks = RelationGetNumberOfBlocks(heapRel);
588  table_close(heapRel, AccessShareLock);
589 
590  /*
591  * Make room for the consistent support procedures of indexed columns. We
592  * don't look them up here; we do that lazily the first time we see a scan
593  * key reference each of them. We rely on zeroing fn_oid to InvalidOid.
594  */
595  consistentFn = palloc0_array(FmgrInfo, bdesc->bd_tupdesc->natts);
596 
597  /*
598  * Make room for per-attribute lists of scan keys that we'll pass to the
599  * consistent support procedure. We don't know which attributes have scan
600  * keys, so we allocate space for all attributes. That may use more memory
601  * but it's probably cheaper than determining which attributes are used.
602  *
603  * We keep null and regular keys separate, so that we can pass just the
604  * regular keys to the consistent function easily.
605  *
606  * To reduce the allocation overhead, we allocate one big chunk and then
607  * carve it into smaller arrays ourselves. All the pieces have exactly the
608  * same lifetime, so that's OK.
609  *
610  * XXX The widest index can have 32 attributes, so the amount of wasted
611  * memory is negligible. We could invent a more compact approach (with
612  * just space for used attributes) but that would make the matching more
613  * complex so it's not a good trade-off.
614  */
615  len =
616  MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* regular keys */
617  MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
618  MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) +
619  MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* NULL keys */
620  MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
621  MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
622 
623  ptr = palloc(len);
624  tmp = ptr;
625 
626  keys = (ScanKey **) ptr;
627  ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
628 
629  nullkeys = (ScanKey **) ptr;
630  ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
631 
632  nkeys = (int *) ptr;
633  ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
634 
635  nnullkeys = (int *) ptr;
636  ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
637 
638  for (int i = 0; i < bdesc->bd_tupdesc->natts; i++)
639  {
640  keys[i] = (ScanKey *) ptr;
641  ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
642 
643  nullkeys[i] = (ScanKey *) ptr;
644  ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
645  }
646 
647  Assert(tmp + len == ptr);
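 /*
  * Illustrative layout (editorial note): with natts = 2 and numberOfKeys = 3,
  * the single palloc'd chunk is carved, in order, into keys[2] and nullkeys[2]
  * (arrays of ScanKey * per attribute), nkeys[2] and nnullkeys[2] (per-attribute
  * counters), and then one array of 3 ScanKey pointers per attribute for
  * keys[i] and nullkeys[i]. The Assert above verifies that the carving
  * consumed exactly 'len' bytes.
  */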
648 
649  /* zero the number of keys */
650  memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
651  memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
652 
653  /* Preprocess the scan keys - split them into per-attribute arrays. */
654  for (int keyno = 0; keyno < scan->numberOfKeys; keyno++)
655  {
656  ScanKey key = &scan->keyData[keyno];
657  AttrNumber keyattno = key->sk_attno;
658 
659  /*
660  * The collation of the scan key must match the collation used in the
661  * index column (but only if the search is not IS NULL/ IS NOT NULL).
662  * Otherwise we shouldn't be using this index ...
663  */
664  Assert((key->sk_flags & SK_ISNULL) ||
665  (key->sk_collation ==
666  TupleDescAttr(bdesc->bd_tupdesc,
667  keyattno - 1)->attcollation));
668 
669  /*
670  * First time we see this index attribute, so init as needed.
671  *
672  * This is a bit of overkill - we don't know how many scan keys there
673  * are for this attribute, so we simply allocate the largest number
674  * possible (as if all keys were for this attribute). This may waste a
675  * bit of memory, but we expect only a small number of scan keys in
676  * general, so this should be negligible, and repeated repalloc calls
677  * are not free either.
678  */
679  if (consistentFn[keyattno - 1].fn_oid == InvalidOid)
680  {
681  FmgrInfo *tmp;
682 
683  /* First time we see this attribute, so no key/null keys. */
684  Assert(nkeys[keyattno - 1] == 0);
685  Assert(nnullkeys[keyattno - 1] == 0);
686 
687  tmp = index_getprocinfo(idxRel, keyattno,
688  BRIN_PROCNUM_CONSISTENT);
689  fmgr_info_copy(&consistentFn[keyattno - 1], tmp,
690  CurrentMemoryContext);
691  }
692 
693  /* Add key to the proper per-attribute array. */
694  if (key->sk_flags & SK_ISNULL)
695  {
696  nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key;
697  nnullkeys[keyattno - 1]++;
698  }
699  else
700  {
701  keys[keyattno - 1][nkeys[keyattno - 1]] = key;
702  nkeys[keyattno - 1]++;
703  }
704  }
705 
706  /* allocate an initial in-memory tuple, out of the per-range memcxt */
707  dtup = brin_new_memtuple(bdesc);
708 
709  /*
710  * Setup and use a per-range memory context, which is reset every time we
711  * loop below. This avoids having to free the tuples within the loop.
712  */
714  "bringetbitmap cxt",
716  oldcxt = MemoryContextSwitchTo(perRangeCxt);
717 
718  /*
719  * Now scan the revmap. We start by querying for heap page 0,
720  * incrementing by the number of pages per range; this gives us a full
721  * view of the table.
722  */
723  for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
724  {
725  bool addrange;
726  bool gottuple = false;
727  BrinTuple *tup;
728  OffsetNumber off;
729  Size size;
730 
732 
733  MemoryContextReset(perRangeCxt);
734 
735  tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
736  &off, &size, BUFFER_LOCK_SHARE);
737  if (tup)
738  {
739  gottuple = true;
740  btup = brin_copy_tuple(tup, size, btup, &btupsz);
741  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
742  }
743 
744  /*
745  * For page ranges with no indexed tuple, we must return the whole
746  * range; otherwise, compare it to the scan keys.
747  */
748  if (!gottuple)
749  {
750  addrange = true;
751  }
752  else
753  {
754  dtup = brin_deform_tuple(bdesc, btup, dtup);
755  if (dtup->bt_placeholder)
756  {
757  /*
758  * Placeholder tuples are always returned, regardless of the
759  * values stored in them.
760  */
761  addrange = true;
762  }
763  else
764  {
765  int attno;
766 
767  /*
768  * Compare scan keys with summary values stored for the range.
769  * If scan keys are matched, the page range must be added to
770  * the bitmap. We initially assume the range needs to be
771  * added; in particular this serves the case where there are
772  * no keys.
773  */
774  addrange = true;
775  for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++)
776  {
777  BrinValues *bval;
778  Datum add;
779  Oid collation;
780 
781  /*
782  * skip attributes without any scan keys (both regular and
783  * IS [NOT] NULL)
784  */
785  if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0)
786  continue;
787 
788  bval = &dtup->bt_columns[attno - 1];
789 
790  /*
791  * If the BRIN tuple indicates that this range is empty,
792  * we can skip it: there's nothing to match. We don't
793  * need to examine the next columns.
794  */
795  if (dtup->bt_empty_range)
796  {
797  addrange = false;
798  break;
799  }
800 
801  /*
802  * First check if there are any IS [NOT] NULL scan keys,
803  * and if we're violating them. In that case we can
804  * terminate early, without invoking the support function.
805  *
806  * As there may be more keys, we can only determine
807  * mismatch within this loop.
808  */
809  if (bdesc->bd_info[attno - 1]->oi_regular_nulls &&
810  !check_null_keys(bval, nullkeys[attno - 1],
811  nnullkeys[attno - 1]))
812  {
813  /*
814  * If any of the IS [NOT] NULL keys failed, the page
815  * range as a whole can't pass. So terminate the loop.
816  */
817  addrange = false;
818  break;
819  }
820 
821  /*
822  * So either there are no IS [NOT] NULL keys, or all
823  * passed. If there are no regular scan keys, we're done -
824  * the page range matches. If there are regular keys, but
825  * the page range is marked as 'all nulls' it can't
826  * possibly pass (we're assuming the operators are
827  * strict).
828  */
829 
830  /* No regular scan keys - page range as a whole passes. */
831  if (!nkeys[attno - 1])
832  continue;
833 
834  Assert((nkeys[attno - 1] > 0) &&
835  (nkeys[attno - 1] <= scan->numberOfKeys));
836 
837  /* If it is all nulls, it cannot possibly be consistent. */
838  if (bval->bv_allnulls)
839  {
840  addrange = false;
841  break;
842  }
843 
844  /*
845  * Collation from the first key (has to be the same for
846  * all keys for the same attribute).
847  */
848  collation = keys[attno - 1][0]->sk_collation;
849 
850  /*
851  * Check whether the scan key is consistent with the page
852  * range values; if so, have the pages in the range added
853  * to the output bitmap.
854  *
855  * The opclass may or may not support processing of
856  * multiple scan keys. We can determine that based on the
857  * number of arguments - functions with extra parameter
858  * (number of scan keys) do support this, otherwise we
859  * have to simply pass the scan keys one by one.
860  */
861  if (consistentFn[attno - 1].fn_nargs >= 4)
862  {
863  /* Check all keys at once */
864  add = FunctionCall4Coll(&consistentFn[attno - 1],
865  collation,
866  PointerGetDatum(bdesc),
867  PointerGetDatum(bval),
868  PointerGetDatum(keys[attno - 1]),
869  Int32GetDatum(nkeys[attno - 1]));
870  addrange = DatumGetBool(add);
871  }
872  else
873  {
874  /*
875  * Check keys one by one
876  *
877  * When there are multiple scan keys, failure to meet
878  * the criteria for a single one of them is enough to
879  * discard the range as a whole, so break out of the
880  * loop as soon as a false return value is obtained.
881  */
882  int keyno;
883 
884  for (keyno = 0; keyno < nkeys[attno - 1]; keyno++)
885  {
886  add = FunctionCall3Coll(&consistentFn[attno - 1],
887  keys[attno - 1][keyno]->sk_collation,
888  PointerGetDatum(bdesc),
889  PointerGetDatum(bval),
890  PointerGetDatum(keys[attno - 1][keyno]));
891  addrange = DatumGetBool(add);
892  if (!addrange)
893  break;
894  }
895  }
896 
897  /*
898  * If we found a scan key eliminating the range, no need
899  * to check additional ones.
900  */
901  if (!addrange)
902  break;
903  }
904  }
905  }
906 
907  /* add the pages in the range to the output bitmap, if needed */
908  if (addrange)
909  {
910  BlockNumber pageno;
911 
912  for (pageno = heapBlk;
913  pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1;
914  pageno++)
915  {
916  MemoryContextSwitchTo(oldcxt);
917  tbm_add_page(tbm, pageno);
918  totalpages++;
919  MemoryContextSwitchTo(perRangeCxt);
920  }
921  }
922  }
923 
924  MemoryContextSwitchTo(oldcxt);
925  MemoryContextDelete(perRangeCxt);
926 
927  if (buf != InvalidBuffer)
928  ReleaseBuffer(buf);
929 
930  /*
931  * XXX We have an approximation of the number of *pages* that our scan
932  * returns, but we don't have a precise idea of the number of heap tuples
933  * involved.
934  */
935  return totalpages * 10;
936 }
937 
938 /*
939  * Re-initialize state for a BRIN index scan
940  */
941 void
942 brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
943  ScanKey orderbys, int norderbys)
944 {
945  /*
946  * Other index AMs preprocess the scan keys at this point, or sometime
947  * early during the scan; this lets them optimize by removing redundant
948  * keys, or doing early returns when they are impossible to satisfy; see
949  * _bt_preprocess_keys for an example. Something like that could be added
950  * here someday, too.
951  */
952 
953  if (scankey && scan->numberOfKeys > 0)
954  memmove(scan->keyData, scankey,
955  scan->numberOfKeys * sizeof(ScanKeyData));
956 }
957 
958 /*
959  * Close down a BRIN index scan
960  */
961 void
962 brinendscan(IndexScanDesc scan)
963 {
964  BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
965 
966  brinRevmapTerminate(opaque->bo_rmAccess);
967  brin_free_desc(opaque->bo_bdesc);
968  pfree(opaque);
969 }
970 
971 /*
972  * Per-heap-tuple callback for table_index_build_scan.
973  *
974  * Note we don't worry about the page range at the end of the table here; it is
975  * present in the build state struct after we're called the last time, but not
976  * inserted into the index. Caller must ensure to do so, if appropriate.
977  */
978 static void
979 brinbuildCallback(Relation index,
980  ItemPointer tid,
981  Datum *values,
982  bool *isnull,
983  bool tupleIsAlive,
984  void *brstate)
985 {
986  BrinBuildState *state = (BrinBuildState *) brstate;
987  BlockNumber thisblock;
988 
989  thisblock = ItemPointerGetBlockNumber(tid);
990 
991  /*
992  * If we're in a block that belongs to a future range, summarize what
993  * we've got and start afresh. Note the scan might have skipped many
994  * pages, if they were devoid of live tuples; make sure to insert index
995  * tuples for those too.
996  */
997  while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
998  {
999 
1000  BRIN_elog((DEBUG2,
1001  "brinbuildCallback: completed a range: %u--%u",
1002  state->bs_currRangeStart,
1003  state->bs_currRangeStart + state->bs_pagesPerRange));
1004 
1005  /* create the index tuple and insert it */
1006  form_and_insert_tuple(state);
1007 
1008  /* set state to correspond to the next range */
1009  state->bs_currRangeStart += state->bs_pagesPerRange;
1010 
1011  /* re-initialize state for it */
1012  brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1013  }
1014 
1015  /* Accumulate the current tuple into the running state */
1016  (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1017  values, isnull);
1018 }
1019 
1020 /*
1021  * Per-heap-tuple callback for table_index_build_scan with parallelism.
1022  *
1023  * A version of the callback used by parallel index builds. The main difference
1024  * is that instead of writing the BRIN tuples into the index, we write them
1025  * into a shared tuplesort, and leave the insertion up to the leader (which may
1026  * reorder them a bit etc.). The callback also does not generate empty ranges,
1027  * those will be added by the leader when merging results from workers.
1028  */
1029 static void
1030 brinbuildCallbackParallel(Relation index,
1031  ItemPointer tid,
1032  Datum *values,
1033  bool *isnull,
1034  bool tupleIsAlive,
1035  void *brstate)
1036 {
1037  BrinBuildState *state = (BrinBuildState *) brstate;
1038  BlockNumber thisblock;
1039 
1040  thisblock = ItemPointerGetBlockNumber(tid);
1041 
1042  /*
1043  * If we're in a block that belongs to a different range, summarize what
1044  * we've got and start afresh. Note the scan might have skipped many
1045  * pages, if they were devoid of live tuples; we do not create empty BRIN
1046  * ranges here - the leader is responsible for filling them in.
1047  *
1048  * Unlike serial builds, parallel index builds allow synchronized seqscans
1049  * (because that's what parallel scans do). This means the block may wrap
1050  * around to the beginning of the relation, so the condition needs to
1051  * check for both future and past ranges.
1052  */
1053  if ((thisblock < state->bs_currRangeStart) ||
1054  (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1))
1055  {
1056 
1057  BRIN_elog((DEBUG2,
1058  "brinbuildCallbackParallel: completed a range: %u--%u",
1059  state->bs_currRangeStart,
1060  state->bs_currRangeStart + state->bs_pagesPerRange));
1061 
1062  /* create the index tuple and write it into the tuplesort */
1063  form_and_spill_tuple(state);
1064 
1065  /*
1066  * Set state to correspond to the next range (for this block).
1067  *
1068  * This skips ranges that are either empty (and so we don't get any
1069  * tuples to summarize), or processed by other workers. We can't
1070  * differentiate those cases here easily, so we leave it up to the
1071  * leader to fill empty ranges where needed.
1072  */
1073  state->bs_currRangeStart
1074  = state->bs_pagesPerRange * (thisblock / state->bs_pagesPerRange);
1075 
1076  /* re-initialize state for it */
1077  brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1078  }
1079 
1080  /* Accumulate the current tuple into the running state */
1081  (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1082  values, isnull);
1083 }
1084 
1085 /*
1086  * brinbuild() -- build a new BRIN index.
1087  */
1088 IndexBuildResult *
1089 brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
1090 {
1091  IndexBuildResult *result;
1092  double reltuples;
1093  double idxtuples;
1094  BrinRevmap *revmap;
1095  BrinBuildState *state;
1096  Buffer meta;
1097  BlockNumber pagesPerRange;
1098 
1099  /*
1100  * We expect to be called exactly once for any index relation.
1101  */
1102  if (RelationGetNumberOfBlocks(index) != 0)
1103  elog(ERROR, "index \"%s\" already contains data",
1104  RelationGetRelationName(index));
1105 
1106  /*
1107  * Critical section not required, because on error the creation of the
1108  * whole relation will be rolled back.
1109  */
1110 
1114 
1117  MarkBufferDirty(meta);
1118 
1119  if (RelationNeedsWAL(index))
1120  {
1121  xl_brin_createidx xlrec;
1122  XLogRecPtr recptr;
1123  Page page;
1124 
1125  xlrec.version = BRIN_CURRENT_VERSION;
1127 
1128  XLogBeginInsert();
1129  XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx);
1131 
1132  recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);
1133 
1134  page = BufferGetPage(meta);
1135  PageSetLSN(page, recptr);
1136  }
1137 
1138  UnlockReleaseBuffer(meta);
1139 
1140  /*
1141  * Initialize our state, including the deformed tuple state.
1142  */
1143  revmap = brinRevmapInitialize(index, &pagesPerRange);
1144  state = initialize_brin_buildstate(index, revmap, pagesPerRange,
1145  RelationGetNumberOfBlocks(heap));
1146 
1147  /*
1148  * Attempt to launch parallel worker scan when required
1149  *
1150  * XXX plan_create_index_workers makes the number of workers dependent on
1151  * maintenance_work_mem, requiring 32MB for each worker. That makes sense
1152  * for btree, but not for BRIN, which can do away with much less memory.
1153  * So maybe make that somehow less strict, optionally?
1154  */
1155  if (indexInfo->ii_ParallelWorkers > 0)
1156  _brin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
1157  indexInfo->ii_ParallelWorkers);
1158 
1159  /*
1160  * If parallel build requested and at least one worker process was
1161  * successfully launched, set up coordination state, wait for workers to
1162  * complete. Then read all tuples from the shared tuplesort and insert
1163  * them into the index.
1164  *
1165  * In serial mode, simply scan the table and build the index one index
1166  * tuple at a time.
1167  */
1168  if (state->bs_leader)
1169  {
1170  SortCoordinate coordinate;
1171 
1172  coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
1173  coordinate->isWorker = false;
1174  coordinate->nParticipants =
1175  state->bs_leader->nparticipanttuplesorts;
1176  coordinate->sharedsort = state->bs_leader->sharedsort;
1177 
1178  /*
1179  * Begin leader tuplesort.
1180  *
1181  * In cases where parallelism is involved, the leader receives the
1182  * same share of maintenance_work_mem as a serial sort (it is
1183  * generally treated in the same way as a serial sort once we return).
1184  * Parallel worker Tuplesortstates will have received only a fraction
1185  * of maintenance_work_mem, though.
1186  *
1187  * We rely on the lifetime of the Leader Tuplesortstate almost not
1188  * overlapping with any worker Tuplesortstate's lifetime. There may
1189  * be some small overlap, but that's okay because we rely on leader
1190  * Tuplesortstate only allocating a small, fixed amount of memory
1191  * here. When its tuplesort_performsort() is called (by our caller),
1192  * and significant amounts of memory are likely to be used, all
1193  * workers must have already freed almost all memory held by their
1194  * Tuplesortstates (they are about to go away completely, too). The
1195  * overall effect is that maintenance_work_mem always represents an
1196  * absolute high watermark on the amount of memory used by a CREATE
1197  * INDEX operation, regardless of the use of parallelism or any other
1198  * factor.
1199  */
1200  state->bs_sortstate =
1201  tuplesort_begin_index_brin(maintenance_work_mem, coordinate,
1202  TUPLESORT_NONE);
1203 
1204  _brin_end_parallel(state->bs_leader, state);
1205  }
1206  else /* no parallel index build */
1207  {
1208  /*
1209  * Now scan the relation. No syncscan allowed here because we want
1210  * the heap blocks in physical order (we want to produce the ranges
1211  * starting from block 0, and the callback also relies on this to not
1212  * generate summary for the same range twice).
1213  */
1214  reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
1215  brinbuildCallback, (void *) state, NULL);
1216 
1217  /*
1218  * process the final batch
1219  *
1220  * XXX Note this does not update state->bs_currRangeStart, i.e. it
1221  * stays set to the last range added to the index. This is OK, because
1222  * that's what brin_fill_empty_ranges expects.
1223  */
1224  form_and_insert_tuple(state);
1225 
1226  /*
1227  * Backfill the final ranges with empty data.
1228  *
1229  * This saves us from doing what amounts to full table scans when the
1230  * index is used with a predicate like WHERE (nonnull_column IS NULL),
1231  * or with other very selective predicates.
1232  */
1233  brin_fill_empty_ranges(state,
1234  state->bs_currRangeStart,
1235  state->bs_maxRangeStart);
1236 
1237  /* track the number of relation tuples */
1238  state->bs_reltuples = reltuples;
1239  }
1240 
1241  /* release resources */
1242  idxtuples = state->bs_numtuples;
1243  reltuples = state->bs_reltuples;
1244  brinRevmapTerminate(state->bs_rmAccess);
1245  terminate_brin_buildstate(state);
1246 
1247  /*
1248  * Return statistics
1249  */
1250  result = palloc_object(IndexBuildResult);
1251 
1252  result->heap_tuples = reltuples;
1253  result->index_tuples = idxtuples;
1254 
1255  return result;
1256 }
1257 
1258 void
1259 brinbuildempty(Relation index)
1260 {
1261  Buffer metabuf;
1262 
1263  /* An empty BRIN index has a metapage only. */
1264  metabuf = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
1266 
1267  /* Initialize and xlog metabuffer. */
1271  MarkBufferDirty(metabuf);
1272  log_newpage_buffer(metabuf, true);
1273  END_CRIT_SECTION();
1274 
1275  UnlockReleaseBuffer(metabuf);
1276 }
1277 
1278 /*
1279  * brinbulkdelete
1280  * Since there are no per-heap-tuple index tuples in BRIN indexes,
1281  * there's not a lot we can do here.
1282  *
1283  * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
1284  * tuple is deleted), meaning the need to re-run summarization on the affected
1285  * range. Would need to add an extra flag in brintuples for that.
1286  */
1287 IndexBulkDeleteResult *
1288 brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1289  IndexBulkDeleteCallback callback, void *callback_state)
1290 {
1291  /* allocate stats if first time through, else re-use existing struct */
1292  if (stats == NULL)
1293  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1294 
1295  return stats;
1296 }
1297 
1298 /*
1299  * This routine is in charge of "vacuuming" a BRIN index: we just summarize
1300  * ranges that are currently unsummarized.
1301  */
1302 IndexBulkDeleteResult *
1303 brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1304 {
1305  Relation heapRel;
1306 
1307  /* No-op in ANALYZE ONLY mode */
1308  if (info->analyze_only)
1309  return stats;
1310 
1311  if (!stats)
1312  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1313  stats->num_pages = RelationGetNumberOfBlocks(info->index);
1314  /* rest of stats is initialized by zeroing */
1315 
1316  heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false),
1317  AccessShareLock);
1318 
1319  brin_vacuum_scan(info->index, info->strategy);
1320 
1321  brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,
1322  &stats->num_index_tuples, &stats->num_index_tuples);
1323 
1324  table_close(heapRel, AccessShareLock);
1325 
1326  return stats;
1327 }
1328 
1329 /*
1330  * reloptions processor for BRIN indexes
1331  */
1332 bytea *
1333 brinoptions(Datum reloptions, bool validate)
1334 {
1335  static const relopt_parse_elt tab[] = {
1336  {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
1337  {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
1338  };
1339 
1340  return (bytea *) build_reloptions(reloptions, validate,
1341  RELOPT_KIND_BRIN,
1342  sizeof(BrinOptions),
1343  tab, lengthof(tab));
1344 }
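/*
 * Illustrative usage (object names are made up): these are the two reloptions
 * a BRIN index accepts, e.g.
 *
 *     CREATE INDEX brin_ts_idx ON measurements USING brin (captured_at)
 *         WITH (pages_per_range = 32, autosummarize = on);
 *
 * The parsed values are read back through the BrinGetPagesPerRange() and
 * BrinGetAutoSummarize() macros defined alongside BrinOptions in
 * access/brin.h.
 */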
1345 
1346 /*
1347  * SQL-callable function to scan through an index and summarize all ranges
1348  * that are not currently summarized.
1349  */
1350 Datum
1351 brin_summarize_new_values(PG_FUNCTION_ARGS)
1352 {
1353  Datum relation = PG_GETARG_DATUM(0);
1354 
1355  return DirectFunctionCall2(brin_summarize_range,
1356  relation,
1357  Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));
1358 }
1359 
1360 /*
1361  * SQL-callable function to summarize the indicated page range, if not already
1362  * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all
1363  * unsummarized ranges are summarized.
1364  */
1365 Datum
1366 brin_summarize_range(PG_FUNCTION_ARGS)
1367 {
1368  Oid indexoid = PG_GETARG_OID(0);
1369  int64 heapBlk64 = PG_GETARG_INT64(1);
1370  BlockNumber heapBlk;
1371  Oid heapoid;
1372  Relation indexRel;
1373  Relation heapRel;
1374  Oid save_userid;
1375  int save_sec_context;
1376  int save_nestlevel;
1377  double numSummarized = 0;
1378 
1379  if (RecoveryInProgress())
1380  ereport(ERROR,
1381  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1382  errmsg("recovery is in progress"),
1383  errhint("BRIN control functions cannot be executed during recovery.")));
1384 
1385  if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)
1386  ereport(ERROR,
1387  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1388  errmsg("block number out of range: %lld",
1389  (long long) heapBlk64)));
1390  heapBlk = (BlockNumber) heapBlk64;
1391 
1392  /*
1393  * We must lock table before index to avoid deadlocks. However, if the
1394  * passed indexoid isn't an index then IndexGetRelation() will fail.
1395  * Rather than emitting a not-very-helpful error message, postpone
1396  * complaining, expecting that the is-it-an-index test below will fail.
1397  */
1398  heapoid = IndexGetRelation(indexoid, true);
1399  if (OidIsValid(heapoid))
1400  {
1401  heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
1402 
1403  /*
1404  * Autovacuum calls us. For its benefit, switch to the table owner's
1405  * userid, so that any index functions are run as that user. Also
1406  * lock down security-restricted operations and arrange to make GUC
1407  * variable changes local to this command. This is harmless, albeit
1408  * unnecessary, when called from SQL, because we fail shortly if the
1409  * user does not own the index.
1410  */
1411  GetUserIdAndSecContext(&save_userid, &save_sec_context);
1412  SetUserIdAndSecContext(heapRel->rd_rel->relowner,
1413  save_sec_context | SECURITY_RESTRICTED_OPERATION);
1414  save_nestlevel = NewGUCNestLevel();
1416  }
1417  else
1418  {
1419  heapRel = NULL;
1420  /* Set these just to suppress "uninitialized variable" warnings */
1421  save_userid = InvalidOid;
1422  save_sec_context = -1;
1423  save_nestlevel = -1;
1424  }
1425 
1426  indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
1427 
1428  /* Must be a BRIN index */
1429  if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
1430  indexRel->rd_rel->relam != BRIN_AM_OID)
1431  ereport(ERROR,
1432  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1433  errmsg("\"%s\" is not a BRIN index",
1434  RelationGetRelationName(indexRel))));
1435 
1436  /* User must own the index (comparable to privileges needed for VACUUM) */
1437  if (heapRel != NULL && !object_ownercheck(RelationRelationId, indexoid, save_userid))
1438  aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
1439  RelationGetRelationName(indexRel));
1440 
1441  /*
1442  * Since we did the IndexGetRelation call above without any lock, it's
1443  * barely possible that a race against an index drop/recreation could have
1444  * netted us the wrong table. Recheck.
1445  */
1446  if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
1447  ereport(ERROR,
1448  (errcode(ERRCODE_UNDEFINED_TABLE),
1449  errmsg("could not open parent table of index \"%s\"",
1450  RelationGetRelationName(indexRel))));
1451 
1452  /* see gin_clean_pending_list() */
1453  if (indexRel->rd_index->indisvalid)
1454  brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL);
1455  else
1456  ereport(DEBUG1,
1457  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1458  errmsg("index \"%s\" is not valid",
1459  RelationGetRelationName(indexRel))));
1460 
1461  /* Roll back any GUC changes executed by index functions */
1462  AtEOXact_GUC(false, save_nestlevel);
1463 
1464  /* Restore userid and security context */
1465  SetUserIdAndSecContext(save_userid, save_sec_context);
1466 
1467  relation_close(indexRel, ShareUpdateExclusiveLock);
1468  relation_close(heapRel, ShareUpdateExclusiveLock);
1469 
1470  PG_RETURN_INT32((int32) numSummarized);
1471 }
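/*
 * Illustrative usage from SQL (the index name is made up):
 *
 *     SELECT brin_summarize_new_values('brin_ts_idx');  -- all unsummarized ranges
 *     SELECT brin_summarize_range('brin_ts_idx', 0);    -- only the range holding block 0
 *
 * Both return the number of page ranges that were newly summarized.
 */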
1472 
1473 /*
1474  * SQL-callable interface to mark a range as no longer summarized
1475  */
1476 Datum
1477 brin_desummarize_range(PG_FUNCTION_ARGS)
1478 {
1479  Oid indexoid = PG_GETARG_OID(0);
1480  int64 heapBlk64 = PG_GETARG_INT64(1);
1481  BlockNumber heapBlk;
1482  Oid heapoid;
1483  Relation heapRel;
1484  Relation indexRel;
1485  bool done;
1486 
1487  if (RecoveryInProgress())
1488  ereport(ERROR,
1489  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1490  errmsg("recovery is in progress"),
1491  errhint("BRIN control functions cannot be executed during recovery.")));
1492 
1493  if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)
1494  ereport(ERROR,
1495  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1496  errmsg("block number out of range: %lld",
1497  (long long) heapBlk64)));
1498  heapBlk = (BlockNumber) heapBlk64;
1499 
1500  /*
1501  * We must lock table before index to avoid deadlocks. However, if the
1502  * passed indexoid isn't an index then IndexGetRelation() will fail.
1503  * Rather than emitting a not-very-helpful error message, postpone
1504  * complaining, expecting that the is-it-an-index test below will fail.
1505  *
1506  * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we
1507  * don't switch userid.
1508  */
1509  heapoid = IndexGetRelation(indexoid, true);
1510  if (OidIsValid(heapoid))
1511  heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
1512  else
1513  heapRel = NULL;
1514 
1515  indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
1516 
1517  /* Must be a BRIN index */
1518  if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
1519  indexRel->rd_rel->relam != BRIN_AM_OID)
1520  ereport(ERROR,
1521  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1522  errmsg("\"%s\" is not a BRIN index",
1523  RelationGetRelationName(indexRel))));
1524 
1525  /* User must own the index (comparable to privileges needed for VACUUM) */
1526  if (!object_ownercheck(RelationRelationId, indexoid, GetUserId()))
1527  aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
1528  RelationGetRelationName(indexRel));
1529 
1530  /*
1531  * Since we did the IndexGetRelation call above without any lock, it's
1532  * barely possible that a race against an index drop/recreation could have
1533  * netted us the wrong table. Recheck.
1534  */
1535  if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
1536  ereport(ERROR,
1537  (errcode(ERRCODE_UNDEFINED_TABLE),
1538  errmsg("could not open parent table of index \"%s\"",
1539  RelationGetRelationName(indexRel))));
1540 
1541  /* see gin_clean_pending_list() */
1542  if (indexRel->rd_index->indisvalid)
1543  {
1544  /* the revmap does the hard work */
1545  do
1546  {
1547  done = brinRevmapDesummarizeRange(indexRel, heapBlk);
1548  }
1549  while (!done);
1550  }
1551  else
1552  ereport(DEBUG1,
1553  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1554  errmsg("index \"%s\" is not valid",
1555  RelationGetRelationName(indexRel))));
1556 
1557  relation_close(indexRel, ShareUpdateExclusiveLock);
1558  relation_close(heapRel, ShareUpdateExclusiveLock);
1559 
1560  PG_RETURN_VOID();
1561 }
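/*
 * Illustrative usage (the index name is made up); pass a block number inside
 * the range that should be returned to the unsummarized state:
 *
 *     SELECT brin_desummarize_range('brin_ts_idx', 256);
 */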
1562 
1563 /*
1564  * Build a BrinDesc used to create or scan a BRIN index
1565  */
1566 BrinDesc *
1567 brin_build_desc(Relation rel)
1568 {
1569  BrinOpcInfo **opcinfo;
1570  BrinDesc *bdesc;
1571  TupleDesc tupdesc;
1572  int totalstored = 0;
1573  int keyno;
1574  long totalsize;
1575  MemoryContext cxt;
1576  MemoryContext oldcxt;
1577 
1579  "brin desc cxt",
1581  oldcxt = MemoryContextSwitchTo(cxt);
1582  tupdesc = RelationGetDescr(rel);
1583 
1584  /*
1585  * Obtain BrinOpcInfo for each indexed column. While at it, accumulate
1586  * the number of columns stored, since the number is opclass-defined.
1587  */
1588  opcinfo = palloc_array(BrinOpcInfo *, tupdesc->natts);
1589  for (keyno = 0; keyno < tupdesc->natts; keyno++)
1590  {
1591  FmgrInfo *opcInfoFn;
1592  Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno);
1593 
1594  opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
1595 
1596  opcinfo[keyno] = (BrinOpcInfo *)
1597  DatumGetPointer(FunctionCall1(opcInfoFn, attr->atttypid));
1598  totalstored += opcinfo[keyno]->oi_nstored;
1599  }
1600 
1601  /* Allocate our result struct and fill it in */
1602  totalsize = offsetof(BrinDesc, bd_info) +
1603  sizeof(BrinOpcInfo *) * tupdesc->natts;
1604 
1605  bdesc = palloc(totalsize);
1606  bdesc->bd_context = cxt;
1607  bdesc->bd_index = rel;
1608  bdesc->bd_tupdesc = tupdesc;
1609  bdesc->bd_disktdesc = NULL; /* generated lazily */
1610  bdesc->bd_totalstored = totalstored;
1611 
1612  for (keyno = 0; keyno < tupdesc->natts; keyno++)
1613  bdesc->bd_info[keyno] = opcinfo[keyno];
1614  pfree(opcinfo);
1615 
1616  MemoryContextSwitchTo(oldcxt);
1617 
1618  return bdesc;
1619 }
1620 
1621 void
1622 brin_free_desc(BrinDesc *bdesc)
1623 {
1624  /* make sure the tupdesc is still valid */
1625  Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
1626  /* no need for retail pfree */
1627  MemoryContextDelete(bdesc->bd_context);
1628 }
1629 
1630 /*
1631  * Fetch index's statistical data into *stats
1632  */
1633 void
1634 brinGetStats(Relation index, BrinStatsData *stats)
1635 {
1636  Buffer metabuffer;
1637  Page metapage;
1638  BrinMetaPageData *metadata;
1639 
1640  metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
1641  LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
1642  metapage = BufferGetPage(metabuffer);
1643  metadata = (BrinMetaPageData *) PageGetContents(metapage);
1644 
1645  stats->pagesPerRange = metadata->pagesPerRange;
1646  stats->revmapNumPages = metadata->lastRevmapPage - 1;
1647 
1648  UnlockReleaseBuffer(metabuffer);
1649 }
1650 
1651 /*
1652  * Initialize a BrinBuildState appropriate to create tuples on the given index.
1653  */
1654 static BrinBuildState *
1655 initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
1656  BlockNumber pagesPerRange, BlockNumber tablePages)
1657 {
1658  BrinBuildState *state;
1659  BlockNumber lastRange = 0;
1660 
1661  state = palloc_object(BrinBuildState);
1662 
1663  state->bs_irel = idxRel;
1664  state->bs_numtuples = 0;
1665  state->bs_reltuples = 0;
1666  state->bs_currentInsertBuf = InvalidBuffer;
1667  state->bs_pagesPerRange = pagesPerRange;
1668  state->bs_currRangeStart = 0;
1669  state->bs_rmAccess = revmap;
1670  state->bs_bdesc = brin_build_desc(idxRel);
1671  state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
1672  state->bs_leader = NULL;
1673  state->bs_worker_id = 0;
1674  state->bs_sortstate = NULL;
1678 
1679  /* Remember the memory context to use for an empty tuple, if needed. */
1680  state->bs_context = CurrentMemoryContext;
1681  state->bs_emptyTuple = NULL;
1682  state->bs_emptyTupleLen = 0;
1683 
1684  /*
1685  * Calculate the start of the last page range. Page numbers are 0-based,
1686  * so to calculate the index we need to subtract one. The integer division
1687  * gives us the index of the page range.
1688  */
1689  if (tablePages > 0)
1690  lastRange = ((tablePages - 1) / pagesPerRange) * pagesPerRange;
1691 
1692  /* Now calculate the start of the next range. */
1693  state->bs_maxRangeStart = lastRange + state->bs_pagesPerRange;
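 /*
  * Worked example (illustrative): with tablePages = 1000 and pagesPerRange =
  * 128, lastRange = ((1000 - 1) / 128) * 128 = 896, so bs_maxRangeStart is
  * 896 + 128 = 1024, the first block past the end of the table.
  */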
1694 
1695  return state;
1696 }
1697 
1698 /*
1699  * Release resources associated with a BrinBuildState.
1700  */
1701 static void
1702 terminate_brin_buildstate(BrinBuildState *state)
1703 {
1704  /*
1705  * Release the last index buffer used. We might as well ensure that
1706  * whatever free space remains in that page is available in FSM, too.
1707  */
1708  if (!BufferIsInvalid(state->bs_currentInsertBuf))
1709  {
1710  Page page;
1711  Size freespace;
1712  BlockNumber blk;
1713 
1714  page = BufferGetPage(state->bs_currentInsertBuf);
1715  freespace = PageGetFreeSpace(page);
1716  blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
1717  ReleaseBuffer(state->bs_currentInsertBuf);
1718  RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
1719  FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
1720  }
1721 
1722  brin_free_desc(state->bs_bdesc);
1723  pfree(state->bs_dtuple);
1724  pfree(state);
1725 }
1726 
1727 /*
1728  * On the given BRIN index, summarize the heap page range that corresponds
1729  * to the heap block number given.
1730  *
1731  * This routine can run in parallel with insertions into the heap. To avoid
1732  * missing those values from the summary tuple, we first insert a placeholder
1733  * index tuple into the index, then execute the heap scan; transactions
1734  * concurrent with the scan update the placeholder tuple. After the scan, we
1735  * union the placeholder tuple with the one computed by this routine. The
1736  * update of the index value happens in a loop, so that if somebody updates
1737  * the placeholder tuple after we read it, we detect the case and try again.
1738  * This ensures that the concurrently inserted tuples are not lost.
1739  *
1740  * A further corner case is this routine being asked to summarize the partial
1741  * range at the end of the table. heapNumBlocks is the (possibly outdated)
1742  * table size; if we notice that the requested range lies beyond that size,
1743  * we re-compute the table size after inserting the placeholder tuple, to
1744  * avoid missing pages that were appended recently.
1745  */
1746 static void
1747 summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
1748  BlockNumber heapBlk, BlockNumber heapNumBlks)
1749 {
1750  Buffer phbuf;
1751  BrinTuple *phtup;
1752  Size phsz;
1753  OffsetNumber offset;
1754  BlockNumber scanNumBlks;
1755 
1756  /*
1757  * Insert the placeholder tuple
1758  */
1759  phbuf = InvalidBuffer;
1760  phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
1761  offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
1762  state->bs_rmAccess, &phbuf,
1763  heapBlk, phtup, phsz);
1764 
1765  /*
1766  * Compute range end. We hold ShareUpdateExclusive lock on table, so it
1767  * cannot shrink concurrently (but it can grow).
1768  */
1769  Assert(heapBlk % state->bs_pagesPerRange == 0);
1770  if (heapBlk + state->bs_pagesPerRange > heapNumBlks)
1771  {
1772  /*
1773  * If we're asked to scan what we believe to be the final range on the
1774  * table (i.e. a range that might be partial) we need to recompute our
1775  * idea of what the latest page is after inserting the placeholder
1776  * tuple. Anyone that grows the table later will update the
1777  * placeholder tuple, so it doesn't matter that we won't scan these
1778  * pages ourselves. Careful: the table might have been extended
1779  * beyond the current range, so clamp our result.
1780  *
1781  * Fortunately, this should occur infrequently.
1782  */
1783  scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,
1784  state->bs_pagesPerRange);
1785  }
1786  else
1787  {
1788  /* Easy case: range is known to be complete */
1789  scanNumBlks = state->bs_pagesPerRange;
1790  }
1791 
1792  /*
1793  * Execute the partial heap scan covering the heap blocks in the specified
1794  * page range, summarizing the heap tuples in it. This scan stops just
1795  * short of brinbuildCallback creating the new index entry.
1796  *
1797  * Note that it is critical we use the "any visible" mode of
1798  * table_index_build_range_scan here: otherwise, we would miss tuples
1799  * inserted by transactions that are still in progress, among other corner
1800  * cases.
1801  */
1802  state->bs_currRangeStart = heapBlk;
1803  table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false,
1804  heapBlk, scanNumBlks,
1805  brinbuildCallback, (void *) state, NULL);
1806 
1807  /*
1808  * Now we update the values obtained by the scan with the placeholder
1809  * tuple. We do this in a loop which only terminates if we're able to
1810  * update the placeholder tuple successfully; if we are not, this means
1811  * somebody else modified the placeholder tuple after we read it.
1812  */
1813  for (;;)
1814  {
1815  BrinTuple *newtup;
1816  Size newsize;
1817  bool didupdate;
1818  bool samepage;
1819 
1821 
1822  /*
1823  * Update the summary tuple and try to update.
1824  */
1825  newtup = brin_form_tuple(state->bs_bdesc,
1826  heapBlk, state->bs_dtuple, &newsize);
1827  samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
1828  didupdate =
1829  brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
1830  state->bs_rmAccess, heapBlk, phbuf, offset,
1831  phtup, phsz, newtup, newsize, samepage);
1832  brin_free_tuple(phtup);
1833  brin_free_tuple(newtup);
1834 
1835  /* If the update succeeded, we're done. */
1836  if (didupdate)
1837  break;
1838 
1839  /*
1840  * If the update didn't work, it might be because somebody updated the
1841  * placeholder tuple concurrently. Extract the new version, union it
1842  * with the values we have from the scan, and start over. (There are
1843  * other reasons for the update to fail, but it's simple to treat them
1844  * the same.)
1845  */
1846  phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
1847  &offset, &phsz, BUFFER_LOCK_SHARE);
1848  /* the placeholder tuple must exist */
1849  if (phtup == NULL)
1850  elog(ERROR, "missing placeholder tuple");
1851  phtup = brin_copy_tuple(phtup, phsz, NULL, NULL);
1852  LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
1853 
1854  /* merge it into the tuple from the heap scan */
1855  union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
1856  }
1857 
1858  ReleaseBuffer(phbuf);
1859 }
1860 
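/*
 * To illustrate the retry protocol above with a hypothetical interleaving
 * (invented timeline, assuming the default pagesPerRange of 128): backend A
 * inserts the placeholder for blocks [0,128) and starts scanning them into a
 * summary.  Meanwhile backend B inserts a heap tuple into block 5 and, finding
 * the placeholder in the revmap, folds its value into the placeholder tuple.
 * A's first brin_doupdate() then fails because the placeholder changed; A
 * re-reads it, unions it into its summary, and the second update succeeds, so
 * B's value is not lost.
 */
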
1861 /*
1862  * Summarize page ranges that are not already summarized. If pageRange is
1863  * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
1864  * page range containing the given heap page number is scanned.
1865  * If include_partial is true, then the partial range at the end of the table
1866  * is summarized, otherwise not.
1867  *
1868  * For each new index tuple inserted, *numSummarized (if not NULL) is
1869  * incremented; for each existing tuple, *numExisting (if not NULL) is
1870  * incremented.
1871  */
1872 static void
1873 brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
1874  bool include_partial, double *numSummarized, double *numExisting)
1875 {
1876  BrinRevmap *revmap;
1877  BrinBuildState *state = NULL;
1878  IndexInfo *indexInfo = NULL;
1879  BlockNumber heapNumBlocks;
1880  BlockNumber pagesPerRange;
1881  Buffer buf;
1882  BlockNumber startBlk;
1883 
1884  revmap = brinRevmapInitialize(index, &pagesPerRange);
1885 
1886  /* determine range of pages to process */
1887  heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
1888  if (pageRange == BRIN_ALL_BLOCKRANGES)
1889  startBlk = 0;
1890  else
1891  {
1892  startBlk = (pageRange / pagesPerRange) * pagesPerRange;
1893  heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange);
1894  }
1895  if (startBlk > heapNumBlocks)
1896  {
1897  /* Nothing to do if start point is beyond end of table */
1898  brinRevmapTerminate(revmap);
1899  return;
1900  }
1901 
1902  /*
1903  * Scan the revmap to find unsummarized items.
1904  */
1905  buf = InvalidBuffer;
1906  for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)
1907  {
1908  BrinTuple *tup;
1909  OffsetNumber off;
1910 
1911  /*
1912  * Unless requested to summarize even a partial range, go away now if
1913  * we think the next range is partial. Callers pass true when summarization
1914  * is typically run once bulk data loading is done
1915  * (brin_summarize_new_values), and false when it is typically the
1916  * result of an arbitrarily-scheduled maintenance command (vacuuming).
1917  */
1918  if (!include_partial &&
1919  (startBlk + pagesPerRange > heapNumBlocks))
1920  break;
1921 
1922  CHECK_FOR_INTERRUPTS();
1923 
1924  tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL,
1925  BUFFER_LOCK_SHARE);
1926  if (tup == NULL)
1927  {
1928  /* no revmap entry for this heap range. Summarize it. */
1929  if (state == NULL)
1930  {
1931  /* first time through */
1932  Assert(!indexInfo);
1933  state = initialize_brin_buildstate(index, revmap,
1934  pagesPerRange,
1935  InvalidBlockNumber);
1936  indexInfo = BuildIndexInfo(index);
1937  }
1938  summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);
1939 
1940  /* and re-initialize state for the next range */
1941  brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1942 
1943  if (numSummarized)
1944  *numSummarized += 1.0;
1945  }
1946  else
1947  {
1948  if (numExisting)
1949  *numExisting += 1.0;
1950  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1951  }
1952  }
1953 
1954  if (BufferIsValid(buf))
1955  ReleaseBuffer(buf);
1956 
1957  /* free resources */
1958  brinRevmapTerminate(revmap);
1959  if (state)
1960  {
1961  terminate_brin_buildstate(state);
1962  pfree(indexInfo);
1963  }
1964 }
1965 
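/*
 * A worked example of the range arithmetic above (standalone illustration,
 * assuming the default pagesPerRange of 128):
 *
 *     BlockNumber pagesPerRange = 128;
 *     BlockNumber pageRange = 300;
 *     BlockNumber startBlk = (pageRange / pagesPerRange) * pagesPerRange;
 *     // startBlk == 256; heapNumBlocks is then clamped to Min(heapNumBlocks, 384)
 *
 * so only the single range starting at block 256 is considered.  With
 * pageRange = BRIN_ALL_BLOCKRANGES the loop instead walks every range from
 * block 0 to the end of the table.
 */
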
1966 /*
1967  * Given a deformed tuple in the build state, convert it into the on-disk
1968  * format and insert it into the index, making the revmap point to it.
1969  */
1970 static void
1971 form_and_insert_tuple(BrinBuildState *state)
1972 {
1973  BrinTuple *tup;
1974  Size size;
1975 
1976  tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
1977  state->bs_dtuple, &size);
1978  brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
1979  &state->bs_currentInsertBuf, state->bs_currRangeStart,
1980  tup, size);
1981  state->bs_numtuples++;
1982 
1983  pfree(tup);
1984 }
1985 
1986 /*
1987  * Given a deformed tuple in the build state, convert it into the on-disk
1988  * format and write it to a (shared) tuplesort (the leader will insert it
1989  * into the index later).
1990  */
1991 static void
1992 form_and_spill_tuple(BrinBuildState *state)
1993 {
1994  BrinTuple *tup;
1995  Size size;
1996 
1997  /* don't insert empty tuples in parallel build */
1998  if (state->bs_dtuple->bt_empty_range)
1999  return;
2000 
2001  tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
2002  state->bs_dtuple, &size);
2003 
2004  /* write the BRIN tuple to the tuplesort */
2005  tuplesort_putbrintuple(state->bs_sortstate, tup, size);
2006 
2007  state->bs_numtuples++;
2008 
2009  pfree(tup);
2010 }
2011 
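/*
 * The two helpers above mark the split between the serial and the parallel
 * build paths at this level: a serial build writes each summary straight into
 * the index (form_and_insert_tuple), whereas a parallel worker spills its
 * summaries into a shared tuplesort sorted by block number, and the leader
 * merges and inserts them later in _brin_end_parallel().
 */
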
2012 /*
2013  * Given two deformed tuples, adjust the first one so that it's consistent
2014  * with the summary values in both.
2015  */
2016 static void
2017 union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
2018 {
2019  int keyno;
2020  BrinMemTuple *db;
2021  MemoryContext cxt;
2022  MemoryContext oldcxt;
2023 
2024  /* Use our own memory context to avoid retail pfree */
2025  cxt = AllocSetContextCreate(CurrentMemoryContext,
2026  "brin union",
2027  ALLOCSET_DEFAULT_SIZES);
2028  oldcxt = MemoryContextSwitchTo(cxt);
2029  db = brin_deform_tuple(bdesc, b, NULL);
2030  MemoryContextSwitchTo(oldcxt);
2031 
2032  /*
2033  * Check if the ranges are empty.
2034  *
2035  * If at least one of them is empty, we don't need to call per-key union
2036  * functions at all. If "b" is empty, we just use "a" as the result (it
2037  * might be empty as well, but that's fine). If "a" is empty but "b" is not,
2038  * we use "b" as the result (but we have to copy the data into "a" first).
2039  *
2040  * Only when both ranges are non-empty, we actually do the per-key merge.
2041  */
2042 
2043  /* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */
2044  if (db->bt_empty_range)
2045  {
2046  /* skip the per-key merge */
2047  MemoryContextDelete(cxt);
2048  return;
2049  }
2050 
2051  /*
2052  * Now we know "b" is not empty. If "a" is empty, then "b" is the result.
2053  * But we need to copy the data from "b" to "a" first, because that's how
2054  * we pass result out.
2055  *
2056  * We have to copy all the global/per-key flags etc. too.
2057  */
2058  if (a->bt_empty_range)
2059  {
2060  for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2061  {
2062  int i;
2063  BrinValues *col_a = &a->bt_columns[keyno];
2064  BrinValues *col_b = &db->bt_columns[keyno];
2065  BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2066 
2067  col_a->bv_allnulls = col_b->bv_allnulls;
2068  col_a->bv_hasnulls = col_b->bv_hasnulls;
2069 
2070  /* If "b" has no data, we're done. */
2071  if (col_b->bv_allnulls)
2072  continue;
2073 
2074  for (i = 0; i < opcinfo->oi_nstored; i++)
2075  col_a->bv_values[i] =
2076  datumCopy(col_b->bv_values[i],
2077  opcinfo->oi_typcache[i]->typbyval,
2078  opcinfo->oi_typcache[i]->typlen);
2079  }
2080 
2081  /* "a" started empty, but "b" was not empty, so remember that */
2082  a->bt_empty_range = false;
2083 
2084  /* skip the per-key merge */
2085  MemoryContextDelete(cxt);
2086  return;
2087  }
2088 
2089  /* Now we know neither range is empty. */
2090  for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2091  {
2092  FmgrInfo *unionFn;
2093  BrinValues *col_a = &a->bt_columns[keyno];
2094  BrinValues *col_b = &db->bt_columns[keyno];
2095  BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2096 
2097  if (opcinfo->oi_regular_nulls)
2098  {
2099  /* Does the "b" summary represent any NULL values? */
2100  bool b_has_nulls = (col_b->bv_hasnulls || col_b->bv_allnulls);
2101 
2102  /* Adjust "hasnulls". */
2103  if (!col_a->bv_allnulls && b_has_nulls)
2104  col_a->bv_hasnulls = true;
2105 
2106  /* If there are no values in B, there's nothing left to do. */
2107  if (col_b->bv_allnulls)
2108  continue;
2109 
2110  /*
2111  * Adjust "allnulls". If A doesn't have values, just copy the
2112  * values from B into A, and we're done. We cannot run the
2113  * operators in this case, because values in A might contain
2114  * garbage. Note we already established that B contains values.
2115  *
2116  * Also adjust "hasnulls" in order not to forget the summary
2117  * represents NULL values. This is not redundant with the earlier
2118  * update, because that only happens when allnulls=false.
2119  */
2120  if (col_a->bv_allnulls)
2121  {
2122  int i;
2123 
2124  col_a->bv_allnulls = false;
2125  col_a->bv_hasnulls = true;
2126 
2127  for (i = 0; i < opcinfo->oi_nstored; i++)
2128  col_a->bv_values[i] =
2129  datumCopy(col_b->bv_values[i],
2130  opcinfo->oi_typcache[i]->typbyval,
2131  opcinfo->oi_typcache[i]->typlen);
2132 
2133  continue;
2134  }
2135  }
2136 
2137  unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
2138  BRIN_PROCNUM_UNION);
2139  FunctionCall3Coll(unionFn,
2140  bdesc->bd_index->rd_indcollation[keyno],
2141  PointerGetDatum(bdesc),
2142  PointerGetDatum(col_a),
2143  PointerGetDatum(col_b));
2144  }
2145 
2146  MemoryContextDelete(cxt);
2147 }
2148 
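/*
 * As a hypothetical example of the flag handling above, for a minmax-style
 * summary: unioning a = [10, 20] (hasnulls=false) with b = [15, 40]
 * (hasnulls=true) leaves a = [10, 40] with hasnulls=true; if b were instead
 * marked allnulls, only a's null flags would be adjusted and the per-key
 * union function would not be called for that column.
 */
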
2149 /*
2150  * brin_vacuum_scan
2151  * Do a complete scan of the index during VACUUM.
2152  *
2153  * This routine scans the complete index looking for uncataloged index pages,
2154  * i.e. those that might have been lost due to a crash after index extension
2155  * and such.
2156  */
2157 static void
2158 brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
2159 {
2160  BlockNumber nblocks;
2161  BlockNumber blkno;
2162 
2163  /*
2164  * Scan the index in physical order, and clean up any possible mess in
2165  * each page.
2166  */
2167  nblocks = RelationGetNumberOfBlocks(idxrel);
2168  for (blkno = 0; blkno < nblocks; blkno++)
2169  {
2170  Buffer buf;
2171 
2172  CHECK_FOR_INTERRUPTS();
2173 
2174  buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
2175  RBM_NORMAL, strategy);
2176 
2177  brin_page_cleanup(idxrel, buf);
2178 
2179  ReleaseBuffer(buf);
2180  }
2181 
2182  /*
2183  * Update all upper pages in the index's FSM, as well. This ensures not
2184  * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
2185  * but also that any pre-existing damage or out-of-dateness is repaired.
2186  */
2187  FreeSpaceMapVacuum(idxrel);
2188 }
2189 
2190 static bool
2191 add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
2192  const Datum *values, const bool *nulls)
2193 {
2194  int keyno;
2195 
2196  /* If the range starts empty, we're certainly going to modify it. */
2197  bool modified = dtup->bt_empty_range;
2198 
2199  /*
2200  * Compare the key values of the new tuple to the stored index values; our
2201  * deformed tuple will get updated if the new tuple doesn't fit the
2202  * original range (note this means we can't break out of the loop early).
2203  * Make a note of whether this happens, so that we know to insert the
2204  * modified tuple later.
2205  */
2206  for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2207  {
2208  Datum result;
2209  BrinValues *bval;
2210  FmgrInfo *addValue;
2211  bool has_nulls;
2212 
2213  bval = &dtup->bt_columns[keyno];
2214 
2215  /*
2216  * Does the range have actual NULL values? Either of the flags can be
2217  * set, but we ignore the state before adding the first row.
2218  *
2219  * We have to remember this, because we'll modify the flags and we
2220  * need to know if the range started as empty.
2221  */
2222  has_nulls = ((!dtup->bt_empty_range) &&
2223  (bval->bv_hasnulls || bval->bv_allnulls));
2224 
2225  /*
2226  * If the value we're adding is NULL, handle it locally. Otherwise
2227  * call the BRIN_PROCNUM_ADDVALUE procedure.
2228  */
2229  if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno])
2230  {
2231  /*
2232  * If the new value is null, we record that we saw it if it's the
2233  * first one; otherwise, there's nothing to do.
2234  */
2235  if (!bval->bv_hasnulls)
2236  {
2237  bval->bv_hasnulls = true;
2238  modified = true;
2239  }
2240 
2241  continue;
2242  }
2243 
2244  addValue = index_getprocinfo(idxRel, keyno + 1,
2245  BRIN_PROCNUM_ADDVALUE);
2246  result = FunctionCall4Coll(addValue,
2247  idxRel->rd_indcollation[keyno],
2248  PointerGetDatum(bdesc),
2249  PointerGetDatum(bval),
2250  values[keyno],
2251  nulls[keyno]);
2252  /* if that returned true, we need to insert the updated tuple */
2253  modified |= DatumGetBool(result);
2254 
2255  /*
2256  * If the range had actual NULL values (i.e. did not start empty),
2257  * make sure we don't forget about the NULL values. Either the
2258  * allnulls flag is still set to true, or (if the opclass cleared it)
2259  * we need to set hasnulls=true.
2260  *
2261  * XXX This can only happen when the opclass modified the tuple, so
2262  * the modified flag should be set.
2263  */
2264  if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))
2265  {
2266  Assert(modified);
2267  bval->bv_hasnulls = true;
2268  }
2269  }
2270 
2271  /*
2272  * After updating summaries for all the keys, mark it as not empty.
2273  *
2274  * If we're actually changing the flag value (i.e. tuple started as
2275  * empty), we should have modified the tuple. So we should not see an empty
2276  * range that was not modified.
2277  */
2278  Assert(!dtup->bt_empty_range || modified);
2279  dtup->bt_empty_range = false;
2280 
2281  return modified;
2282 }
2283 
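/*
 * As an illustration of the loop above (hypothetical values, minmax-style
 * opclass): adding 37 to a column summary currently covering [10, 20] widens
 * it to [10, 37] and the BRIN_PROCNUM_ADDVALUE function returns true, so
 * "modified" is set and the caller knows to write the updated tuple back;
 * adding 15 leaves [10, 20] unchanged and returns false.
 */
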
2284 static bool
2285 check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
2286 {
2287  int keyno;
2288 
2289  /*
2290  * First check if there are any IS [NOT] NULL scan keys, and if we're
2291  * violating them.
2292  */
2293  for (keyno = 0; keyno < nnullkeys; keyno++)
2294  {
2295  ScanKey key = nullkeys[keyno];
2296 
2297  Assert(key->sk_attno == bval->bv_attno);
2298 
2299  /* Handle only IS NULL/IS NOT NULL tests */
2300  if (!(key->sk_flags & SK_ISNULL))
2301  continue;
2302 
2303  if (key->sk_flags & SK_SEARCHNULL)
2304  {
2305  /* IS NULL scan key, but range has no NULLs */
2306  if (!bval->bv_allnulls && !bval->bv_hasnulls)
2307  return false;
2308  }
2309  else if (key->sk_flags & SK_SEARCHNOTNULL)
2310  {
2311  /*
2312  * For IS NOT NULL, we can only skip ranges that are known to have
2313  * only nulls.
2314  */
2315  if (bval->bv_allnulls)
2316  return false;
2317  }
2318  else
2319  {
2320  /*
2321  * Neither IS NULL nor IS NOT NULL was used; assume all indexable
2322  * operators are strict and thus return false with NULL value in
2323  * the scan key.
2324  */
2325  return false;
2326  }
2327  }
2328 
2329  return true;
2330 }
2331 
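/*
 * For example, given the rules above, a "WHERE col IS NULL" scan key can only
 * rule out ranges whose summary has both bv_hasnulls and bv_allnulls cleared,
 * while "WHERE col IS NOT NULL" can only rule out ranges marked bv_allnulls;
 * any other combination cannot be excluded on the basis of the null flags
 * alone.
 */
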
2332 static void
2333 _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
2334  bool isconcurrent, int request)
2335 {
2336  ParallelContext *pcxt;
2337  int scantuplesortstates;
2338  Snapshot snapshot;
2339  Size estbrinshared;
2340  Size estsort;
2341  BrinShared *brinshared;
2342  Sharedsort *sharedsort;
2343  BrinLeader *brinleader = (BrinLeader *) palloc0(sizeof(BrinLeader));
2344  WalUsage *walusage;
2345  BufferUsage *bufferusage;
2346  bool leaderparticipates = true;
2347  int querylen;
2348 
2349 #ifdef DISABLE_LEADER_PARTICIPATION
2350  leaderparticipates = false;
2351 #endif
2352 
2353  /*
2354  * Enter parallel mode, and create context for parallel build of brin
2355  * index
2356  */
2357  EnterParallelMode();
2358  Assert(request > 0);
2359  pcxt = CreateParallelContext("postgres", "_brin_parallel_build_main",
2360  request);
2361 
2362  scantuplesortstates = leaderparticipates ? request + 1 : request;
2363 
2364  /*
2365  * Prepare for scan of the base relation. In a normal index build, we use
2366  * SnapshotAny because we must retrieve all tuples and do our own time
2367  * qual checks (because we have to index RECENTLY_DEAD tuples). In a
2368  * concurrent build, we take a regular MVCC snapshot and index whatever's
2369  * live according to that.
2370  */
2371  if (!isconcurrent)
2372  snapshot = SnapshotAny;
2373  else
2374  snapshot = RegisterSnapshot(GetTransactionSnapshot());
2375 
2376  /*
2377  * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace.
2378  */
2379  estbrinshared = _brin_parallel_estimate_shared(heap, snapshot);
2380  shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared);
2381  estsort = tuplesort_estimate_shared(scantuplesortstates);
2382  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
2383 
2384  shm_toc_estimate_keys(&pcxt->estimator, 2);
2385 
2386  /*
2387  * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
2388  * and PARALLEL_KEY_BUFFER_USAGE.
2389  *
2390  * If there are no extensions loaded that care, we could skip this. We
2391  * have no way of knowing whether anyone's looking at pgWalUsage or
2392  * pgBufferUsage, so do it unconditionally.
2393  */
2394  shm_toc_estimate_chunk(&pcxt->estimator,
2395  mul_size(sizeof(WalUsage), pcxt->nworkers));
2396  shm_toc_estimate_keys(&pcxt->estimator, 1);
2397  shm_toc_estimate_chunk(&pcxt->estimator,
2398  mul_size(sizeof(BufferUsage), pcxt->nworkers));
2399  shm_toc_estimate_keys(&pcxt->estimator, 1);
2400 
2401  /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
2402  if (debug_query_string)
2403  {
2404  querylen = strlen(debug_query_string);
2405  shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
2406  shm_toc_estimate_keys(&pcxt->estimator, 1);
2407  }
2408  else
2409  querylen = 0; /* keep compiler quiet */
2410 
2411  /* Everyone's had a chance to ask for space, so now create the DSM */
2412  InitializeParallelDSM(pcxt);
2413 
2414  /* If no DSM segment was available, back out (do serial build) */
2415  if (pcxt->seg == NULL)
2416  {
2417  if (IsMVCCSnapshot(snapshot))
2418  UnregisterSnapshot(snapshot);
2419  DestroyParallelContext(pcxt);
2420  ExitParallelMode();
2421  return;
2422  }
2423 
2424  /* Store shared build state, for which we reserved space */
2425  brinshared = (BrinShared *) shm_toc_allocate(pcxt->toc, estbrinshared);
2426  /* Initialize immutable state */
2427  brinshared->heaprelid = RelationGetRelid(heap);
2428  brinshared->indexrelid = RelationGetRelid(index);
2429  brinshared->isconcurrent = isconcurrent;
2430  brinshared->scantuplesortstates = scantuplesortstates;
2431  brinshared->pagesPerRange = buildstate->bs_pagesPerRange;
2432  ConditionVariableInit(&brinshared->workersdonecv);
2433  SpinLockInit(&brinshared->mutex);
2434 
2435  /* Initialize mutable state */
2436  brinshared->nparticipantsdone = 0;
2437  brinshared->reltuples = 0.0;
2438  brinshared->indtuples = 0.0;
2439 
2440  table_parallelscan_initialize(heap,
2441  ParallelTableScanFromBrinShared(brinshared),
2442  snapshot);
2443 
2444  /*
2445  * Store shared tuplesort-private state, for which we reserved space.
2446  * Then, initialize opaque state using tuplesort routine.
2447  */
2448  sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
2449  tuplesort_initialize_shared(sharedsort, scantuplesortstates,
2450  pcxt->seg);
2451 
2452  /*
2453  * Add both shared states to the DSM segment's table of contents, so that
2454  * worker processes can look them up later.
2455  */
2456  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BRIN_SHARED, brinshared);
2457  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
2458 
2459  /* Store query string for workers */
2460  if (debug_query_string)
2461  {
2462  char *sharedquery;
2463 
2464  sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
2465  memcpy(sharedquery, debug_query_string, querylen + 1);
2466  shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
2467  }
2468 
2469  /*
2470  * Allocate space for each worker's WalUsage and BufferUsage; no need to
2471  * initialize.
2472  */
2473  walusage = shm_toc_allocate(pcxt->toc,
2474  mul_size(sizeof(WalUsage), pcxt->nworkers));
2475  shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
2476  bufferusage = shm_toc_allocate(pcxt->toc,
2477  mul_size(sizeof(BufferUsage), pcxt->nworkers));
2478  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
2479 
2480  /* Launch workers, saving status for leader/caller */
2481  LaunchParallelWorkers(pcxt);
2482  brinleader->pcxt = pcxt;
2483  brinleader->nparticipanttuplesorts = pcxt->nworkers_launched;
2484  if (leaderparticipates)
2485  brinleader->nparticipanttuplesorts++;
2486  brinleader->brinshared = brinshared;
2487  brinleader->sharedsort = sharedsort;
2488  brinleader->snapshot = snapshot;
2489  brinleader->walusage = walusage;
2490  brinleader->bufferusage = bufferusage;
2491 
2492  /* If no workers were successfully launched, back out (do serial build) */
2493  if (pcxt->nworkers_launched == 0)
2494  {
2495  _brin_end_parallel(brinleader, NULL);
2496  return;
2497  }
2498 
2499  /* Save leader state now that it's clear build will be parallel */
2500  buildstate->bs_leader = brinleader;
2501 
2502  /* Join heap scan ourselves */
2503  if (leaderparticipates)
2504  _brin_leader_participate_as_worker(buildstate, heap, index);
2505 
2506  /*
2507  * Caller needs to wait for all launched workers when we return. Make
2508  * sure that the failure-to-start case will not hang forever.
2509  */
2510  WaitForParallelWorkersToAttach(pcxt);
2511 }
2512 
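/*
 * To summarize the layout set up above: the DSM table of contents ends up
 * with entries for PARALLEL_KEY_BRIN_SHARED, PARALLEL_KEY_TUPLESORT,
 * PARALLEL_KEY_WAL_USAGE and PARALLEL_KEY_BUFFER_USAGE, plus
 * PARALLEL_KEY_QUERY_TEXT when a query string is available; workers look the
 * same keys up again in _brin_parallel_build_main().
 */
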
2513 /*
2514  * Shut down workers, destroy parallel context, and end parallel mode.
2515  */
2516 static void
2517 _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
2518 {
2519  int i;
2520  BrinTuple *btup;
2521  BrinMemTuple *memtuple = NULL;
2522  Size tuplen;
2523  BrinShared *brinshared = brinleader->brinshared;
2524  BlockNumber prevblkno = InvalidBlockNumber;
2525  MemoryContext rangeCxt,
2526  oldCxt;
2527 
2528  /* Shutdown worker processes */
2529  WaitForParallelWorkersToFinish(brinleader->pcxt);
2530 
2531  /*
2532  * If we didn't actually launch workers, we still have to make sure to
2533  * exit parallel mode.
2534  */
2535  if (!state)
2536  goto cleanup;
2537 
2538  /* copy the data into leader state (we have to wait for the workers) */
2539  state->bs_reltuples = brinshared->reltuples;
2540  state->bs_numtuples = brinshared->indtuples;
2541 
2542  /* do the actual sort in the leader */
2543  tuplesort_performsort(state->bs_sortstate);
2544 
2545  /*
2546  * Initialize the BrinMemTuple we'll use to union summaries from workers (in
2547  * case they happened to produce parts of the same page range).
2548  */
2549  memtuple = brin_new_memtuple(state->bs_bdesc);
2550 
2551  /*
2552  * Create a memory context we'll reset to combine results for a single
2553  * page range (received from the workers). We don't expect a huge number of
2554  * overlaps under regular circumstances, because for large tables the
2555  * chunk size is likely larger than the BRIN page range, but it can
2556  * happen, and the union functions may do all kinds of stuff. So we better
2557  * reset the context once in a while.
2558  */
2559  rangeCxt = AllocSetContextCreate(CurrentMemoryContext,
2560  "brin union",
2561  ALLOCSET_DEFAULT_SIZES);
2562  oldCxt = MemoryContextSwitchTo(rangeCxt);
2563 
2564  /*
2565  * Read the BRIN tuples from the shared tuplesort, sorted by block number.
2566  * That probably gives us an index that is cheaper to scan, thanks to
2567  * mostly getting data from the same index page as before.
2568  */
2569  while ((btup = tuplesort_getbrintuple(state->bs_sortstate, &tuplen, true)) != NULL)
2570  {
2571  /* Ranges should be multiples of pages_per_range for the index. */
2572  Assert(btup->bt_blkno % brinshared->pagesPerRange == 0);
2573 
2574  /*
2575  * Do we need to union summaries for the same page range?
2576  *
2577  * If this is the first brin tuple we read, then just deform it into
2578  * the memtuple, and continue with the next one from tuplesort. We
2579  * however may need to insert empty summaries into the index.
2580  *
2581  * If it's the same block as the last we saw, we simply union the brin
2582  * tuple into it, and we're done - we don't even need to insert empty
2583  * ranges, because that was done earlier when we saw the first brin
2584  * tuple (for this range).
2585  *
2586  * Finally, if it's not the first brin tuple, and it's not the same
2587  * page range, we need to do the insert and then deform the tuple into
2588  * the memtuple. Then we'll insert empty ranges before the new brin
2589  * tuple, if needed.
2590  */
2591  if (prevblkno == InvalidBlockNumber)
2592  {
2593  /* First brin tuple, just deform it into the memtuple. */
2594  memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2595 
2596  /* continue to insert empty ranges before this block */
2597  }
2598  else if (memtuple->bt_blkno == btup->bt_blkno)
2599  {
2600  /*
2601  * Not the first brin tuple, but same page range as the previous
2602  * one, so we can merge it into the memtuple.
2603  */
2604  union_tuples(state->bs_bdesc, memtuple, btup);
2605  continue;
2606  }
2607  else
2608  {
2609  BrinTuple *tmp;
2610  Size len;
2611 
2612  /*
2613  * We got brin tuple for a different page range, so form a brin
2614  * tuple from the memtuple, insert it, and re-init the memtuple
2615  * from the new brin tuple.
2616  */
2617  tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2618  memtuple, &len);
2619 
2620  brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2621  &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2622 
2623  /*
2624  * Reset the per-output-range context. This frees all the memory
2625  * possibly allocated by the union functions, and also the BRIN
2626  * tuple we just formed and inserted.
2627  */
2628  MemoryContextReset(rangeCxt);
2629 
2630  memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2631 
2632  /* continue to insert empty ranges before this block */
2633  }
2634 
2635  /* Fill empty ranges for all ranges missing in the tuplesort. */
2636  brin_fill_empty_ranges(state, prevblkno, btup->bt_blkno);
2637 
2638  prevblkno = btup->bt_blkno;
2639  }
2640 
2641  tuplesort_end(state->bs_sortstate);
2642 
2643  /* Form and insert the BRIN tuple for the last page range, if any. */
2644  if (prevblkno != InvalidBlockNumber)
2645  {
2646  BrinTuple *tmp;
2647  Size len;
2648 
2649  tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2650  memtuple, &len);
2651 
2652  brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2653  &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2654 
2655  pfree(tmp);
2656  }
2657 
2658  /* Fill empty ranges at the end, for all ranges missing in the tuplesort. */
2659  brin_fill_empty_ranges(state, prevblkno, state->bs_maxRangeStart);
2660 
2661  /*
2662  * Switch back to the original memory context, and destroy the one we
2663  * created to isolate the union_tuple calls.
2664  */
2665  MemoryContextSwitchTo(oldCxt);
2666  MemoryContextDelete(rangeCxt);
2667 
2668  /*
2669  * Next, accumulate WAL and buffer usage. (This must wait for the workers
2670  * to finish, or we might get incomplete data.)
2671  */
2672  for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)
2673  InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);
2674 
2675 cleanup:
2676 
2677  /* Free last reference to MVCC snapshot, if one was used */
2678  if (IsMVCCSnapshot(brinleader->snapshot))
2679  UnregisterSnapshot(brinleader->snapshot);
2680  DestroyParallelContext(brinleader->pcxt);
2681  ExitParallelMode();
2682 }
2683 
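/*
 * A hypothetical run of the merge loop above, with pagesPerRange = 128: if
 * the tuplesort returns tuples for blocks 0, 0 and 384, the two block-0
 * tuples are unioned into the memtuple; when the block-384 tuple arrives, the
 * merged block-0 summary is inserted and empty summaries are written for
 * blocks 128 and 256; after the loop the block-384 summary is inserted,
 * followed by empty summaries for any remaining ranges up to bs_maxRangeStart.
 */
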
2684 /*
2685  * Returns size of shared memory required to store state for a parallel
2686  * brin index build based on the snapshot its parallel scan will use.
2687  */
2688 static Size
2689 _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
2690 {
2691  /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
2692  return add_size(BUFFERALIGN(sizeof(BrinShared)),
2693  table_parallelscan_estimate(heap, snapshot));
2694 }
2695 
2696 /*
2697  * Within leader, participate as a parallel worker.
2698  */
2699 static void
2700 _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)
2701 {
2702  BrinLeader *brinleader = buildstate->bs_leader;
2703  int sortmem;
2704 
2705  /*
2706  * Might as well use a reliable figure when doling out maintenance_work_mem
2707  * (when the requested number of workers was not launched, this will be
2708  * somewhat higher than it is for other workers).
2709  */
2710  sortmem = maintenance_work_mem / brinleader->nparticipanttuplesorts;
2711 
2712  /* Perform work common to all participants */
2713  _brin_parallel_scan_and_build(buildstate, brinleader->brinshared,
2714  brinleader->sharedsort, heap, index, sortmem, true);
2715 }
2716 
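/*
 * The arithmetic above simply splits maintenance_work_mem evenly across all
 * participating tuplesorts.  For example (hypothetical numbers):
 *
 *     int maintenance_work_mem = 262144;    // kB
 *     int nparticipanttuplesorts = 3;       // two workers plus the leader
 *     int sortmem = maintenance_work_mem / nparticipanttuplesorts;  // 87381 kB
 */
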
2717 /*
2718  * Perform a worker's portion of a parallel sort.
2719  *
2720  * This generates a tuplesort for the worker portion of the table.
2721  *
2722  * sortmem is the amount of working memory to use within each worker,
2723  * expressed in KBs.
2724  *
2725  * When this returns, workers are done, and need only release resources.
2726  */
2727 static void
2728 _brin_parallel_scan_and_build(BrinBuildState *state,
2729  BrinShared *brinshared, Sharedsort *sharedsort,
2730  Relation heap, Relation index,
2731  int sortmem, bool progress)
2732 {
2733  SortCoordinate coordinate;
2734  TableScanDesc scan;
2735  double reltuples;
2736  IndexInfo *indexInfo;
2737 
2738  /* Initialize local tuplesort coordination state */
2739  coordinate = palloc0(sizeof(SortCoordinateData));
2740  coordinate->isWorker = true;
2741  coordinate->nParticipants = -1;
2742  coordinate->sharedsort = sharedsort;
2743 
2744  /* Begin "partial" tuplesort */
2745  state->bs_sortstate = tuplesort_begin_index_brin(sortmem, coordinate,
2746  TUPLESORT_NONE);
2747 
2748  /* Join parallel scan */
2749  indexInfo = BuildIndexInfo(index);
2750  indexInfo->ii_Concurrent = brinshared->isconcurrent;
2751 
2752  scan = table_beginscan_parallel(heap,
2753  ParallelTableScanFromBrinShared(brinshared));
2754 
2755  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
2756  brinbuildCallbackParallel, state, scan);
2757 
2758  /* insert the last item */
2759  form_and_spill_tuple(state);
2760 
2761  /* sort the BRIN ranges built by this worker */
2762  tuplesort_performsort(state->bs_sortstate);
2763 
2764  state->bs_reltuples += reltuples;
2765 
2766  /*
2767  * Done. Record ambuild statistics.
2768  */
2769  SpinLockAcquire(&brinshared->mutex);
2770  brinshared->nparticipantsdone++;
2771  brinshared->reltuples += state->bs_reltuples;
2772  brinshared->indtuples += state->bs_numtuples;
2773  SpinLockRelease(&brinshared->mutex);
2774 
2775  /* Notify leader */
2776  ConditionVariableSignal(&brinshared->workersdonecv);
2777 
2778  tuplesort_end(state->bs_sortstate);
2779 }
2780 
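/*
 * Note that the parallel chunks handed out by table_beginscan_parallel() are
 * not required to line up with BRIN page range boundaries, so two workers can
 * each produce a partial summary for the same range; that is why the leader
 * unions tuples with equal block numbers in _brin_end_parallel() rather than
 * assuming one tuple per range.
 */
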
2781 /*
2782  * Perform work within a launched parallel process.
2783  */
2784 void
2785 _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
2786 {
2787  char *sharedquery;
2788  BrinShared *brinshared;
2789  Sharedsort *sharedsort;
2790  BrinBuildState *buildstate;
2791  Relation heapRel;
2792  Relation indexRel;
2793  LOCKMODE heapLockmode;
2794  LOCKMODE indexLockmode;
2795  WalUsage *walusage;
2796  BufferUsage *bufferusage;
2797  int sortmem;
2798 
2799  /*
2800  * The only possible status flag that can be set for the parallel worker is
2801  * PROC_IN_SAFE_IC.
2802  */
2803  Assert((MyProc->statusFlags == 0) ||
2804  (MyProc->statusFlags == PROC_IN_SAFE_IC));
2805 
2806  /* Set debug_query_string for individual workers first */
2807  sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
2808  debug_query_string = sharedquery;
2809 
2810  /* Report the query string from leader */
2811  pgstat_report_activity(STATE_RUNNING, debug_query_string);
2812 
2813  /* Look up brin shared state */
2814  brinshared = shm_toc_lookup(toc, PARALLEL_KEY_BRIN_SHARED, false);
2815 
2816  /* Open relations using lock modes known to be obtained by index.c */
2817  if (!brinshared->isconcurrent)
2818  {
2819  heapLockmode = ShareLock;
2820  indexLockmode = AccessExclusiveLock;
2821  }
2822  else
2823  {
2824  heapLockmode = ShareUpdateExclusiveLock;
2825  indexLockmode = RowExclusiveLock;
2826  }
2827 
2828  /* Open relations within worker */
2829  heapRel = table_open(brinshared->heaprelid, heapLockmode);
2830  indexRel = index_open(brinshared->indexrelid, indexLockmode);
2831 
2832  buildstate = initialize_brin_buildstate(indexRel, NULL,
2833  brinshared->pagesPerRange,
2834  InvalidBlockNumber);
2835 
2836  /* Look up shared state private to tuplesort.c */
2837  sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
2838  tuplesort_attach_shared(sharedsort, seg);
2839 
2840  /* Prepare to track buffer usage during parallel execution */
2841  InstrStartParallelQuery();
2842 
2843  /*
2844  * Might as well use a reliable figure when doling out maintenance_work_mem
2845  * (when the requested number of workers was not launched, this will be
2846  * somewhat higher than it is for other workers).
2847  */
2848  sortmem = maintenance_work_mem / brinshared->scantuplesortstates;
2849 
2850  _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort,
2851  heapRel, indexRel, sortmem, false);
2852 
2853  /* Report WAL/buffer usage during parallel execution */
2854  bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
2855  walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
2856  InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
2857  &walusage[ParallelWorkerNumber]);
2858 
2859  index_close(indexRel, indexLockmode);
2860  table_close(heapRel, heapLockmode);
2861 }
2862 
2863 /*
2864  * brin_build_empty_tuple
2865  * Maybe initialize a BRIN tuple representing an empty range.
2866  *
2867  * Sets state->bs_emptyTuple to a BRIN tuple representing an empty page range
2868  * starting at the specified block number. The empty tuple is initialized only
2869  * once, when it's first needed, stored in the memory context bs_context to
2870  * ensure a proper life span, and reused on following calls. All empty tuples
2871  * are exactly the same except for the bt_blkno field, which is set to the
2872  * value of the blkno parameter.
2873  */
2874 static void
2875 brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
2876 {
2877  /* First time an empty tuple is requested? If yes, initialize it. */
2878  if (state->bs_emptyTuple == NULL)
2879  {
2880  MemoryContext oldcxt;
2881  BrinMemTuple *dtuple = brin_new_memtuple(state->bs_bdesc);
2882 
2883  /* Allocate the tuple in context for the whole index build. */
2884  oldcxt = MemoryContextSwitchTo(state->bs_context);
2885 
2886  state->bs_emptyTuple = brin_form_tuple(state->bs_bdesc, blkno, dtuple,
2887  &state->bs_emptyTupleLen);
2888 
2889  MemoryContextSwitchTo(oldcxt);
2890  }
2891  else
2892  {
2893  /* If we already have an empty tuple, just update the block. */
2894  state->bs_emptyTuple->bt_blkno = blkno;
2895  }
2896 }
2897 
2898 /*
2899  * brin_fill_empty_ranges
2900  * Add BRIN index tuples representing empty page ranges.
2901  *
2902  * prevRange/nextRange determine for which page ranges to add empty summaries.
2903  * Both boundaries are exclusive, i.e. only ranges starting at blkno for which
2904  * (prevRange < blkno < nextRange) will be added to the index.
2905  *
2906  * If prevRange is InvalidBlockNumber, this means there was no previous page
2907  * range (i.e. the first empty range to add is for blkno=0).
2908  *
2909  * The empty tuple is built only once, and then reused for all future calls.
2910  */
2911 static void
2912 brin_fill_empty_ranges(BrinBuildState *state,
2913  BlockNumber prevRange, BlockNumber nextRange)
2914 {
2915  BlockNumber blkno;
2916 
2917  /*
2918  * If we already summarized some ranges, we need to start with the next
2919  * one. Otherwise start from the first range of the table.
2920  */
2921  blkno = (prevRange == InvalidBlockNumber) ? 0 : (prevRange + state->bs_pagesPerRange);
2922 
2923  /* Generate empty ranges until we hit the next non-empty range. */
2924  while (blkno < nextRange)
2925  {
2926  /* Did we already build the empty tuple? If not, do it now. */
2927  brin_build_empty_tuple(state, blkno);
2928 
2929  brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2930  &state->bs_currentInsertBuf,
2931  blkno, state->bs_emptyTuple, state->bs_emptyTupleLen);
2932 
2933  /* try next page range */
2934  blkno += state->bs_pagesPerRange;
2935  }
2936 }
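
/*
 * A worked example of the fill logic above, assuming pagesPerRange = 128:
 * with prevRange = 256 and nextRange = 768, empty summaries are inserted for
 * the ranges starting at blocks 384, 512 and 640; with
 * prevRange = InvalidBlockNumber the fill instead starts at block 0.
 */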
@ ACLCHECK_NOT_OWNER
Definition: acl.h:185
void aclcheck_error(AclResult aclerr, ObjectType objtype, const char *objectname)
Definition: aclchk.c:2688
bool object_ownercheck(Oid classid, Oid objectid, Oid roleid)
Definition: aclchk.c:4130
int16 AttrNumber
Definition: attnum.h:21
bool AutoVacuumRequestWork(AutoVacuumWorkItemType type, Oid relationId, BlockNumber blkno)
Definition: autovacuum.c:3182
@ AVW_BRINSummarizeRange
Definition: autovacuum.h:25
int ParallelWorkerNumber
Definition: parallel.c:112
void InitializeParallelDSM(ParallelContext *pcxt)
Definition: parallel.c:205
void WaitForParallelWorkersToFinish(ParallelContext *pcxt)
Definition: parallel.c:775
void LaunchParallelWorkers(ParallelContext *pcxt)
Definition: parallel.c:552
void DestroyParallelContext(ParallelContext *pcxt)
Definition: parallel.c:929
ParallelContext * CreateParallelContext(const char *library_name, const char *function_name, int nworkers)
Definition: parallel.c:167
void WaitForParallelWorkersToAttach(ParallelContext *pcxt)
Definition: parallel.c:672
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define MaxBlockNumber
Definition: block.h:35
static Datum values[MAXATTR]
Definition: bootstrap.c:152
static void cleanup(void)
Definition: bootstrap.c:682
#define PARALLEL_KEY_BUFFER_USAGE
Definition: brin.c:51
IndexBulkDeleteResult * brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition: brin.c:1303
static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
Definition: brin.c:2158
Datum brin_desummarize_range(PG_FUNCTION_ARGS)
Definition: brin.c:1477
void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition: brin.c:942
static void terminate_brin_buildstate(BrinBuildState *state)
Definition: brin.c:1702
#define PARALLEL_KEY_BRIN_SHARED
Definition: brin.c:47
Datum brin_summarize_range(PG_FUNCTION_ARGS)
Definition: brin.c:1366
static void form_and_spill_tuple(BrinBuildState *state)
Definition: brin.c:1992
#define BRIN_ALL_BLOCKRANGES
Definition: brin.c:206
struct BrinShared BrinShared
Datum brin_summarize_new_values(PG_FUNCTION_ARGS)
Definition: brin.c:1351
IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys)
Definition: brin.c:524
IndexBuildResult * brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: brin.c:1089
int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition: brin.c:552
static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, bool include_partial, double *numSummarized, double *numExisting)
Definition: brin.c:1873
static void form_and_insert_tuple(BrinBuildState *state)
Definition: brin.c:1971
void brinbuildempty(Relation index)
Definition: brin.c:1259
void brin_free_desc(BrinDesc *bdesc)
Definition: brin.c:1622
void brininsertcleanup(IndexInfo *indexInfo)
Definition: brin.c:501
struct BrinInsertState BrinInsertState
static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
Definition: brin.c:2017
static BrinBuildState * initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap, BlockNumber pagesPerRange, BlockNumber tablePages)
Definition: brin.c:1655
static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request)
Definition: brin.c:2333
void brinGetStats(Relation index, BrinStatsData *stats)
Definition: brin.c:1634
BrinDesc * brin_build_desc(Relation rel)
Definition: brin.c:1567
static void _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)
Definition: brin.c:2700
static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup, const Datum *values, const bool *nulls)
Definition: brin.c:2191
static void _brin_parallel_scan_and_build(BrinBuildState *buildstate, BrinShared *brinshared, Sharedsort *sharedsort, Relation heap, Relation index, int sortmem, bool progress)
Definition: brin.c:2728
static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
Definition: brin.c:2517
static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
Definition: brin.c:2689
struct BrinBuildState BrinBuildState
struct BrinLeader BrinLeader
static void brin_fill_empty_ranges(BrinBuildState *state, BlockNumber prevRange, BlockNumber maxRange)
Definition: brin.c:2912
struct BrinOpaque BrinOpaque
static void summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, BlockNumber heapBlk, BlockNumber heapNumBlks)
Definition: brin.c:1747
#define ParallelTableScanFromBrinShared(shared)
Definition: brin.c:113
#define PARALLEL_KEY_TUPLESORT
Definition: brin.c:48
static void brinbuildCallbackParallel(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *brstate)
Definition: brin.c:1030
bool brininsert(Relation idxRel, Datum *values, bool *nulls, ItemPointer heaptid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition: brin.c:333
#define PARALLEL_KEY_QUERY_TEXT
Definition: brin.c:49
Datum brinhandler(PG_FUNCTION_ARGS)
Definition: brin.c:245
void _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Definition: brin.c:2785
static void brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
Definition: brin.c:2875
#define PARALLEL_KEY_WAL_USAGE
Definition: brin.c:50
bytea * brinoptions(Datum reloptions, bool validate)
Definition: brin.c:1333
IndexBulkDeleteResult * brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition: brin.c:1288
static BrinInsertState * initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
Definition: brin.c:304
static void brinbuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *brstate)
Definition: brin.c:979
void brinendscan(IndexScanDesc scan)
Definition: brin.c:962
static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
Definition: brin.c:2285
#define BrinGetPagesPerRange(relation)
Definition: brin.h:40
#define BrinGetAutoSummarize(relation)
Definition: brin.h:46
#define BRIN_LAST_OPTIONAL_PROCNUM
Definition: brin_internal.h:78
#define BRIN_PROCNUM_UNION
Definition: brin_internal.h:73
#define BRIN_PROCNUM_OPTIONS
Definition: brin_internal.h:75
#define BRIN_PROCNUM_OPCINFO
Definition: brin_internal.h:70
#define BRIN_PROCNUM_CONSISTENT
Definition: brin_internal.h:72
#define BRIN_elog(args)
Definition: brin_internal.h:85
#define BRIN_PROCNUM_ADDVALUE
Definition: brin_internal.h:71
#define BRIN_CURRENT_VERSION
Definition: brin_page.h:72
#define BRIN_METAPAGE_BLKNO
Definition: brin_page.h:75
bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, BlockNumber heapBlk, Buffer oldbuf, OffsetNumber oldoff, const BrinTuple *origtup, Size origsz, const BrinTuple *newtup, Size newsz, bool samepage)
Definition: brin_pageops.c:53
void brin_page_cleanup(Relation idxrel, Buffer buf)
Definition: brin_pageops.c:624
OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, BrinTuple *tup, Size itemsz)
Definition: brin_pageops.c:342
void brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
Definition: brin_pageops.c:486
bool brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
Definition: brin_pageops.c:323
bool brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk)
Definition: brin_revmap.c:323
void brinRevmapTerminate(BrinRevmap *revmap)
Definition: brin_revmap.c:100
BrinTuple * brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, Buffer *buf, OffsetNumber *off, Size *size, int mode)
Definition: brin_revmap.c:194
BrinRevmap * brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange)
Definition: brin_revmap.c:70
BrinTuple * brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, Size *size)
Definition: brin_tuple.c:99
BrinMemTuple * brin_new_memtuple(BrinDesc *brdesc)
Definition: brin_tuple.c:482
BrinMemTuple * brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple)
Definition: brin_tuple.c:553
BrinMemTuple * brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
Definition: brin_tuple.c:511
BrinTuple * brin_copy_tuple(BrinTuple *tuple, Size len, BrinTuple *dest, Size *destsz)
Definition: brin_tuple.c:446
void brin_free_tuple(BrinTuple *tuple)
Definition: brin_tuple.c:433
BrinTuple * brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size)
Definition: brin_tuple.c:388
bool brinvalidate(Oid opclassoid)
Definition: brin_validate.c:37
#define SizeOfBrinCreateIdx
Definition: brin_xlog.h:55
#define XLOG_BRIN_CREATE_INDEX
Definition: brin_xlog.h:31
int Buffer
Definition: buf.h:23
#define BufferIsInvalid(buffer)
Definition: buf.h:31
#define InvalidBuffer
Definition: buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3377
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:838
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4560
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4577
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2189
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4795
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:781
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:734
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:157
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:158
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:229
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:350
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:73
@ EB_LOCK_FIRST
Definition: bufmgr.h:85
@ RBM_NORMAL
Definition: bufmgr.h:44
#define BMR_REL(p_rel)
Definition: bufmgr.h:106
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:301
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:907
static char * PageGetContents(Page page)
Definition: bufpage.h:254
Pointer Page
Definition: bufpage.h:78
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:240
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:388
#define Min(x, y)
Definition: c.h:991
#define MAXALIGN(LEN)
Definition: c.h:798
signed int int32
Definition: c.h:481
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:169
#define BUFFERALIGN(LEN)
Definition: c.h:800
#define lengthof(array)
Definition: c.h:775
#define OidIsValid(objectId)
Definition: c.h:762
size_t Size
Definition: c.h:592
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSignal(ConditionVariable *cv)
Datum datumCopy(Datum value, bool typByVal, int typLen)
Definition: datum.c:132
int errhint(const char *fmt,...)
Definition: elog.c:1319
int errcode(int sqlerrcode)
Definition: elog.c:859
int errmsg(const char *fmt,...)
Definition: elog.c:1072
#define LOG
Definition: elog.h:31
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
#define ereport(elevel,...)
Definition: elog.h:149
#define palloc_object(type)
Definition: fe_memutils.h:62
#define palloc_array(type, count)
Definition: fe_memutils.h:64
#define palloc0_array(type, count)
Definition: fe_memutils.h:65
#define palloc0_object(type)
Definition: fe_memutils.h:63
Datum FunctionCall4Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3, Datum arg4)
Definition: fmgr.c:1196
Datum Int64GetDatum(int64 X)
Definition: fmgr.c:1807
Datum FunctionCall3Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3)
Definition: fmgr.c:1171
void fmgr_info_copy(FmgrInfo *dstinfo, FmgrInfo *srcinfo, MemoryContext destcxt)
Definition: fmgr.c:580
#define PG_RETURN_VOID()
Definition: fmgr.h:349
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define DirectFunctionCall2(func, arg1, arg2)
Definition: fmgr.h:644
#define PG_GETARG_DATUM(n)
Definition: fmgr.h:268
#define PG_GETARG_INT64(n)
Definition: fmgr.h:283
#define FunctionCall1(flinfo, arg1)
Definition: fmgr.h:660
#define PG_RETURN_INT32(x)
Definition: fmgr.h:354
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
void FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
Definition: freespace.c:354
void FreeSpaceMapVacuum(Relation rel)
Definition: freespace.c:335
void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
Definition: freespace.c:182
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:78
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition: genam.h:87
IndexUniqueCheck
Definition: genam.h:116
int maintenance_work_mem
Definition: globals.c:130
int NewGUCNestLevel(void)
Definition: guc.c:2237
void RestrictSearchPath(void)
Definition: guc.c:2248
void AtEOXact_GUC(bool isCommit, int nestLevel)
Definition: guc.c:2264
Oid IndexGetRelation(Oid indexId, bool missing_ok)
Definition: index.c:3520
IndexInfo * BuildIndexInfo(Relation index)
Definition: index.c:2407
void index_close(Relation relation, LOCKMODE lockmode)
Definition: indexam.c:177
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:863
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition: indexam.c:133
void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:218
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:208
void InstrStartParallelQuery(void)
Definition: instrument.c:200
int b
Definition: isn.c:70
int a
Definition: isn.c:69
int i
Definition: isn.c:73
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
Assert(fmt[strlen(fmt) - 1] !='\n')
int LOCKMODE
Definition: lockdefs.h:26
#define AccessExclusiveLock
Definition: lockdefs.h:43
#define AccessShareLock
Definition: lockdefs.h:36
#define ShareUpdateExclusiveLock
Definition: lockdefs.h:39
#define ShareLock
Definition: lockdefs.h:40
#define RowExclusiveLock
Definition: lockdefs.h:38
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:371
void pfree(void *pointer)
Definition: mcxt.c:1508
void * palloc0(Size size)
Definition: mcxt.c:1334
MemoryContext CurrentMemoryContext
Definition: mcxt.c:131
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:442
void * palloc(Size size)
Definition: mcxt.c:1304
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:153
#define ALLOCSET_SMALL_SIZES
Definition: memutils.h:163
#define SECURITY_RESTRICTED_OPERATION
Definition: miscadmin.h:315
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
void GetUserIdAndSecContext(Oid *userid, int *sec_context)
Definition: miscinit.c:635
Oid GetUserId(void)
Definition: miscinit.c:514
void SetUserIdAndSecContext(Oid userid, int sec_context)
Definition: miscinit.c:642
#define makeNode(_type_)
Definition: nodes.h:155
uint16 OffsetNumber
Definition: off.h:24
#define FirstOffsetNumber
Definition: off.h:27
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
@ OBJECT_INDEX
Definition: parsenodes.h:2129
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:209
const void size_t len
static char * buf
Definition: pg_test_fsync.c:73
int progress
Definition: pgbench.c:261
#define ERRCODE_UNDEFINED_TABLE
Definition: pgbench.c:78
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:625
const char * debug_query_string
Definition: postgres.c:87
static bool DatumGetBool(Datum X)
Definition: postgres.h:90
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
uintptr_t Datum
Definition: postgres.h:64
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
#define PROC_IN_SAFE_IC
Definition: proc.h:59
static void addrange(struct cvec *cv, chr from, chr to)
Definition: regc_cvec.c:90
#define RelationGetRelid(relation)
Definition: rel.h:505
#define RelationGetDescr(relation)
Definition: rel.h:531
#define RelationGetRelationName(relation)
Definition: rel.h:539
#define RelationNeedsWAL(relation)
Definition: rel.h:628
void * build_reloptions(Datum reloptions, bool validate, relopt_kind kind, Size relopt_struct_size, const relopt_parse_elt *relopt_elems, int num_relopt_elems)
Definition: reloptions.c:1908
@ RELOPT_KIND_BRIN
Definition: reloptions.h:52
@ RELOPT_TYPE_INT
Definition: reloptions.h:32
@ RELOPT_TYPE_BOOL
Definition: reloptions.h:31
@ MAIN_FORKNUM
Definition: relpath.h:50
@ INIT_FORKNUM
Definition: relpath.h:53
int slock_t
Definition: s_lock.h:735
void brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition: selfuncs.c:7928
void shm_toc_insert(shm_toc *toc, uint64 key, void *address)
Definition: shm_toc.c:171
void * shm_toc_allocate(shm_toc *toc, Size nbytes)
Definition: shm_toc.c:88
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition: shm_toc.c:232
#define shm_toc_estimate_chunk(e, sz)
Definition: shm_toc.h:51
#define shm_toc_estimate_keys(e, cnt)
Definition: shm_toc.h:53
Size add_size(Size s1, Size s2)
Definition: shmem.c:493
Size mul_size(Size s1, Size s2)
Definition: shmem.c:510
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_SEARCHNULL
Definition: skey.h:121
#define SK_ISNULL
Definition: skey.h:115
static pg_noinline void Size size
Definition: slab.c:607
Snapshot GetTransactionSnapshot(void)
Definition: snapmgr.c:216
void UnregisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:836
Snapshot RegisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:794
#define SnapshotAny
Definition: snapmgr.h:33
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:62
#define SpinLockInit(lock)
Definition: spin.h:60
#define SpinLockRelease(lock)
Definition: spin.h:64
#define SpinLockAcquire(lock)
Definition: spin.h:62
void relation_close(Relation relation, LOCKMODE lockmode)
Definition: relation.c:205
PGPROC * MyProc
Definition: proc.c:66
BlockNumber bs_maxRangeStart
Definition: brin.c:160
Size bs_emptyTupleLen
Definition: brin.c:166
MemoryContext bs_context
Definition: brin.c:167
BrinMemTuple * bs_dtuple
Definition: brin.c:163
Relation bs_irel
Definition: brin.c:154
BlockNumber bs_pagesPerRange
Definition: brin.c:158
double bs_numtuples
Definition: brin.c:155
Buffer bs_currentInsertBuf
Definition: brin.c:157
BrinRevmap * bs_rmAccess
Definition: brin.c:161
Tuplesortstate * bs_sortstate
Definition: brin.c:182
BrinLeader * bs_leader
Definition: brin.c:174
int bs_worker_id
Definition: brin.c:175
BlockNumber bs_currRangeStart
Definition: brin.c:159
double bs_reltuples
Definition: brin.c:156
BrinDesc * bs_bdesc
Definition: brin.c:162
BrinTuple * bs_emptyTuple
Definition: brin.c:165
int bd_totalstored
Definition: brin_internal.h:59
TupleDesc bd_tupdesc
Definition: brin_internal.h:53
BrinOpcInfo * bd_info[FLEXIBLE_ARRAY_MEMBER]
Definition: brin_internal.h:62
Relation bd_index
Definition: brin_internal.h:50
MemoryContext bd_context
Definition: brin_internal.h:47
TupleDesc bd_disktdesc
Definition: brin_internal.h:56
BrinDesc * bis_desc
Definition: brin.c:192
BrinRevmap * bis_rmAccess
Definition: brin.c:191
BlockNumber bis_pages_per_range
Definition: brin.c:193
int nparticipanttuplesorts
Definition: brin.c:130
WalUsage * walusage
Definition: brin.c:144
BrinShared * brinshared
Definition: brin.c:141
BufferUsage * bufferusage
Definition: brin.c:145
Snapshot snapshot
Definition: brin.c:143
Sharedsort * sharedsort
Definition: brin.c:142
ParallelContext * pcxt
Definition: brin.c:122
BrinValues bt_columns[FLEXIBLE_ARRAY_MEMBER]
Definition: brin_tuple.h:55
BlockNumber bt_blkno
Definition: brin_tuple.h:48
bool bt_placeholder
Definition: brin_tuple.h:46
bool bt_empty_range
Definition: brin_tuple.h:47
BlockNumber lastRevmapPage
Definition: brin_page.h:69
BlockNumber pagesPerRange
Definition: brin_page.h:68
BlockNumber bo_pagesPerRange
Definition: brin.c:201
BrinDesc * bo_bdesc
Definition: brin.c:203
BrinRevmap * bo_rmAccess
Definition: brin.c:202
TypeCacheEntry * oi_typcache[FLEXIBLE_ARRAY_MEMBER]
Definition: brin_internal.h:37
uint16 oi_nstored
Definition: brin_internal.h:28
bool oi_regular_nulls
Definition: brin_internal.h:31
slock_t mutex
Definition: brin.c:84
int scantuplesortstates
Definition: brin.c:68
int nparticipantsdone
Definition: brin.c:96
Oid heaprelid
Definition: brin.c:64
BlockNumber pagesPerRange
Definition: brin.c:67
ConditionVariable workersdonecv
Definition: brin.c:76
Oid indexrelid
Definition: brin.c:65
bool isconcurrent
Definition: brin.c:66
double indtuples
Definition: brin.c:98
double reltuples
Definition: brin.c:97
BlockNumber revmapNumPages
Definition: brin.h:35
BlockNumber pagesPerRange
Definition: brin.h:34
BlockNumber bt_blkno
Definition: brin_tuple.h:66
bool bv_hasnulls
Definition: brin_tuple.h:32
Datum * bv_values
Definition: brin_tuple.h:34
AttrNumber bv_attno
Definition: brin_tuple.h:31
bool bv_allnulls
Definition: brin_tuple.h:33
Definition: fmgr.h:57
ambuildphasename_function ambuildphasename
Definition: amapi.h:276
ambuildempty_function ambuildempty
Definition: amapi.h:267
amvacuumcleanup_function amvacuumcleanup
Definition: amapi.h:271
bool amclusterable
Definition: amapi.h:241
amoptions_function amoptions
Definition: amapi.h:274
amestimateparallelscan_function amestimateparallelscan
Definition: amapi.h:288
amrestrpos_function amrestrpos
Definition: amapi.h:285
aminsert_function aminsert
Definition: amapi.h:268
amendscan_function amendscan
Definition: amapi.h:283
uint16 amoptsprocnum
Definition: amapi.h:221
amparallelrescan_function amparallelrescan
Definition: amapi.h:290
Oid amkeytype
Definition: amapi.h:257
bool ampredlocks
Definition: amapi.h:243
uint16 amsupport
Definition: amapi.h:219
amcostestimate_function amcostestimate
Definition: amapi.h:273
bool amcanorderbyop
Definition: amapi.h:225
amadjustmembers_function amadjustmembers
Definition: amapi.h:278
ambuild_function ambuild
Definition: amapi.h:266
bool amstorage
Definition: amapi.h:239
uint16 amstrategies
Definition: amapi.h:217
bool amoptionalkey
Definition: amapi.h:233
amgettuple_function amgettuple
Definition: amapi.h:281
amcanreturn_function amcanreturn
Definition: amapi.h:272
bool amcanunique
Definition: amapi.h:229
amgetbitmap_function amgetbitmap
Definition: amapi.h:282
amproperty_function amproperty
Definition: amapi.h:275
ambulkdelete_function ambulkdelete
Definition: amapi.h:270
bool amsearcharray
Definition: amapi.h:235
bool amsummarizing
Definition: amapi.h:253
amvalidate_function amvalidate
Definition: amapi.h:277
ammarkpos_function ammarkpos
Definition: amapi.h:284
bool amcanmulticol
Definition: amapi.h:231
bool amusemaintenanceworkmem
Definition: amapi.h:251
ambeginscan_function ambeginscan
Definition: amapi.h:279
bool amcanparallel
Definition: amapi.h:245
amrescan_function amrescan
Definition: amapi.h:280
bool amcanorder
Definition: amapi.h:223
bool amcanbuildparallel
Definition: amapi.h:247
aminitparallelscan_function aminitparallelscan
Definition: amapi.h:289
uint8 amparallelvacuumoptions
Definition: amapi.h:255
aminsertcleanup_function aminsertcleanup
Definition: amapi.h:269
bool amcanbackward
Definition: amapi.h:227
bool amcaninclude
Definition: amapi.h:249
bool amsearchnulls
Definition: amapi.h:237
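The am* members above are the IndexAmRoutine capability flags and callbacks that an access method handler such as brinhandler() fills in. A condensed, illustrative sketch of that pattern (hypothetical function name; the flag values shown are representative of BRIN rather than a verbatim copy of the handler):

Datum
example_brinhandler(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    amroutine->amstrategies = 0;          /* strategies are opclass-defined */
    amroutine->amcanmulticol = true;
    amroutine->amsearchnulls = true;
    amroutine->amstorage = true;          /* stored type may differ from column type */
    amroutine->amsummarizing = true;      /* one summary per range, not per tuple */
    amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_CLEANUP;
    amroutine->amkeytype = InvalidOid;

    amroutine->ambuild = brinbuild;
    amroutine->aminsert = brininsert;
    amroutine->ambulkdelete = brinbulkdelete;
    amroutine->amvacuumcleanup = brinvacuumcleanup;
    amroutine->amgetbitmap = bringetbitmap;
    amroutine->amgettuple = NULL;         /* BRIN supports bitmap scans only */

    PG_RETURN_POINTER(amroutine);
}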
double heap_tuples
Definition: genam.h:32
double index_tuples
Definition: genam.h:33
BlockNumber num_pages
Definition: genam.h:77
double num_index_tuples
Definition: genam.h:79
void * ii_AmCache
Definition: execnodes.h:207
int ii_ParallelWorkers
Definition: execnodes.h:205
bool ii_Concurrent
Definition: execnodes.h:202
MemoryContext ii_Context
Definition: execnodes.h:208
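ii_AmCache and ii_Context above are what allow an access method to keep per-command state across insert calls. A minimal sketch of that caching pattern in the BRIN insert path, assuming the hypothetical helper name get_insert_state (brin_build_desc and brinRevmapInitialize are existing support routines; the exact brinRevmapInitialize signature is assumed here):

static BrinInsertState *
get_insert_state(Relation idxRel, IndexInfo *indexInfo)
{
    BrinInsertState *bistate;
    MemoryContext    oldcxt;

    /* Reuse the state cached by an earlier insert in the same command. */
    if (indexInfo->ii_AmCache != NULL)
        return (BrinInsertState *) indexInfo->ii_AmCache;

    /* Allocate in ii_Context so the cache lives as long as the IndexInfo. */
    oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
    bistate = (BrinInsertState *) palloc0(sizeof(BrinInsertState));
    bistate->bis_desc = brin_build_desc(idxRel);
    bistate->bis_rmAccess = brinRevmapInitialize(idxRel,
                                                 &bistate->bis_pages_per_range);
    indexInfo->ii_AmCache = (void *) bistate;
    MemoryContextSwitchTo(oldcxt);

    return bistate;
}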
struct ScanKeyData * keyData
Definition: relscan.h:122
Relation indexRelation
Definition: relscan.h:118
Relation index
Definition: genam.h:46
bool analyze_only
Definition: genam.h:48
BufferAccessStrategy strategy
Definition: genam.h:53
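index, analyze_only and strategy above belong to IndexVacuumInfo, which together with IndexBulkDeleteResult (num_pages, num_index_tuples) forms the amvacuumcleanup contract. A minimal sketch of that contract in the BRIN spirit (hypothetical function name, not the literal brinvacuumcleanup body):

IndexBulkDeleteResult *
example_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
    /* Nothing to do for an ANALYZE-only call. */
    if (info->analyze_only)
        return stats;

    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

    /* Report the physical index size; BRIN stores one summary per block range. */
    stats->num_pages = RelationGetNumberOfBlocks(info->index);

    return stats;
}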
uint8 statusFlags
Definition: proc.h:238
dsm_segment * seg
Definition: parallel.h:42
shm_toc_estimator estimator
Definition: parallel.h:41
shm_toc * toc
Definition: parallel.h:44
int nworkers_launched
Definition: parallel.h:37
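seg, estimator, toc and nworkers_launched above belong to ParallelContext, which the leader uses to size and fill the DSM segment before launching workers. A condensed sketch of that setup using this file's PARALLEL_KEY_BRIN_SHARED key (hypothetical helper; the other TOC keys, field initialization and error handling are omitted):

static void
example_setup_dsm(ParallelContext *pcxt, Size estbrinshared)
{
    BrinShared *brinshared;

    /* Reserve space and a TOC slot before the DSM segment is created. */
    shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared);
    shm_toc_estimate_keys(&pcxt->estimator, 1);

    InitializeParallelDSM(pcxt);

    /* Allocate the shared build state and publish it under a well-known key. */
    brinshared = (BrinShared *) shm_toc_allocate(pcxt->toc, estbrinshared);
    /* ... initialize brinshared fields, spinlock, condition variable, scan ... */
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_BRIN_SHARED, brinshared);

    LaunchParallelWorkers(pcxt);
}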
Form_pg_index rd_index
Definition: rel.h:192
Oid * rd_indcollation
Definition: rel.h:217
Form_pg_class rd_rel
Definition: rel.h:111
Oid sk_collation
Definition: skey.h:70
Sharedsort * sharedsort
Definition: tuplesort.h:58
int tdrefcount
Definition: tupdesc.h:84
bool typbyval
Definition: typcache.h:40
int16 typlen
Definition: typcache.h:39
Definition: type.h:95
Definition: regguts.h:323
Definition: c.h:674
BlockNumber pagesPerRange
Definition: brin_xlog.h:52
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:40
TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
Definition: tableam.c:165
Size table_parallelscan_estimate(Relation rel, Snapshot snapshot)
Definition: tableam.c:130
void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, Snapshot snapshot)
Definition: tableam.c:145
static double table_index_build_range_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool anyvisible, bool progress, BlockNumber start_blockno, BlockNumber numblocks, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1800
static double table_index_build_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1767
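table_parallelscan_initialize(), table_beginscan_parallel() and table_index_build_scan() above are the table-AM entry points that let a parallel index build share one heap scan among workers. A minimal sketch of the worker side, assuming a per-range callback such as brinbuildCallbackParallel and reusing the ParallelTableScanFromBrinShared() macro defined earlier in this file:

static double
example_worker_scan(Relation heap, Relation index, IndexInfo *indexInfo,
                    BrinShared *brinshared, void *buildstate)
{
    TableScanDesc scan;

    /* Attach to the parallel scan the leader initialized in shared memory. */
    scan = table_beginscan_parallel(heap,
                                    ParallelTableScanFromBrinShared(brinshared));

    /* allow_sync and progress are true; the callback summarizes each range. */
    return table_index_build_scan(heap, index, indexInfo, true, true,
                                  brinbuildCallbackParallel, buildstate, scan);
}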
void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
Definition: tidbitmap.c:443
#define TupleDescAttr(tupdesc, i)
Definition: tupdesc.h:92
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:1379
void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
Definition: tuplesort.c:2970
Size tuplesort_estimate_shared(int nWorkers)
Definition: tuplesort.c:2949
void tuplesort_end(Tuplesortstate *state)
Definition: tuplesort.c:966
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition: tuplesort.c:2993
struct SortCoordinateData * SortCoordinate
Definition: tuplesort.h:61
#define TUPLESORT_NONE
Definition: tuplesort.h:93
Tuplesortstate * tuplesort_begin_index_brin(int workMem, SortCoordinate coordinate, int sortopt)
void tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size)
BrinTuple * tuplesort_getbrintuple(Tuplesortstate *state, Size *len, bool forward)
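tuplesort_begin_index_brin(), tuplesort_putbrintuple(), tuplesort_performsort() and tuplesort_getbrintuple() above make up the sort cycle used to merge spilled range summaries in block-number order. A minimal sketch of that cycle (hypothetical helper name; the put step is elided):

static void
example_sort_cycle(int workMem, SortCoordinate coordinate)
{
    Tuplesortstate *sortstate;
    BrinTuple  *btup;
    Size        tuplen;

    sortstate = tuplesort_begin_index_brin(workMem, coordinate, TUPLESORT_NONE);

    /* ... tuplesort_putbrintuple(sortstate, tuple, size) for each spilled tuple ... */

    tuplesort_performsort(sortstate);

    /* Read tuples back sorted by block number and merge or insert them. */
    while ((btup = tuplesort_getbrintuple(sortstate, &tuplen, true)) != NULL)
    {
        /* consume btup here */
    }

    tuplesort_end(sortstate);
}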
#define VACUUM_OPTION_PARALLEL_CLEANUP
Definition: vacuum.h:62
void ExitParallelMode(void)
Definition: xact.c:1050
void EnterParallelMode(void)
Definition: xact.c:1037
bool RecoveryInProgress(void)
Definition: xlog.c:6201
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void XLogRegisterData(char *data, uint32 len)
Definition: xloginsert.c:364
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:474
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1237
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:242
void XLogBeginInsert(void)
Definition: xloginsert.c:149
#define REGBUF_STANDARD
Definition: xloginsert.h:34
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33
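XLogBeginInsert(), XLogRegisterData(), XLogRegisterBuffer() and XLogInsert() above follow the standard WAL-record recipe, with REGBUF_WILL_INIT and REGBUF_STANDARD describing how the registered buffer should be treated. A minimal sketch of that recipe for a BRIN create-index style record (illustrative only, not the literal brin.c code):

static void
example_log_new_metapage(Buffer metabuf, BlockNumber pagesPerRange)
{
    xl_brin_createidx xlrec;
    XLogRecPtr  recptr;

    xlrec.pagesPerRange = pagesPerRange;
    xlrec.version = BRIN_CURRENT_VERSION;

    XLogBeginInsert();
    XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx);
    XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

    recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);
    PageSetLSN(BufferGetPage(metabuf), recptr);
}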