gininsert.c
1/*-------------------------------------------------------------------------
2 *
3 * gininsert.c
4 * insert routines for the postgres inverted index access method.
5 *
6 *
7 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/access/gin/gininsert.c
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include "access/gin_private.h"
18#include "access/gin_tuple.h"
19#include "access/parallel.h"
20#include "access/table.h"
21#include "access/tableam.h"
22#include "access/xloginsert.h"
23#include "catalog/index.h"
24#include "catalog/pg_collation.h"
25#include "commands/progress.h"
26#include "miscadmin.h"
27#include "nodes/execnodes.h"
28#include "pgstat.h"
29#include "storage/bufmgr.h"
30#include "storage/predicate.h"
31#include "tcop/tcopprot.h"
32#include "utils/builtins.h"
33#include "utils/datum.h"
34#include "utils/memutils.h"
35#include "utils/rel.h"
36#include "utils/typcache.h"
37
38
39/* Magic numbers for parallel state sharing */
40#define PARALLEL_KEY_GIN_SHARED UINT64CONST(0xB000000000000001)
41#define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
42#define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
43#define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
44#define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
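/*
 * These keys identify the pieces of shared state in the DSM segment: the
 * leader stores them with shm_toc_insert() in _gin_begin_parallel(), and
 * workers retrieve them with shm_toc_lookup() in _gin_parallel_build_main().
 */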
45
46/*
47 * Status for index builds performed in parallel. This is allocated in a
48 * dynamic shared memory segment.
49 */
50typedef struct GinBuildShared
51{
52 /*
53 * These fields are not modified during the build. They primarily exist
54 * for the benefit of worker processes that need to create state
55 * corresponding to that used by the leader.
56 */
 57 Oid heaprelid;
 58 Oid indexrelid;
 59 bool isconcurrent;
 60 int scantuplesortstates;
 61
62 /*
63 * workersdonecv is used to monitor the progress of workers. All parallel
64 * participants must indicate that they are done before leader can use
65 * results built by the workers (and before leader can write the data into
66 * the index).
67 */
 68 ConditionVariable workersdonecv;
 69
70 /*
71 * mutex protects all following fields
72 *
73 * These fields contain status information of interest to GIN index builds
74 * that must work just the same when an index is built in parallel.
75 */
 76 slock_t mutex;
 77
78 /*
79 * Mutable state that is maintained by workers, and reported back to
80 * leader at end of the scans.
81 *
82 * nparticipantsdone is number of worker processes finished.
83 *
84 * reltuples is the total number of input heap tuples.
85 *
86 * indtuples is the total number of tuples that made it into the index.
87 */
 88 int nparticipantsdone;
 89 double reltuples;
90 double indtuples;
91
92 /*
93 * ParallelTableScanDescData data follows. Can't directly embed here, as
94 * implementations of the parallel table scan desc interface might need
95 * stronger alignment.
96 */
97} GinBuildShared;
 98
99/*
100 * Return pointer to a GinBuildShared's parallel table scan.
101 *
102 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
103 * MAXALIGN.
104 */
105#define ParallelTableScanFromGinBuildShared(shared) \
106 (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(GinBuildShared)))
107
108/*
109 * Status for leader in parallel index build.
110 */
111typedef struct GinLeader
112{
113 /* parallel context itself */
 114 ParallelContext *pcxt;
 115
116 /*
117 * nparticipanttuplesorts is the exact number of worker processes
118 * successfully launched, plus one leader process if it participates as a
119 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
120 * participating as a worker).
121 */
 122 int nparticipanttuplesorts;
 123
124 /*
125 * Leader process convenience pointers to shared state (leader avoids TOC
126 * lookups).
127 *
128 * GinBuildShared is the shared state for entire build. sharedsort is the
129 * shared, tuplesort-managed state passed to each process tuplesort.
130 * snapshot is the snapshot used by the scan iff an MVCC snapshot is
131 * required.
132 */
 133 GinBuildShared *ginshared;
 134 Sharedsort *sharedsort;
 135 Snapshot snapshot;
 136 WalUsage *walusage;
 137 BufferUsage *bufferusage;
138} GinLeader;
 139
140typedef struct
141{
143 double indtuples;
150
151 /*
152 * bs_leader is only present when a parallel index build is performed, and
153 * only in the leader process.
154 */
156
157 /* number of participating workers (including leader) */
159
160 /* used to pass information from workers to leader */
163
164 /*
165 * The sortstate is used by workers (including the leader). It has to be
166 * part of the build state, because that's the only thing passed to the
167 * build callback etc.
168 */
170
171 /*
172 * The sortstate used only within a single worker for the first merge pass
173 * happening there. In principle it doesn't need to be part of the build
174 * state and we could pass it around directly, but it's more convenient
175 * this way. And it's part of the build state, after all.
176 */
179
180
181/* parallel index builds */
183 bool isconcurrent, int request);
189 Relation heap, Relation index);
191 GinBuildShared *ginshared,
192 Sharedsort *sharedsort,
193 Relation heap, Relation index,
194 int sortmem, bool progress);
195
198
199static GinTuple *_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
200 Datum key, int16 typlen, bool typbyval,
202 Size *len);
203
204/*
205 * Adds array of item pointers to tuple's posting list, or
206 * creates posting tree and tuple pointing to tree in case
207 * of not enough space. Max size of tuple is defined in
208 * GinFormTuple(). Returns a new, modified index tuple.
209 * items[] must be in sorted order with no duplicates.
210 */
211static IndexTuple
215 GinStatsData *buildStats, Buffer buffer)
216{
218 Datum key;
219 GinNullCategory category;
220 IndexTuple res;
222 *oldItems;
223 int oldNPosting,
225 nwritten;
227
229
230 attnum = gintuple_get_attrnum(ginstate, old);
231 key = gintuple_get_key(ginstate, old, &category);
232
233 /* merge the old and new posting lists */
235
238 &newNPosting);
239
 240 /* Compress the posting list, and try to build a tuple with room for it */
241 res = NULL;
243 if (nwritten == newNPosting)
244 {
245 res = GinFormTuple(ginstate, attnum, key, category,
246 (char *) compressedList,
249 false);
250 }
251
254
255 if (!res)
256 {
257 /* posting list would be too big, convert to posting tree */
259
260 /*
261 * Initialize posting tree with the old tuple's posting list. It's
262 * surely small enough to fit on one posting-tree page, and should
263 * already be in order with no duplicates.
264 */
266 oldItems,
268 buildStats,
269 buffer);
270
271 /* Now insert the TIDs-to-be-added into the posting tree */
273 items, nitem,
274 buildStats);
275
276 /* And build a new posting-tree-only result tuple */
277 res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
279 }
281
282 return res;
283}
284
285/*
286 * Build a fresh leaf tuple, either posting-list or posting-tree format
287 * depending on whether the given items list will fit.
288 * items[] must be in sorted order with no duplicates.
289 *
290 * This is basically the same logic as in addItemPointersToLeafTuple,
291 * but working from slightly different input.
292 */
293static IndexTuple
297 GinStatsData *buildStats, Buffer buffer)
298{
299 IndexTuple res = NULL;
301 int nwritten;
302
303 /* try to build a posting list tuple with all the items */
305 if (nwritten == nitem)
306 {
307 res = GinFormTuple(ginstate, attnum, key, category,
308 (char *) compressedList,
310 nitem, false);
311 }
313
314 if (!res)
315 {
316 /* posting list would be too big, build posting tree */
318
319 /*
320 * Build posting-tree-only result tuple. We do this first so as to
321 * fail quickly if the key is too big.
322 */
323 res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
324
325 /*
326 * Initialize a new posting tree with the TIDs.
327 */
328 postingRoot = createPostingTree(ginstate->index, items, nitem,
329 buildStats, buffer);
330
331 /* And save the root link in the result tuple */
333 }
334
335 return res;
336}
337
338/*
339 * Insert one or more heap TIDs associated with the given key value.
340 * This will either add a single key entry, or enlarge a pre-existing entry.
341 *
342 * During an index build, buildStats is non-null and the counters
343 * it contains should be incremented as needed.
344 */
345void
349 GinStatsData *buildStats)
350{
351 GinBtreeData btree;
353 GinBtreeStack *stack;
354 IndexTuple itup;
355 Page page;
356
357 insertdata.isDelete = false;
358
359 ginPrepareEntryScan(&btree, attnum, key, category, ginstate);
360 btree.isBuild = (buildStats != NULL);
361
362 stack = ginFindLeafPage(&btree, false, false);
363 page = BufferGetPage(stack->buffer);
364
365 if (btree.findItem(&btree, stack))
366 {
367 /* found pre-existing entry */
368 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
369
370 if (GinIsPostingTree(itup))
371 {
372 /* add entries to existing posting tree */
374
375 /* release all stack */
377 freeGinBtreeStack(stack);
378
379 /* insert into posting tree */
381 items, nitem,
382 buildStats);
383 return;
384 }
385
388 /* modify an existing leaf entry */
389 itup = addItemPointersToLeafTuple(ginstate, itup,
390 items, nitem, buildStats, stack->buffer);
391
392 insertdata.isDelete = true;
393 }
394 else
395 {
398 /* no match, so construct a new leaf entry */
399 itup = buildFreshLeafTuple(ginstate, attnum, key, category,
400 items, nitem, buildStats, stack->buffer);
401
402 /*
403 * nEntries counts leaf tuples, so increment it only when we make a
404 * new one.
405 */
406 if (buildStats)
407 buildStats->nEntries++;
408 }
409
410 /* Insert the new or modified leaf tuple */
411 insertdata.entry = itup;
412 ginInsertValue(&btree, stack, &insertdata, buildStats);
413 pfree(itup);
414}
415
416/*
417 * Extract index entries for a single indexable item, and add them to the
418 * BuildAccumulator's state.
419 *
420 * This function is used only during initial index creation.
421 */
422static void
424 Datum value, bool isNull,
426{
427 Datum *entries;
428 GinNullCategory *categories;
429 int32 nentries;
431
433 entries = ginExtractEntries(buildstate->accum.ginstate, attnum,
434 value, isNull,
435 &nentries, &categories);
437
439 entries, categories, nentries);
440
441 buildstate->indtuples += nentries;
442
444}
445
446static void
448 bool *isnull, bool tupleIsAlive, void *state)
449{
452 int i;
453
455
456 for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
458 values[i], isnull[i], tid);
459
460 /* If we've maxed out our available memory, dump everything to the index */
461 if (buildstate->accum.allocatedMemory >= maintenance_work_mem * (Size) 1024)
462 {
463 ItemPointerData *list;
464 Datum key;
465 GinNullCategory category;
466 uint32 nlist;
468
469 ginBeginBAScan(&buildstate->accum);
470 while ((list = ginGetBAEntry(&buildstate->accum,
471 &attnum, &key, &category, &nlist)) != NULL)
472 {
473 /* there could be many entries, so be willing to abort here */
475 ginEntryInsert(&buildstate->ginstate, attnum, key, category,
476 list, nlist, &buildstate->buildStats);
477 }
478
480 ginInitBA(&buildstate->accum);
481 }
482
484}
485
486/*
487 * ginFlushBuildState
488 * Write all data from BuildAccumulator into the tuplesort.
489 *
490 * The number of TIDs written to the tuplesort at once is limited, to reduce
491 * the amount of memory needed when merging the intermediate results later.
492 * The leader will see up to two chunks per worker, so calculate the limit to
493 * not need more than MaxAllocSize overall.
494 *
495 * We don't need to worry about overflowing maintenance_work_mem. We can't
496 * build chunks larger than work_mem, and that limit was set so that workers
497 * produce sufficiently small chunks.
498 */
499static void
500ginFlushBuildState(GinBuildState *buildstate, Relation index)
501{
502 ItemPointerData *list;
503 Datum key;
504 GinNullCategory category;
505 uint32 nlist;
508 uint32 maxlen;
509
510 /* maximum number of TIDs per chunk (two chunks per worker) */
511 maxlen = MaxAllocSize / sizeof(ItemPointerData);
512 maxlen /= (2 * buildstate->bs_num_workers);
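/*
 * For example, with MaxAllocSize of about 1GB and 6-byte ItemPointerData
 * entries, the starting value is roughly 178 million TIDs, which drops to
 * roughly 22 million TIDs per chunk with 4 participants.
 */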
513
514 ginBeginBAScan(&buildstate->accum);
515 while ((list = ginGetBAEntry(&buildstate->accum,
516 &attnum, &key, &category, &nlist)) != NULL)
517 {
518 /* information about the key */
520
521 /* start of the chunk */
522 uint32 offset = 0;
523
 524 /* split the entry into smaller chunks with up to maxlen items */
525 while (offset < nlist)
526 {
527 /* GIN tuple and tuple length */
528 GinTuple *tup;
529 Size tuplen;
530 uint32 len = Min(maxlen, nlist - offset);
531
532 /* there could be many entries, so be willing to abort here */
534
535 tup = _gin_build_tuple(attnum, category,
536 key, attr->attlen, attr->attbyval,
537 &list[offset], len,
538 &tuplen);
539
540 offset += len;
541
542 tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
543
544 pfree(tup);
545 }
546 }
547
549 ginInitBA(&buildstate->accum);
550}
551
552/*
553 * ginBuildCallbackParallel
554 * Callback for the parallel index build.
555 *
556 * This is similar to the serial build callback ginBuildCallback, but
557 * instead of writing the accumulated entries into the index, each worker
558 * writes them into a (local) tuplesort.
559 *
560 * The worker then sorts and combines these entries, before writing them
561 * into a shared tuplesort for the leader (see _gin_parallel_scan_and_build
562 * for the whole process).
563 */
564static void
566 bool *isnull, bool tupleIsAlive, void *state)
567{
570 int i;
571
573
574 /*
575 * if scan wrapped around - flush accumulated entries and start anew
576 *
577 * With parallel scans, we don't have a guarantee the scan does not start
578 * half-way through the relation (serial builds disable sync scans and
579 * always start from block 0, parallel scans require allow_sync=true).
580 *
581 * Building the posting lists assumes the TIDs are monotonic and never go
582 * back, and the wrap around would break that. We handle that by detecting
583 * the wraparound, and flushing all entries. This means we'll later see
584 * two separate entries with non-overlapping TID lists (which can be
585 * combined by merge sort).
586 *
587 * To detect a wraparound, we remember the last TID seen by each worker
588 * (for any key). If the next TID seen by the worker is lower, the scan
589 * must have wrapped around.
590 */
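/*
 * For example, a parallel scan that started at block 1000 will eventually
 * wrap and produce a TID like (0,1) right after (1000,5); flushing at that
 * point keeps every TID list written to the tuplesort monotonically
 * increasing.
 */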
 591 if (ItemPointerCompare(tid, &buildstate->tid) < 0)
 592 ginFlushBuildState(buildstate, index);
 593
594 /* remember the TID we're about to process */
595 buildstate->tid = *tid;
596
597 for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
599 values[i], isnull[i], tid);
600
601 /*
602 * If we've maxed out our available memory, dump everything to the
603 * tuplesort. We use half the per-worker fraction of maintenance_work_mem,
604 * the other half is used for the tuplesort.
605 */
 606 if (buildstate->accum.allocatedMemory >= buildstate->work_mem * (Size) 1024)
 607 ginFlushBuildState(buildstate, index);
 608
610}
611
614{
615 IndexBuildResult *result;
616 double reltuples;
621 ItemPointerData *list;
622 Datum key;
623 GinNullCategory category;
624 uint32 nlist;
627
629 elog(ERROR, "index \"%s\" already contains data",
631
632 initGinState(&buildstate.ginstate, index);
633 buildstate.indtuples = 0;
634 memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
635
636 /* Initialize fields for parallel build too. */
637 buildstate.bs_numtuples = 0;
638 buildstate.bs_reltuples = 0;
639 buildstate.bs_leader = NULL;
640 memset(&buildstate.tid, 0, sizeof(ItemPointerData));
641
642 /* initialize the meta page */
644
645 /* initialize the root page */
647
653
654
658
659 /* count the root as first entry page */
660 buildstate.buildStats.nEntryPages++;
661
662 /*
663 * create a temporary memory context that is used to hold data not yet
664 * dumped out to the index
665 */
667 "Gin build temporary context",
669
670 /*
671 * create a temporary memory context that is used for calling
672 * ginExtractEntries(), and can be reset after each tuple
673 */
675 "Gin build temporary context for user-defined function",
677
678 buildstate.accum.ginstate = &buildstate.ginstate;
679 ginInitBA(&buildstate.accum);
680
681 /* Report table scan phase started */
684
685 /*
686 * Attempt to launch parallel worker scan when required
687 *
688 * XXX plan_create_index_workers makes the number of workers dependent on
689 * maintenance_work_mem, requiring 32MB for each worker. For GIN that's
690 * reasonable too, because we sort the data just like btree. It does
691 * ignore the memory used to accumulate data in memory (set by work_mem),
692 * but there is no way to communicate that to plan_create_index_workers.
693 */
694 if (indexInfo->ii_ParallelWorkers > 0)
695 _gin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
696 indexInfo->ii_ParallelWorkers);
697
698 /*
699 * If parallel build requested and at least one worker process was
700 * successfully launched, set up coordination state, wait for workers to
701 * complete. Then read all tuples from the shared tuplesort and insert
702 * them into the index.
703 *
704 * In serial mode, simply scan the table and build the index one index
705 * tuple at a time.
706 */
707 if (state->bs_leader)
708 {
710
712 coordinate->isWorker = false;
713 coordinate->nParticipants =
714 state->bs_leader->nparticipanttuplesorts;
715 coordinate->sharedsort = state->bs_leader->sharedsort;
716
717 /*
718 * Begin leader tuplesort.
719 *
720 * In cases where parallelism is involved, the leader receives the
721 * same share of maintenance_work_mem as a serial sort (it is
722 * generally treated in the same way as a serial sort once we return).
723 * Parallel worker Tuplesortstates will have received only a fraction
724 * of maintenance_work_mem, though.
725 *
726 * We rely on the lifetime of the Leader Tuplesortstate almost not
727 * overlapping with any worker Tuplesortstate's lifetime. There may
728 * be some small overlap, but that's okay because we rely on leader
729 * Tuplesortstate only allocating a small, fixed amount of memory
730 * here. When its tuplesort_performsort() is called (by our caller),
731 * and significant amounts of memory are likely to be used, all
732 * workers must have already freed almost all memory held by their
733 * Tuplesortstates (they are about to go away completely, too). The
734 * overall effect is that maintenance_work_mem always represents an
735 * absolute high watermark on the amount of memory used by a CREATE
736 * INDEX operation, regardless of the use of parallelism or any other
737 * factor.
738 */
739 state->bs_sortstate =
743
744 /* scan the relation in parallel and merge per-worker results */
745 reltuples = _gin_parallel_merge(state);
746
747 _gin_end_parallel(state->bs_leader, state);
748 }
749 else /* no parallel index build */
750 {
751 /*
752 * Do the heap scan. We disallow sync scan here because
753 * dataPlaceToPage prefers to receive tuples in TID order.
754 */
755 reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
757
758 /* dump remaining entries to the index */
761 while ((list = ginGetBAEntry(&buildstate.accum,
762 &attnum, &key, &category, &nlist)) != NULL)
763 {
764 /* there could be many entries, so be willing to abort here */
766 ginEntryInsert(&buildstate.ginstate, attnum, key, category,
767 list, nlist, &buildstate.buildStats);
768 }
770 }
771
774
775 /*
776 * Update metapage stats
777 */
778 buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
779 ginUpdateStats(index, &buildstate.buildStats, true);
780
781 /*
782 * We didn't write WAL records as we built the index, so if WAL-logging is
783 * required, write all pages to the WAL now.
784 */
786 {
789 true);
790 }
791
792 /*
793 * Return statistics
794 */
796
797 result->heap_tuples = reltuples;
798 result->index_tuples = buildstate.indtuples;
799
800 return result;
801}
802
803/*
804 * ginbuildempty() -- build an empty gin index in the initialization fork
805 */
806void
808{
811
812 /* An empty GIN index has two pages. */
817
818 /* Initialize and xlog metabuffer and root buffer. */
827
828 /* Unlock and release the buffers. */
831}
832
833/*
834 * Insert index entries for a single indexable item during "normal"
835 * (non-fast-update) insertion
836 */
837static void
839 Datum value, bool isNull,
840 ItemPointer item)
841{
842 Datum *entries;
843 GinNullCategory *categories;
844 int32 i,
845 nentries;
846
847 entries = ginExtractEntries(ginstate, attnum, value, isNull,
848 &nentries, &categories);
849
850 for (i = 0; i < nentries; i++)
851 ginEntryInsert(ginstate, attnum, entries[i], categories[i],
852 item, 1, NULL);
853}
854
855bool
859 bool indexUnchanged,
860 IndexInfo *indexInfo)
861{
862 GinState *ginstate = (GinState *) indexInfo->ii_AmCache;
865 int i;
866
867 /* Initialize GinState cache if first call in this statement */
868 if (ginstate == NULL)
869 {
871 ginstate = palloc_object(GinState);
872 initGinState(ginstate, index);
873 indexInfo->ii_AmCache = ginstate;
875 }
876
878 "Gin insert temporary context",
880
882
884 {
886
887 memset(&collector, 0, sizeof(GinTupleCollector));
888
889 for (i = 0; i < ginstate->origTupdesc->natts; i++)
891 (OffsetNumber) (i + 1),
892 values[i], isnull[i],
893 ht_ctid);
894
896 }
897 else
898 {
899 for (i = 0; i < ginstate->origTupdesc->natts; i++)
900 ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1),
901 values[i], isnull[i],
902 ht_ctid);
903 }
904
907
908 return false;
909}
910
911/*
912 * Create parallel context, and launch workers for leader.
913 *
914 * buildstate argument should be initialized (with the exception of the
915 * tuplesort states, which may later be created based on shared
916 * state initially set up here).
917 *
918 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
919 *
920 * request is the target number of parallel worker processes to launch.
921 *
922 * Sets buildstate's GinLeader, which caller must use to shut down parallel
923 * mode by passing it to _gin_end_parallel() at the very end of its index
924 * build. If not even a single worker process can be launched, this is
925 * never set, and caller should proceed with a serial index build.
926 */
927static void
929 bool isconcurrent, int request)
930{
931 ParallelContext *pcxt;
932 int scantuplesortstates;
933 Snapshot snapshot;
936 GinBuildShared *ginshared;
937 Sharedsort *sharedsort;
939 WalUsage *walusage;
940 BufferUsage *bufferusage;
941 bool leaderparticipates = true;
942 int querylen;
943
944#ifdef DISABLE_LEADER_PARTICIPATION
945 leaderparticipates = false;
946#endif
947
948 /*
949 * Enter parallel mode, and create context for parallel build of gin index
950 */
952 Assert(request > 0);
953 pcxt = CreateParallelContext("postgres", "_gin_parallel_build_main",
954 request);
955
956 scantuplesortstates = leaderparticipates ? request + 1 : request;
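/* e.g. a request for 2 workers normally yields 3 participating tuplesorts */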
957
958 /*
959 * Prepare for scan of the base relation. In a normal index build, we use
960 * SnapshotAny because we must retrieve all tuples and do our own time
961 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
962 * concurrent build, we take a regular MVCC snapshot and index whatever's
963 * live according to that.
964 */
965 if (!isconcurrent)
966 snapshot = SnapshotAny;
967 else
969
970 /*
971 * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace.
972 */
975 estsort = tuplesort_estimate_shared(scantuplesortstates);
977
979
980 /*
981 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
982 * and PARALLEL_KEY_BUFFER_USAGE.
983 *
984 * If there are no extensions loaded that care, we could skip this. We
985 * have no way of knowing whether anyone's looking at pgWalUsage or
986 * pgBufferUsage, so do it unconditionally.
987 */
989 mul_size(sizeof(WalUsage), pcxt->nworkers));
992 mul_size(sizeof(BufferUsage), pcxt->nworkers));
994
995 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
997 {
1001 }
1002 else
1003 querylen = 0; /* keep compiler quiet */
1004
1005 /* Everyone's had a chance to ask for space, so now create the DSM */
1007
1008 /* If no DSM segment was available, back out (do serial build) */
1009 if (pcxt->seg == NULL)
1010 {
1011 if (IsMVCCSnapshot(snapshot))
1012 UnregisterSnapshot(snapshot);
1015 return;
1016 }
1017
1018 /* Store shared build state, for which we reserved space */
1019 ginshared = (GinBuildShared *) shm_toc_allocate(pcxt->toc, estginshared);
1020 /* Initialize immutable state */
1021 ginshared->heaprelid = RelationGetRelid(heap);
1022 ginshared->indexrelid = RelationGetRelid(index);
1023 ginshared->isconcurrent = isconcurrent;
1024 ginshared->scantuplesortstates = scantuplesortstates;
1025
1027 SpinLockInit(&ginshared->mutex);
1028
1029 /* Initialize mutable state */
1030 ginshared->nparticipantsdone = 0;
1031 ginshared->reltuples = 0.0;
1032 ginshared->indtuples = 0.0;
1033
1036 snapshot);
1037
1038 /*
1039 * Store shared tuplesort-private state, for which we reserved space.
1040 * Then, initialize opaque state using tuplesort routine.
1041 */
1042 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1043 tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1044 pcxt->seg);
1045
1046 shm_toc_insert(pcxt->toc, PARALLEL_KEY_GIN_SHARED, ginshared);
1047 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1048
1049 /* Store query string for workers */
1051 {
1052 char *sharedquery;
1053
1054 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1057 }
1058
1059 /*
1060 * Allocate space for each worker's WalUsage and BufferUsage; no need to
1061 * initialize.
1062 */
1063 walusage = shm_toc_allocate(pcxt->toc,
1064 mul_size(sizeof(WalUsage), pcxt->nworkers));
1065 shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1066 bufferusage = shm_toc_allocate(pcxt->toc,
1067 mul_size(sizeof(BufferUsage), pcxt->nworkers));
1068 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1069
1070 /* Launch workers, saving status for leader/caller */
1072 ginleader->pcxt = pcxt;
1073 ginleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1075 ginleader->nparticipanttuplesorts++;
1076 ginleader->ginshared = ginshared;
1077 ginleader->sharedsort = sharedsort;
1078 ginleader->snapshot = snapshot;
1079 ginleader->walusage = walusage;
1080 ginleader->bufferusage = bufferusage;
1081
1082 /* If no workers were successfully launched, back out (do serial build) */
1083 if (pcxt->nworkers_launched == 0)
1084 {
1086 return;
1087 }
1088
1089 /* Save leader state now that it's clear build will be parallel */
1090 buildstate->bs_leader = ginleader;
1091
1092 /* Join heap scan ourselves */
1095
1096 /*
1097 * Caller needs to wait for all launched workers when we return. Make
1098 * sure that the failure-to-start case will not hang forever.
1099 */
1101}
1102
1103/*
1104 * Shut down workers, destroy parallel context, and end parallel mode.
1105 */
1106static void
1108{
1109 int i;
1110
1111 /* Shutdown worker processes */
1113
1114 /*
1115 * Next, accumulate WAL usage. (This must wait for the workers to finish,
1116 * or we might get incomplete data.)
1117 */
1118 for (i = 0; i < ginleader->pcxt->nworkers_launched; i++)
1119 InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]);
1120
1121 /* Free last reference to MVCC snapshot, if one was used */
1122 if (IsMVCCSnapshot(ginleader->snapshot))
1123 UnregisterSnapshot(ginleader->snapshot);
1126}
1127
1128/*
1129 * Within leader, wait for end of heap scan.
1130 *
1131 * When called, parallel heap scan started by _gin_begin_parallel() will
1132 * already be underway within worker processes (when leader participates
1133 * as a worker, we should end up here just as workers are finishing).
1134 *
1135 * Returns the total number of heap tuples scanned.
1136 */
1137static double
1139{
1140 GinBuildShared *ginshared = state->bs_leader->ginshared;
1141 int nparticipanttuplesorts;
1142
1143 nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
1144 for (;;)
1145 {
1146 SpinLockAcquire(&ginshared->mutex);
1147 if (ginshared->nparticipantsdone == nparticipanttuplesorts)
1148 {
1149 /* copy the data into leader state */
1150 state->bs_reltuples = ginshared->reltuples;
1151 state->bs_numtuples = ginshared->indtuples;
1152
1153 SpinLockRelease(&ginshared->mutex);
1154 break;
1155 }
1156 SpinLockRelease(&ginshared->mutex);
1157
1160 }
1161
1163
1164 return state->bs_reltuples;
1165}
1166
1167/*
1168 * Buffer used to accumulate TIDs from multiple GinTuples for the same key
1169 * (we read these from the tuplesort, sorted by the key).
1170 *
1171 * This is similar to BuildAccumulator in that it's used to collect TIDs
1172 * in memory before inserting them into the index, but it's much simpler
1173 * as it only deals with a single index key at a time.
1174 *
1175 * When adding TIDs to the buffer, we make sure to keep them sorted, both
1176 * during the initial table scan (and detecting when the scan wraps around),
1177 * and during merging (where we do mergesort).
1178 */
1179typedef struct GinBuffer
1180{
1183 Datum key; /* 0 if no key (and keylen == 0) */
1184 Size keylen; /* number of bytes (not typlen) */
1185
1186 /* type info */
1189
1190 /* Number of TIDs to collect before attempt to write some out. */
1192
1193 /* array of TID values */
1196 SortSupport ssup; /* for sorting/comparing keys */
1199
1200/*
1201 * Check that TID array contains valid values, and that it's sorted (if we
1202 * expect it to be).
1203 */
1204static void
1206{
1207#ifdef USE_ASSERT_CHECKING
1208 /* we should not have a buffer with no TIDs to sort */
1209 Assert(buffer->items != NULL);
1210 Assert(buffer->nitems > 0);
1211
1212 for (int i = 0; i < buffer->nitems; i++)
1213 {
1214 Assert(ItemPointerIsValid(&buffer->items[i]));
1215
1216 /* don't check ordering for the first TID item */
1217 if (i == 0)
1218 continue;
1219
1220 Assert(ItemPointerCompare(&buffer->items[i - 1], &buffer->items[i]) < 0);
1221 }
1222#endif
1223}
1224
1225/*
1226 * GinBuffer checks
1227 *
1228 * Make sure the nitems/items fields are consistent (either the array is empty
1229 * or not empty, the fields need to agree). If there are items, check ordering.
1230 */
1231static void
1233{
1234#ifdef USE_ASSERT_CHECKING
1235 /* if we have any items, the array must exist */
1236 Assert(!((buffer->nitems > 0) && (buffer->items == NULL)));
1237
1238 /*
1239 * The buffer may be empty, in which case we must not call the check of
1240 * item pointers, because that assumes non-emptiness.
1241 */
1242 if (buffer->nitems == 0)
1243 return;
1244
1245 /* Make sure the item pointers are valid and sorted. */
1247#endif
1248}
1249
1250/*
1251 * GinBufferInit
1252 * Initialize buffer to store tuples for a GIN index.
1253 *
1254 * Initialize the buffer used to accumulate TID for a single key at a time
1255 * (we process the data sorted), so we know when we received all data for
1256 * a given key.
1257 *
1258 * Initializes sort support procedures for all index attributes.
1259 */
1260static GinBuffer *
1262{
1264 int i,
1265 nKeys;
1267
1268 /*
 1269 * How many items can we fit into the memory limit? We don't want to end
 1270 * up with too many TIDs, and 64kB seems more than enough. But maybe this
1271 * should be tied to maintenance_work_mem or something like that?
1272 */
1273 buffer->maxitems = (64 * 1024L) / sizeof(ItemPointerData);
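/* with 6-byte ItemPointerData entries this allows roughly 10900 TIDs */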
1274
1276
1277 buffer->ssup = palloc0_array(SortSupportData, nKeys);
1278
1279 /*
1280 * Lookup ordering operator for the index key data type, and initialize
1281 * the sort support function.
1282 */
1283 for (i = 0; i < nKeys; i++)
1284 {
1285 Oid cmpFunc;
1286 SortSupport sortKey = &buffer->ssup[i];
1288
1289 sortKey->ssup_cxt = CurrentMemoryContext;
1290 sortKey->ssup_collation = index->rd_indcollation[i];
1291
1292 if (!OidIsValid(sortKey->ssup_collation))
1293 sortKey->ssup_collation = DEFAULT_COLLATION_OID;
1294
1295 sortKey->ssup_nulls_first = false;
1296 sortKey->ssup_attno = i + 1;
1297 sortKey->abbreviate = false;
1298
1299 Assert(sortKey->ssup_attno != 0);
1300
1301 /*
1302 * If the compare proc isn't specified in the opclass definition, look
1303 * up the index key type's default btree comparator.
1304 */
1306 if (cmpFunc == InvalidOid)
1307 {
1308 TypeCacheEntry *typentry;
1309
1310 typentry = lookup_type_cache(att->atttypid,
1312 if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
1313 ereport(ERROR,
1315 errmsg("could not identify a comparison function for type %s",
1316 format_type_be(att->atttypid))));
1317
1318 cmpFunc = typentry->cmp_proc_finfo.fn_oid;
1319 }
1320
1322 }
1323
1324 return buffer;
1325}
1326
1327/* Is the buffer empty, i.e. has no TID values in the array? */
1328static bool
1330{
1331 return (buffer->nitems == 0);
1332}
1333
1334/*
1335 * GinBufferKeyEquals
1336 * Can the buffer store TIDs for the provided GIN tuple (same key)?
1337 *
1338 * Compare if the tuple matches the already accumulated data in the GIN
1339 * buffer. Compare scalar fields first, before the actual key.
1340 *
1341 * Returns true if the key matches, and the TID belongs to the buffer, or
1342 * false if the key does not match.
1343 */
1344static bool
1346{
1347 int r;
1348 Datum tupkey;
1349
1350 AssertCheckGinBuffer(buffer);
1351
1352 if (tup->attrnum != buffer->attnum)
1353 return false;
1354
1355 /* same attribute should have the same type info */
1356 Assert(tup->typbyval == buffer->typbyval);
1357 Assert(tup->typlen == buffer->typlen);
1358
1359 if (tup->category != buffer->category)
1360 return false;
1361
1362 /*
1363 * For NULL/empty keys, this means equality, for normal keys we need to
1364 * compare the actual key value.
1365 */
1366 if (buffer->category != GIN_CAT_NORM_KEY)
1367 return true;
1368
1369 /*
1370 * For the tuple, get either the first sizeof(Datum) bytes for byval
1371 * types, or a pointer to the beginning of the data array.
1372 */
1373 tupkey = (buffer->typbyval) ? *(Datum *) tup->data : PointerGetDatum(tup->data);
1374
1375 r = ApplySortComparator(buffer->key, false,
1376 tupkey, false,
1377 &buffer->ssup[buffer->attnum - 1]);
1378
1379 return (r == 0);
1380}
1381
1382/*
1383 * GinBufferShouldTrim
1384 * Should we trim the list of item pointers?
1385 *
1386 * By trimming we understand writing out and removing the tuple IDs that
1387 * we know can't change by future merges. We can deduce the TID up to which
1388 * this is guaranteed from the "first" TID in each GIN tuple, which provides
1389 * a "horizon" (for a given key) thanks to the sort.
1390 *
1391 * We don't want to do this too often - compressing longer TID lists is more
1392 * efficient. But we also don't want to accumulate too many TIDs, for two
1393 * reasons. First, it consumes memory and we might exceed maintenance_work_mem
1394 * (or whatever limit applies), even if that's unlikely because TIDs are very
1395 * small so we can fit a lot of them. Second, and more importantly, long TID
1396 * lists are an issue if the scan wraps around, because a key may get a very
1397 * wide list (with min/max TID for that key), forcing "full" mergesorts for
1398 * every list merged into it (instead of the efficient append).
1399 *
 1400 * So we look at two things when deciding whether to trim - if the resulting
 1401 * list (after adding TIDs from the new tuple) would be too long, and if there
 1402 * are enough TIDs to trim (with values less than the "first" TID from the new
 1403 * tuple), we do the trim. By enough we mean at least 1024 TIDs (mostly an
 1404 * arbitrary number).
1405 *
1406 * We try freezing TIDs at the beginning of the list first, before attempting
1407 * to trim the buffer. This may allow trimming the data earlier, reducing the
1408 * memory usage and excluding it from the mergesort.
1409 */
1410static bool
1412{
1413 /*
1414 * Check if the last TID in the current list is frozen. This is the case
1415 * when merging non-overlapping lists, e.g. in each parallel worker.
1416 */
1417 if ((buffer->nitems > 0) &&
1418 (ItemPointerCompare(&buffer->items[buffer->nitems - 1],
1419 GinTupleGetFirst(tup)) == 0))
1420 buffer->nfrozen = buffer->nitems;
1421
1422 /*
1423 * Now find the last TID we know to be frozen, i.e. the last TID right
1424 * before the new GIN tuple.
1425 *
1426 * Start with the first not-yet-frozen tuple, and walk until we find the
1427 * first TID that's higher. If we already know the whole list is frozen
1428 * (i.e. nfrozen == nitems), this does nothing.
1429 *
1430 * XXX This might do a binary search for sufficiently long lists, but it
 1431 * does not seem worth the complexity. Overlapping lists should be rare,
 1432 * TID comparisons are cheap, and we should quickly freeze most of
 1433 * the list.
1434 */
1435 for (int i = buffer->nfrozen; i < buffer->nitems; i++)
1436 {
1437 /* Is the TID after the first TID of the new tuple? Can't freeze. */
1438 if (ItemPointerCompare(&buffer->items[i],
1439 GinTupleGetFirst(tup)) > 0)
1440 break;
1441
1442 buffer->nfrozen++;
1443 }
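/*
 * Everything up to nfrozen now sorts strictly before the first TID of the
 * incoming tuple; since tuples for the same key arrive sorted by first TID,
 * no later tuple can add anything in front of that prefix.
 */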
1444
 1445 /* not enough TIDs to trim (1024 is a somewhat arbitrary number) */
1446 if (buffer->nfrozen < 1024)
1447 return false;
1448
1449 /* no need to trim if we have not hit the memory limit yet */
1450 if ((buffer->nitems + tup->nitems) < buffer->maxitems)
1451 return false;
1452
1453 /*
1454 * OK, we have enough frozen TIDs to flush, and we have hit the memory
1455 * limit, so it's time to write it out.
1456 */
1457 return true;
1458}
1459
1460/*
1461 * GinBufferStoreTuple
1462 * Add data (especially TID list) from a GIN tuple to the buffer.
1463 *
1464 * The buffer is expected to be empty (in which case it's initialized), or
1465 * having the same key. The TID values from the tuple are combined with the
1466 * stored values using a merge sort.
1467 *
1468 * The tuples (for the same key) are expected to be sorted by first TID. But
1469 * this does not guarantee the lists do not overlap, especially in the leader,
1470 * because the workers process interleaving data. There should be no overlaps
1471 * in a single worker - it could happen when the parallel scan wraps around,
1472 * but we detect that and flush the data (see ginBuildCallbackParallel).
1473 *
1474 * By sorting the GinTuple not only by key, but also by the first TID, we make
 1475 * it less likely the lists will overlap during merge. We merge them using
1476 * mergesort, but it's cheaper to just append one list to the other.
1477 *
1478 * How often can the lists overlap? There should be no overlaps in workers,
1479 * and in the leader we can see overlaps between lists built by different
1480 * workers. But the workers merge the items as much as possible, so there
1481 * should not be too many.
1482 */
1483static void
1485{
1487 Datum key;
1488
1489 AssertCheckGinBuffer(buffer);
1490
1493
1494 /* if the buffer is empty, set the fields (and copy the key) */
1495 if (GinBufferIsEmpty(buffer))
1496 {
1497 buffer->category = tup->category;
1498 buffer->keylen = tup->keylen;
1499 buffer->attnum = tup->attrnum;
1500
1501 buffer->typlen = tup->typlen;
1502 buffer->typbyval = tup->typbyval;
1503
1504 if (tup->category == GIN_CAT_NORM_KEY)
1505 buffer->key = datumCopy(key, buffer->typbyval, buffer->typlen);
1506 else
1507 buffer->key = (Datum) 0;
1508 }
1509
1510 /* add the new TIDs into the buffer, combine using merge-sort */
1511 {
1512 int nnew;
1513 ItemPointer new;
1514
1515 /*
1516 * Resize the array - we do this first, because we'll dereference the
1517 * first unfrozen TID, which would fail if the array is NULL. We'll
1518 * still pass 0 as number of elements in that array though.
1519 */
1520 if (buffer->items == NULL)
1521 buffer->items = palloc((buffer->nitems + tup->nitems) * sizeof(ItemPointerData));
1522 else
1523 buffer->items = repalloc(buffer->items,
1524 (buffer->nitems + tup->nitems) * sizeof(ItemPointerData));
1525
1526 new = ginMergeItemPointers(&buffer->items[buffer->nfrozen], /* first unfrozen */
1527 (buffer->nitems - buffer->nfrozen), /* num of unfrozen */
1528 items, tup->nitems, &nnew);
1529
1530 Assert(nnew == (tup->nitems + (buffer->nitems - buffer->nfrozen)));
1531
1532 memcpy(&buffer->items[buffer->nfrozen], new,
1533 nnew * sizeof(ItemPointerData));
1534
1535 pfree(new);
1536
1537 buffer->nitems += tup->nitems;
1538
1540 }
1541
1542 /* free the decompressed TID list */
1543 pfree(items);
1544}
1545
1546/*
1547 * GinBufferReset
1548 * Reset the buffer into a state as if it contains no data.
1549 */
1550static void
1552{
1553 Assert(!GinBufferIsEmpty(buffer));
1554
1555 /* release byref values, do nothing for by-val ones */
1556 if ((buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval)
1557 pfree(DatumGetPointer(buffer->key));
1558
1559 /*
1560 * Not required, but makes it more likely to trigger NULL dereference if
1561 * using the value incorrectly, etc.
1562 */
1563 buffer->key = (Datum) 0;
1564
1565 buffer->attnum = 0;
1566 buffer->category = 0;
1567 buffer->keylen = 0;
1568 buffer->nitems = 0;
1569 buffer->nfrozen = 0;
1570
1571 buffer->typlen = 0;
1572 buffer->typbyval = 0;
1573}
1574
1575/*
1576 * GinBufferTrim
1577 * Discard the "frozen" part of the TID list (which should have been
1578 * written to disk/index before this call).
1579 */
1580static void
1582{
1583 Assert((buffer->nfrozen > 0) && (buffer->nfrozen <= buffer->nitems));
1584
1585 memmove(&buffer->items[0], &buffer->items[buffer->nfrozen],
1586 sizeof(ItemPointerData) * (buffer->nitems - buffer->nfrozen));
1587
1588 buffer->nitems -= buffer->nfrozen;
1589 buffer->nfrozen = 0;
1590}
1591
1592/*
1593 * GinBufferFree
1594 * Release memory associated with the GinBuffer (including TID array).
1595 */
1596static void
1598{
1599 if (buffer->items)
1600 pfree(buffer->items);
1601
1602 /* release byref values, do nothing for by-val ones */
1603 if (!GinBufferIsEmpty(buffer) &&
1604 (buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval)
1605 pfree(DatumGetPointer(buffer->key));
1606
1607 pfree(buffer);
1608}
1609
1610/*
1611 * GinBufferCanAddKey
1612 * Check if a given GIN tuple can be added to the current buffer.
1613 *
1614 * Returns true if the buffer is either empty or for the same index key.
1615 */
1616static bool
1618{
1619 /* empty buffer can accept data for any key */
1620 if (GinBufferIsEmpty(buffer))
1621 return true;
1622
1623 /* otherwise just data for the same key */
1624 return GinBufferKeyEquals(buffer, tup);
1625}
1626
1627/*
1628 * Within leader, wait for end of heap scan and merge per-worker results.
1629 *
1630 * After waiting for all workers to finish, merge the per-worker results into
 1631 * the complete index. The results from each worker are sorted by the index
 1632 * key (and by the first TID for each key). While combining the per-worker
 1633 * results we merge the TID lists accumulated for the same key, and insert
 1634 * the combined entries into the index.
1635 *
1636 * Returns the total number of heap tuples scanned.
1637 */
1638static double
1640{
1641 GinTuple *tup;
1642 Size tuplen;
1643 double reltuples = 0;
1644 GinBuffer *buffer;
1645
1646 /* GIN tuples from workers, merged by leader */
1647 double numtuples = 0;
1648
1649 /* wait for workers to scan table and produce partial results */
1650 reltuples = _gin_parallel_heapscan(state);
1651
1652 /* Execute the sort */
1655
1656 /* do the actual sort in the leader */
1657 tuplesort_performsort(state->bs_sortstate);
1658
1659 /*
1660 * Initialize buffer to combine entries for the same key.
1661 *
1662 * The leader is allowed to use the whole maintenance_work_mem buffer to
1663 * combine data. The parallel workers already completed.
1664 */
1665 buffer = GinBufferInit(state->ginstate.index);
1666
1667 /*
1668 * Set the progress target for the next phase. Reset the block number
1669 * values set by table_index_build_scan
1670 */
1671 {
1672 const int progress_index[] = {
1677 };
1678 const int64 progress_vals[] = {
1680 state->bs_numtuples,
1681 0, 0
1682 };
1683
1685 }
1686
1687 /*
1688 * Read the GIN tuples from the shared tuplesort, sorted by category and
1689 * key. That probably gives us order matching how data is organized in the
1690 * index.
1691 *
1692 * We don't insert the GIN tuples right away, but instead accumulate as
1693 * many TIDs for the same key as possible, and then insert that at once.
1694 * This way we don't need to decompress/recompress the posting lists, etc.
1695 */
1696 while ((tup = tuplesort_getgintuple(state->bs_sortstate, &tuplen, true)) != NULL)
1697 {
1699
1701
1702 /*
1703 * If the buffer can accept the new GIN tuple, just store it there and
1704 * we're done. If it's a different key (or maybe too much data) flush
1705 * the current contents into the index first.
1706 */
1707 if (!GinBufferCanAddKey(buffer, tup))
1708 {
1709 /*
 1710 * Buffer is not empty and it's storing a different key - flush
 1711 * the accumulated data into the index, and start a new entry for
 1712 * the current GinTuple.
1713 */
1715
1717
1718 ginEntryInsert(&state->ginstate,
1719 buffer->attnum, buffer->key, buffer->category,
1720 buffer->items, buffer->nitems, &state->buildStats);
1721
1723 MemoryContextReset(state->tmpCtx);
1724
1725 /* discard the existing data */
1726 GinBufferReset(buffer);
1727 }
1728
1729 /*
1730 * We're about to add a GIN tuple to the buffer - check the memory
1731 * limit first, and maybe write out some of the data into the index
1732 * first, if needed (and possible). We only flush the part of the TID
1733 * list that we know won't change, and only if there's enough data for
1734 * compression to work well.
1735 */
1736 if (GinBufferShouldTrim(buffer, tup))
1737 {
1738 Assert(buffer->nfrozen > 0);
1739
1740 /*
 1741 * We are trimming, not switching keys - write the frozen part of
 1742 * the TID list into the index, then discard it from the buffer and
 1743 * keep accumulating for the same key.
1744 */
1746
1748
1749 ginEntryInsert(&state->ginstate,
1750 buffer->attnum, buffer->key, buffer->category,
1751 buffer->items, buffer->nfrozen, &state->buildStats);
1752
1754 MemoryContextReset(state->tmpCtx);
1755
1756 /* truncate the data we've just discarded */
1757 GinBufferTrim(buffer);
1758 }
1759
1760 /*
1761 * Remember data for the current tuple (either remember the new key,
 1762 * or append it to the existing data).
1763 */
1764 GinBufferStoreTuple(buffer, tup);
1765
1766 /* Report progress */
1768 ++numtuples);
1769 }
1770
1771 /* flush data remaining in the buffer (for the last key) */
1772 if (!GinBufferIsEmpty(buffer))
1773 {
1775
1776 ginEntryInsert(&state->ginstate,
1777 buffer->attnum, buffer->key, buffer->category,
1778 buffer->items, buffer->nitems, &state->buildStats);
1779
1780 /* discard the existing data */
1781 GinBufferReset(buffer);
1782
1783 /* Report progress */
1785 ++numtuples);
1786 }
1787
1788 /* release all the memory */
1789 GinBufferFree(buffer);
1790
1791 tuplesort_end(state->bs_sortstate);
1792
1793 return reltuples;
1794}
1795
1796/*
1797 * Returns size of shared memory required to store state for a parallel
1798 * gin index build based on the snapshot its parallel scan will use.
1799 */
1800static Size
1802{
1803 /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1804 return add_size(BUFFERALIGN(sizeof(GinBuildShared)),
1805 table_parallelscan_estimate(heap, snapshot));
1806}
1807
1808/*
1809 * Within leader, participate as a parallel worker.
1810 */
1811static void
1813{
1814 GinLeader *ginleader = buildstate->bs_leader;
1815 int sortmem;
1816
1817 /*
1818 * Might as well use reliable figure when doling out maintenance_work_mem
1819 * (when requested number of workers were not launched, this will be
1820 * somewhat higher than it is for other workers).
1821 */
1823
1824 /* Perform work common to all participants */
1826 ginleader->sharedsort, heap, index,
1827 sortmem, true);
1828}
1829
1830/*
1831 * _gin_process_worker_data
1832 * First phase of the key merging, happening in the worker.
1833 *
1834 * Depending on the number of distinct keys, the TID lists produced by the
1835 * callback may be very short (due to frequent evictions in the callback).
1836 * But combining many tiny lists is expensive, so we try to do as much as
1837 * possible in the workers and only then pass the results to the leader.
1838 *
1839 * We read the tuples sorted by the key, and merge them into larger lists.
1840 * At the moment there's no memory limit, so this will just produce one
1841 * huge (sorted) list per key in each worker. Which means the leader will
1842 * do a very limited number of mergesorts, which is good.
1843 */
1844static void
1846 bool progress)
1847{
1848 GinTuple *tup;
1849 Size tuplen;
1850
1851 GinBuffer *buffer;
1852
1853 /*
1854 * Initialize buffer to combine entries for the same key.
1855 *
1856 * The workers are limited to the same amount of memory as during the sort
1857 * in ginBuildCallbackParallel. But this probably should be the 32MB used
1858 * during planning, just like there.
1859 */
1860 buffer = GinBufferInit(state->ginstate.index);
1861
1862 /* sort the raw per-worker data */
1863 if (progress)
1866
1867 tuplesort_performsort(state->bs_worker_sort);
1868
1869 /* reset the number of GIN tuples produced by this worker */
1870 state->bs_numtuples = 0;
1871
1872 if (progress)
1875
1876 /*
 1877 * Read the GIN tuples from the worker's local tuplesort, sorted by the key, and
1878 * merge them into larger chunks for the leader to combine.
1879 */
1880 while ((tup = tuplesort_getgintuple(worker_sort, &tuplen, true)) != NULL)
1881 {
1882
1884
1885 /*
1886 * If the buffer can accept the new GIN tuple, just store it there and
1887 * we're done. If it's a different key (or maybe too much data) flush
1888 * the current contents into the index first.
1889 */
1890 if (!GinBufferCanAddKey(buffer, tup))
1891 {
1892 GinTuple *ntup;
1893 Size ntuplen;
1894
1895 /*
 1896 * Buffer is not empty and it's storing a different key - write
 1897 * the accumulated data into the shared tuplesort, and start a new
 1898 * entry for the current GinTuple.
1899 */
1901
1902 ntup = _gin_build_tuple(buffer->attnum, buffer->category,
1903 buffer->key, buffer->typlen, buffer->typbyval,
1904 buffer->items, buffer->nitems, &ntuplen);
1905
1906 tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
1907 state->bs_numtuples++;
1908
1909 pfree(ntup);
1910
1911 /* discard the existing data */
1912 GinBufferReset(buffer);
1913 }
1914
1915 /*
1916 * We're about to add a GIN tuple to the buffer - check the memory
1917 * limit first, and maybe write out some of the data into the index
1918 * first, if needed (and possible). We only flush the part of the TID
1919 * list that we know won't change, and only if there's enough data for
1920 * compression to work well.
1921 */
1922 if (GinBufferShouldTrim(buffer, tup))
1923 {
1924 GinTuple *ntup;
1925 Size ntuplen;
1926
1927 Assert(buffer->nfrozen > 0);
1928
1929 /*
 1930 * We are trimming, not switching keys - write the frozen part of
 1931 * the TID list into the shared tuplesort, then discard it from the
 1932 * buffer and keep accumulating for the same key.
1933 */
1935
1936 ntup = _gin_build_tuple(buffer->attnum, buffer->category,
1937 buffer->key, buffer->typlen, buffer->typbyval,
1938 buffer->items, buffer->nfrozen, &ntuplen);
1939
1940 tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
1941
1942 pfree(ntup);
1943
1944 /* truncate the data we've just discarded */
1945 GinBufferTrim(buffer);
1946 }
1947
1948 /*
1949 * Remember data for the current tuple (either remember the new key,
 1950 * or append it to the existing data).
1951 */
1952 GinBufferStoreTuple(buffer, tup);
1953 }
1954
1955 /* flush data remaining in the buffer (for the last key) */
1956 if (!GinBufferIsEmpty(buffer))
1957 {
1958 GinTuple *ntup;
1959 Size ntuplen;
1960
1962
1963 ntup = _gin_build_tuple(buffer->attnum, buffer->category,
1964 buffer->key, buffer->typlen, buffer->typbyval,
1965 buffer->items, buffer->nitems, &ntuplen);
1966
1967 tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
1968 state->bs_numtuples++;
1969
1970 pfree(ntup);
1971
1972 /* discard the existing data */
1973 GinBufferReset(buffer);
1974 }
1975
1976 /* release all the memory */
1977 GinBufferFree(buffer);
1978
1980}
1981
1982/*
1983 * Perform a worker's portion of a parallel GIN index build sort.
1984 *
1985 * This generates a tuplesort for the worker portion of the table.
1986 *
1987 * sortmem is the amount of working memory to use within each worker,
1988 * expressed in KBs.
1989 *
1990 * When this returns, workers are done, and need only release resources.
1991 *
1992 * Before feeding data into a shared tuplesort (for the leader process),
1993 * the workers process data in two phases.
1994 *
1995 * 1) A worker reads a portion of rows from the table, accumulates entries
1996 * in memory, and flushes them into a private tuplesort (e.g. because of
1997 * using too much memory).
1998 *
1999 * 2) The private tuplesort gets sorted (by key and TID), the worker reads
2000 * the data again, and combines the entries as much as possible. This has
2001 * to happen eventually, and this way it's done in workers in parallel.
2002 *
2003 * Finally, the combined entries are written into the shared tuplesort, so
2004 * that the leader can process them.
2005 *
2006 * How well this works (compared to just writing entries into the shared
2007 * tuplesort) depends on the data set. For large tables with many distinct
 2008 * keys this helps a lot. With many distinct keys it's likely the buffer has
2009 * to be flushed often, generating many entries with the same key and short
2010 * TID lists. These entries need to be sorted and merged at some point,
2011 * before writing them to the index. The merging is quite expensive, it can
2012 * easily be ~50% of a serial build, and doing as much of it in the workers
2013 * means it's parallelized. The leader still has to merge results from the
2014 * workers, but it's much more efficient to merge few large entries than
2015 * many tiny ones.
2016 *
2017 * This also reduces the amount of data the workers pass to the leader through
2018 * the shared tuplesort. OTOH the workers need more space for the private sort,
 2019 * possibly up to 2x of the data, if no entries can be merged in a worker. But this
2020 * is very unlikely, and the only consequence is inefficiency, so we ignore it.
2021 */
2022static void
2024 GinBuildShared *ginshared, Sharedsort *sharedsort,
2025 Relation heap, Relation index,
2026 int sortmem, bool progress)
2027{
2029 TableScanDesc scan;
2030 double reltuples;
2031 IndexInfo *indexInfo;
2032
2033 /* Initialize local tuplesort coordination state */
2035 coordinate->isWorker = true;
2036 coordinate->nParticipants = -1;
2037 coordinate->sharedsort = sharedsort;
2038
2039 /* remember how much space is allowed for the accumulated entries */
2040 state->work_mem = (sortmem / 2);
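/*
 * e.g. a participant doled out 16MB of maintenance_work_mem keeps 8MB for
 * the in-memory accumulator and 8MB for its tuplesort
 */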
2041
2042 /* remember how many workers participate in the build */
2043 state->bs_num_workers = ginshared->scantuplesortstates;
2044
2045 /* Begin "partial" tuplesort */
2046 state->bs_sortstate = tuplesort_begin_index_gin(heap, index,
2047 state->work_mem,
2048 coordinate,
2050
2051 /* Local per-worker sort of raw-data */
2052 state->bs_worker_sort = tuplesort_begin_index_gin(heap, index,
2053 state->work_mem,
2054 NULL,
2056
2057 /* Join parallel scan */
2058 indexInfo = BuildIndexInfo(index);
2059 indexInfo->ii_Concurrent = ginshared->isconcurrent;
2060
2061 scan = table_beginscan_parallel(heap,
2063
2064 reltuples = table_index_build_scan(heap, index, indexInfo, true, progress,
2066
2067 /* write remaining accumulated entries */
2069
2070 /*
2071 * Do the first phase of in-worker processing - sort the data produced by
2072 * the callback, and combine them into much larger chunks and place that
 2073 * into the shared tuplesort for the leader to process.
2074 */
2075 _gin_process_worker_data(state, state->bs_worker_sort, progress);
2076
2077 /* sort the GIN tuples built by this worker */
2078 tuplesort_performsort(state->bs_sortstate);
2079
2080 state->bs_reltuples += reltuples;
2081
2082 /*
2083 * Done. Record ambuild statistics.
2084 */
2085 SpinLockAcquire(&ginshared->mutex);
2086 ginshared->nparticipantsdone++;
2087 ginshared->reltuples += state->bs_reltuples;
2088 ginshared->indtuples += state->bs_numtuples;
2089 SpinLockRelease(&ginshared->mutex);
2090
2091 /* Notify leader */
2093
2094 tuplesort_end(state->bs_sortstate);
2095}
2096
2097/*
2098 * Perform work within a launched parallel process.
2099 */
2100void
2102{
2103 char *sharedquery;
2104 GinBuildShared *ginshared;
2105 Sharedsort *sharedsort;
2107 Relation heapRel;
2108 Relation indexRel;
2111 WalUsage *walusage;
2112 BufferUsage *bufferusage;
2113 int sortmem;
2114
2115 /*
2116 * The only possible status flag that can be set to the parallel worker is
2117 * PROC_IN_SAFE_IC.
2118 */
2119 Assert((MyProc->statusFlags == 0) ||
2121
2122 /* Set debug_query_string for individual workers first */
2125
2126 /* Report the query string from leader */
2128
2129 /* Look up gin shared state */
2130 ginshared = shm_toc_lookup(toc, PARALLEL_KEY_GIN_SHARED, false);
2131
2132 /* Open relations using lock modes known to be obtained by index.c */
2133 if (!ginshared->isconcurrent)
2134 {
2137 }
2138 else
2139 {
2142 }
2143
2144 /* Open relations within worker */
2145 heapRel = table_open(ginshared->heaprelid, heapLockmode);
2146 indexRel = index_open(ginshared->indexrelid, indexLockmode);
2147
2148 /* initialize the GIN build state */
2149 initGinState(&buildstate.ginstate, indexRel);
2150 buildstate.indtuples = 0;
2151 memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
2152 memset(&buildstate.tid, 0, sizeof(ItemPointerData));
2153
2154 /*
2155 * create a temporary memory context that is used to hold data not yet
2156 * dumped out to the index
2157 */
2158 buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
2159 "Gin build temporary context",
2160 ALLOCSET_DEFAULT_SIZES);
2161
2162 /*
2163 * create a temporary memory context that is used for calling
2164 * ginExtractEntries(), and can be reset after each tuple
2165 */
2166 buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext,
2167 "Gin build temporary context for user-defined function",
2168 ALLOCSET_DEFAULT_SIZES);
2169
2170 buildstate.accum.ginstate = &buildstate.ginstate;
2171 ginInitBA(&buildstate.accum);
2172
2173
2174 /* Look up shared state private to tuplesort.c */
2175 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
2176 tuplesort_attach_shared(sharedsort, seg);
2177
2178 /* Prepare to track buffer usage during parallel execution */
2179 InstrStartParallelQuery();
2180
2181 /*
2182 * Might as well use a reliable figure when doling out maintenance_work_mem
2183 * (when the requested number of workers was not launched, this will be
2184 * somewhat higher than it is for other workers).
2185 */
2186 sortmem = maintenance_work_mem / ginshared->scantuplesortstates;
2187
2188 _gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort,
2189 heapRel, indexRel, sortmem, false);
2190
2191 /* Report WAL/buffer usage during parallel execution */
2192 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
2193 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
2194 InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
2195 &walusage[ParallelWorkerNumber]);
2196
2197 index_close(indexRel, indexLockmode);
2198 table_close(heapRel, heapLockmode);
2199}
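/*
 * Descriptive note (not part of gininsert.c): the shm_toc entries consumed by
 * _gin_parallel_build_main() above, in lookup order:
 *
 *   PARALLEL_KEY_QUERY_TEXT    - the leader's query string, reported via
 *                                pgstat_report_activity()
 *   PARALLEL_KEY_GIN_SHARED    - the GinBuildShared state (heap/index OIDs,
 *                                isconcurrent, scantuplesortstates, and the
 *                                mutex-protected result counters)
 *   PARALLEL_KEY_TUPLESORT     - the Sharedsort state the worker attaches its
 *                                "partial" tuplesort to
 *   PARALLEL_KEY_BUFFER_USAGE and
 *   PARALLEL_KEY_WAL_USAGE     - per-worker arrays, indexed by
 *                                ParallelWorkerNumber, filled in by
 *                                InstrEndParallelQuery() before exit
 */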
2200
2201/*
2202 * Used to keep track of compressed TID lists when building a GIN tuple.
2203 */
2204typedef struct
2205{
2206 dlist_node node; /* linked list pointers */
2207 GinPostingList *seg;
2208} GinSegmentInfo;
2209
2210/*
2211 * _gin_build_tuple
2212 * Serialize the state for an index key into a tuple for tuplesort.
2213 *
2214 * The tuple has a number of scalar fields (mostly matching the build state),
2215 * and then a data array that stores the key first, and then the TID list.
2216 *
2217 * For by-reference data types, we store the actual data. For by-val types
2218 * we simply copy the whole Datum, so that we don't have to care about stuff
2219 * like endianness etc. We could make it a little bit smaller, but it's not
2220 * worth it - it's a tiny fraction of the data, and we need to MAXALIGN the
2221 * start of the TID list anyway. So we wouldn't save anything. (This would
2222 * not be a good idea for the permanent in-index data, since we'd prefer
2223 * that that not depend on sizeof(Datum). But this is just a transient
2224 * representation to use while sorting the data.)
2225 *
2226 * The TID list is serialized as compressed - it's highly compressible, and
2227 * we already have ginCompressPostingList for this purpose. The list may be
2228 * pretty long, so we compress it into multiple segments and then copy all
2229 * of that into the GIN tuple.
2230 */
2231static GinTuple *
2232_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
2233 Datum key, int16 typlen, bool typbyval,
2234 ItemPointerData *items, uint32 nitems,
2235 Size *len)
2236{
2237 GinTuple *tuple;
2238 char *ptr;
2239
2240 Size tuplen;
2241 int keylen;
2242
2243 dlist_mutable_iter iter;
2244 dlist_head segments;
2245 int ncompressed;
2246 Size compresslen;
2247
2248 /*
2249 * Calculate how long the key value is. Only keys with GIN_CAT_NORM_KEY
2250 * have an actual non-empty key. We include varlena headers and \0 bytes for
2251 * strings, to make it easier to access the data in-line.
2252 *
2253 * For byval types we simply copy the whole Datum. We could store just the
2254 * necessary bytes, but this is simpler to work with and not worth the
2255 * extra complexity. Moreover we still need to do the MAXALIGN to allow
2256 * direct access to item pointers.
2257 *
2258 * XXX Note that for byval types we store the whole datum, no matter what
2259 * the typlen value is.
2260 */
2261 if (category != GIN_CAT_NORM_KEY)
2262 keylen = 0;
2263 else if (typbyval)
2264 keylen = sizeof(Datum);
2265 else if (typlen > 0)
2266 keylen = typlen;
2267 else if (typlen == -1)
2268 keylen = VARSIZE_ANY(DatumGetPointer(key));
2269 else if (typlen == -2)
2270 keylen = strlen(DatumGetPointer(key)) + 1;
2271 else
2272 elog(ERROR, "unexpected typlen value (%d)", typlen);
2273
2274 /* compress the item pointers */
2275 ncompressed = 0;
2276 compresslen = 0;
2277 dlist_init(&segments);
2278
2279 /* generate compressed segments of TID list chunks */
2280 while (ncompressed < nitems)
2281 {
2282 int cnt;
2283 GinSegmentInfo *seginfo = palloc(sizeof(GinSegmentInfo));
2284
2285 seginfo->seg = ginCompressPostingList(&items[ncompressed],
2286 (nitems - ncompressed),
2287 UINT16_MAX,
2288 &cnt);
2289
2290 ncompressed += cnt;
2291 compresslen += SizeOfGinPostingList(seginfo->seg);
2292
2293 dlist_push_tail(&segments, &seginfo->node);
2294 }
2295
2296 /*
2297 * Determine GIN tuple length with all the data included. Be careful about
2298 * alignment, to allow direct access to compressed segments (those require
2299 * only SHORTALIGN).
2300 */
2301 tuplen = SHORTALIGN(offsetof(GinTuple, data) + keylen) + compresslen;
2302
2303 *len = tuplen;
2304
2305 /*
2306 * Allocate space for the whole GIN tuple.
2307 *
2308 * The palloc0 is needed - writetup_index_gin will write the whole tuple
2309 * to disk, so we need to make sure the padding bytes are defined
2310 * (otherwise valgrind would report this).
2311 */
2312 tuple = palloc0(tuplen);
2313
2314 tuple->tuplen = tuplen;
2315 tuple->attrnum = attrnum;
2316 tuple->category = category;
2317 tuple->keylen = keylen;
2318 tuple->nitems = nitems;
2319
2320 /* key type info */
2321 tuple->typlen = typlen;
2322 tuple->typbyval = typbyval;
2323
2324 /*
2325 * Copy the key and items into the tuple. First the key value, which we
2326 * can simply copy right at the beginning of the data array.
2327 */
2328 if (category == GIN_CAT_NORM_KEY)
2329 {
2330 if (typbyval)
2331 {
2332 memcpy(tuple->data, &key, sizeof(Datum));
2333 }
2334 else if (typlen > 0) /* byref, fixed length */
2335 {
2336 memcpy(tuple->data, DatumGetPointer(key), typlen);
2337 }
2338 else if (typlen == -1)
2339 {
2340 memcpy(tuple->data, DatumGetPointer(key), keylen);
2341 }
2342 else if (typlen == -2)
2343 {
2344 memcpy(tuple->data, DatumGetPointer(key), keylen);
2345 }
2346 }
2347
2348 /* finally, copy the TIDs into the array */
2349 ptr = (char *) tuple + SHORTALIGN(offsetof(GinTuple, data) + keylen);
2350
2351 /* copy in the compressed data, and free the segments */
2352 dlist_foreach_modify(iter, &segments)
2353 {
2354 GinSegmentInfo *seginfo = dlist_container(GinSegmentInfo, node, iter.cur);
2355
2356 memcpy(ptr, seginfo->seg, SizeOfGinPostingList(seginfo->seg));
2357
2358 ptr += SizeOfGinPostingList(seginfo->seg);
2359
2360 dlist_delete(&seginfo->node);
2361
2362 pfree(seginfo->seg);
2363 pfree(seginfo);
2364 }
2365
2366 return tuple;
2367}
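/*
 * Illustrative layout sketch (not part of gininsert.c) of the serialized
 * tuple produced by _gin_build_tuple() above, which _gin_parse_tuple_key()
 * and _gin_parse_tuple_items() below rely on:
 *
 *   +-----------------+----------------+------------------------------+
 *   | GinTuple fields | key (keylen B) | compressed posting segments  |
 *   +-----------------+----------------+------------------------------+
 *   ^ key starts at                    ^ segments start at
 *     offsetof(GinTuple, data)           SHORTALIGN(offsetof(GinTuple, data)
 *                                                   + keylen)
 *
 * Hence tuplen = SHORTALIGN(offsetof(GinTuple, data) + keylen) + compresslen,
 * matching the computation in the function.
 */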
2368
2369/*
2370 * _gin_parse_tuple_key
2371 * Return a Datum representing the key stored in the tuple.
2372 *
2373 * Most of the tuple fields are directly accessible; the only parts that
2374 * need more care are the key and the TID list.
2375 *
2376 * For the key, this returns a regular Datum representing it. It's either the
2377 * actual key value, or a pointer to the beginning of the data array (which is
2378 * where the data was copied by _gin_build_tuple).
2379 */
2380static Datum
2381_gin_parse_tuple_key(GinTuple *a)
2382{
2383 Datum key;
2384
2385 if (a->category != GIN_CAT_NORM_KEY)
2386 return (Datum) 0;
2387
2388 if (a->typbyval)
2389 {
2390 memcpy(&key, a->data, a->keylen);
2391 return key;
2392 }
2393
2394 return PointerGetDatum(a->data);
2395}
2396
2397/*
2398 * _gin_parse_tuple_items
2399 * Return a palloc'd array of the decompressed TIDs stored in the tuple.
2400 */
2401static ItemPointer
2402_gin_parse_tuple_items(GinTuple *a)
2403{
2404 int len;
2405 char *ptr;
2406 int ndecoded;
2407 ItemPointer items;
2408
2409 len = a->tuplen - SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
2410 ptr = (char *) a + SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
2411
2412 items = ginPostingListDecodeAllSegments((GinPostingList *) ptr, len, &ndecoded);
2413
2414 Assert(ndecoded == a->nitems);
2415
2416 return items;
2417}
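/*
 * Illustrative usage sketch (not part of gininsert.c): how a consumer of the
 * sorted stream can unpack one GinTuple with the two helpers above. "tup"
 * stands for whatever tuple the caller pulled from the tuplesort.
 */
#if 0
static void
example_unpack_gin_tuple(GinTuple *tup)
{
	Datum		key = _gin_parse_tuple_key(tup);
	ItemPointer items = _gin_parse_tuple_items(tup);

	/* items[] holds tup->nitems TIDs for (tup->attrnum, tup->category, key) */
	(void) key;
	(void) items;
}
#endif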
2418
2419/*
2420 * _gin_compare_tuples
2421 * Compare GIN tuples, used by tuplesort during parallel index build.
2422 *
2423 * The scalar fields (attrnum, category) are compared first; the key value is
2424 * compared last. The comparisons are done using type-specific sort support
2425 * functions.
2426 *
2427 * If the key value matches, we compare the first TID value in the TID list,
2428 * which means the tuples are merged in an order in which they are most
2429 * likely to be simply concatenated. (This "first" TID will also allow us
2430 * to determine a point up to which the list is fully determined and can be
2431 * written into the index to enforce a memory limit etc.)
2432 */
2433int
2434_gin_compare_tuples(GinTuple *a, GinTuple *b, SortSupport ssup)
2435{
2436 int r;
2437 Datum keya,
2438 keyb;
2439
2440 if (a->attrnum < b->attrnum)
2441 return -1;
2442
2443 if (a->attrnum > b->attrnum)
2444 return 1;
2445
2446 if (a->category < b->category)
2447 return -1;
2448
2449 if (a->category > b->category)
2450 return 1;
2451
2452 if (a->category == GIN_CAT_NORM_KEY)
2453 {
2454 keya = _gin_parse_tuple_key(a);
2455 keyb = _gin_parse_tuple_key(b);
2456
2457 r = ApplySortComparator(keya, false,
2458 keyb, false,
2459 &ssup[a->attrnum - 1]);
2460
2461 /* if the key is the same, consider the first TID in the array */
2462 return (r != 0) ? r : ItemPointerCompare(GinTupleGetFirst(a),
2463 GinTupleGetFirst(b));
2464 }
2465
2466 return ItemPointerCompare(GinTupleGetFirst(a),
2467 GinTupleGetFirst(b));
2468}
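/*
 * Worked example (not part of gininsert.c; the TIDs are made up): for two GIN
 * tuples with the same attrnum, category and key, _gin_compare_tuples() falls
 * back to the first TID of each posting list, so
 *
 *   (attrnum 1, GIN_CAT_NORM_KEY, key 'abc', first TID (10,1))
 *
 * sorts before
 *
 *   (attrnum 1, GIN_CAT_NORM_KEY, key 'abc', first TID (42,7)).
 *
 * As the comment above notes, this makes it likely that the leader can simply
 * concatenate the decompressed TID lists when merging per-key runs.
 */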