PostgreSQL Source Code (git master) — src/backend/access/gin/gininsert.c
(doxygen-generated listing; page-navigation chrome removed)
/*-------------------------------------------------------------------------
 *
 * gininsert.c
 *	  insert routines for the postgres inverted index access method.
 *
 *
 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/gin/gininsert.c
 *-------------------------------------------------------------------------
 */
14
#include "postgres.h"

#include "access/gin_private.h"
#include "access/gin_tuple.h"
#include "access/parallel.h"
#include "access/table.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "catalog/pg_collation.h"
#include "commands/progress.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/typcache.h"
38
39
/*
 * Magic numbers for parallel state sharing: keys identifying each chunk in
 * the dynamic shared memory TOC set up for a parallel GIN index build.
 */
#define PARALLEL_KEY_GIN_SHARED		UINT64CONST(0xB000000000000001)
#define PARALLEL_KEY_TUPLESORT		UINT64CONST(0xB000000000000002)
#define PARALLEL_KEY_QUERY_TEXT		UINT64CONST(0xB000000000000003)
#define PARALLEL_KEY_WAL_USAGE		UINT64CONST(0xB000000000000004)
#define PARALLEL_KEY_BUFFER_USAGE	UINT64CONST(0xB000000000000005)
46
47/*
48 * Status for index builds performed in parallel. This is allocated in a
49 * dynamic shared memory segment.
50 */
51typedef struct GinBuildShared
52{
53 /*
54 * These fields are not modified during the build. They primarily exist
55 * for the benefit of worker processes that need to create state
56 * corresponding to that used by the leader.
57 */
62
63 /*
64 * workersdonecv is used to monitor the progress of workers. All parallel
65 * participants must indicate that they are done before leader can use
66 * results built by the workers (and before leader can write the data into
67 * the index).
68 */
70
71 /*
72 * mutex protects all following fields
73 *
74 * These fields contain status information of interest to GIN index builds
75 * that must work just the same when an index is built in parallel.
76 */
78
79 /*
80 * Mutable state that is maintained by workers, and reported back to
81 * leader at end of the scans.
82 *
83 * nparticipantsdone is number of worker processes finished.
84 *
85 * reltuples is the total number of input heap tuples.
86 *
87 * indtuples is the total number of tuples that made it into the index.
88 */
90 double reltuples;
91 double indtuples;
92
93 /*
94 * ParallelTableScanDescData data follows. Can't directly embed here, as
95 * implementations of the parallel table scan desc interface might need
96 * stronger alignment.
97 */
99
100/*
101 * Return pointer to a GinBuildShared's parallel table scan.
102 *
103 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
104 * MAXALIGN.
105 */
106#define ParallelTableScanFromGinBuildShared(shared) \
107 (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(GinBuildShared)))
108
109/*
110 * Status for leader in parallel index build.
111 */
112typedef struct GinLeader
113{
114 /* parallel context itself */
116
117 /*
118 * nparticipanttuplesorts is the exact number of worker processes
119 * successfully launched, plus one leader process if it participates as a
120 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
121 * participating as a worker).
122 */
124
125 /*
126 * Leader process convenience pointers to shared state (leader avoids TOC
127 * lookups).
128 *
129 * GinBuildShared is the shared state for entire build. sharedsort is the
130 * shared, tuplesort-managed state passed to each process tuplesort.
131 * snapshot is the snapshot used by the scan iff an MVCC snapshot is
132 * required.
133 */
140
141typedef struct
142{
144 double indtuples;
151
152 /*
153 * bs_leader is only present when a parallel index build is performed, and
154 * only in the leader process.
155 */
157
158 /* number of participating workers (including leader) */
160
161 /* used to pass information from workers to leader */
164
165 /*
166 * The sortstate is used by workers (including the leader). It has to be
167 * part of the build state, because that's the only thing passed to the
168 * build callback etc.
169 */
171
172 /*
173 * The sortstate used only within a single worker for the first merge pass
174 * happening there. In principle it doesn't need to be part of the build
175 * state and we could pass it around directly, but it's more convenient
176 * this way. And it's part of the build state, after all.
177 */
180
181
182/* parallel index builds */
184 bool isconcurrent, int request);
190 Relation heap, Relation index);
192 GinBuildShared *ginshared,
193 Sharedsort *sharedsort,
194 Relation heap, Relation index,
195 int sortmem, bool progress);
196
199
200static GinTuple *_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
201 Datum key, int16 typlen, bool typbyval,
203 Size *len);
204
205/*
206 * Adds array of item pointers to tuple's posting list, or
207 * creates posting tree and tuple pointing to tree in case
208 * of not enough space. Max size of tuple is defined in
209 * GinFormTuple(). Returns a new, modified index tuple.
210 * items[] must be in sorted order with no duplicates.
211 */
212static IndexTuple
216 GinStatsData *buildStats, Buffer buffer)
217{
219 Datum key;
220 GinNullCategory category;
221 IndexTuple res;
223 *oldItems;
224 int oldNPosting,
226 nwritten;
228
230
231 attnum = gintuple_get_attrnum(ginstate, old);
232 key = gintuple_get_key(ginstate, old, &category);
233
234 /* merge the old and new posting lists */
236
239 &newNPosting);
240
241 /* Compress the posting list, and try to a build tuple with room for it */
242 res = NULL;
244 if (nwritten == newNPosting)
245 {
246 res = GinFormTuple(ginstate, attnum, key, category,
247 (char *) compressedList,
250 false);
251 }
252
255
256 if (!res)
257 {
258 /* posting list would be too big, convert to posting tree */
260
261 /*
262 * Initialize posting tree with the old tuple's posting list. It's
263 * surely small enough to fit on one posting-tree page, and should
264 * already be in order with no duplicates.
265 */
267 oldItems,
269 buildStats,
270 buffer);
271
272 /* Now insert the TIDs-to-be-added into the posting tree */
274 items, nitem,
275 buildStats);
276
277 /* And build a new posting-tree-only result tuple */
278 res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
280 }
282
283 return res;
284}
285
286/*
287 * Build a fresh leaf tuple, either posting-list or posting-tree format
288 * depending on whether the given items list will fit.
289 * items[] must be in sorted order with no duplicates.
290 *
291 * This is basically the same logic as in addItemPointersToLeafTuple,
292 * but working from slightly different input.
293 */
294static IndexTuple
298 GinStatsData *buildStats, Buffer buffer)
299{
300 IndexTuple res = NULL;
302 int nwritten;
303
304 /* try to build a posting list tuple with all the items */
306 if (nwritten == nitem)
307 {
308 res = GinFormTuple(ginstate, attnum, key, category,
309 (char *) compressedList,
311 nitem, false);
312 }
314
315 if (!res)
316 {
317 /* posting list would be too big, build posting tree */
319
320 /*
321 * Build posting-tree-only result tuple. We do this first so as to
322 * fail quickly if the key is too big.
323 */
324 res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
325
326 /*
327 * Initialize a new posting tree with the TIDs.
328 */
329 postingRoot = createPostingTree(ginstate->index, items, nitem,
330 buildStats, buffer);
331
332 /* And save the root link in the result tuple */
334 }
335
336 return res;
337}
338
339/*
340 * Insert one or more heap TIDs associated with the given key value.
341 * This will either add a single key entry, or enlarge a pre-existing entry.
342 *
343 * During an index build, buildStats is non-null and the counters
344 * it contains should be incremented as needed.
345 */
346void
350 GinStatsData *buildStats)
351{
352 GinBtreeData btree;
354 GinBtreeStack *stack;
355 IndexTuple itup;
356 Page page;
357
358 insertdata.isDelete = false;
359
360 ginPrepareEntryScan(&btree, attnum, key, category, ginstate);
361 btree.isBuild = (buildStats != NULL);
362
363 stack = ginFindLeafPage(&btree, false, false);
364 page = BufferGetPage(stack->buffer);
365
366 if (btree.findItem(&btree, stack))
367 {
368 /* found pre-existing entry */
369 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
370
371 if (GinIsPostingTree(itup))
372 {
373 /* add entries to existing posting tree */
375
376 /* release all stack */
378 freeGinBtreeStack(stack);
379
380 /* insert into posting tree */
382 items, nitem,
383 buildStats);
384 return;
385 }
386
389 /* modify an existing leaf entry */
390 itup = addItemPointersToLeafTuple(ginstate, itup,
391 items, nitem, buildStats, stack->buffer);
392
393 insertdata.isDelete = true;
394 }
395 else
396 {
399 /* no match, so construct a new leaf entry */
400 itup = buildFreshLeafTuple(ginstate, attnum, key, category,
401 items, nitem, buildStats, stack->buffer);
402
403 /*
404 * nEntries counts leaf tuples, so increment it only when we make a
405 * new one.
406 */
407 if (buildStats)
408 buildStats->nEntries++;
409 }
410
411 /* Insert the new or modified leaf tuple */
412 insertdata.entry = itup;
413 ginInsertValue(&btree, stack, &insertdata, buildStats);
414 pfree(itup);
415}
416
417/*
418 * Extract index entries for a single indexable item, and add them to the
419 * BuildAccumulator's state.
420 *
421 * This function is used only during initial index creation.
422 */
423static void
425 Datum value, bool isNull,
427{
428 Datum *entries;
429 GinNullCategory *categories;
430 int32 nentries;
432
434 entries = ginExtractEntries(buildstate->accum.ginstate, attnum,
435 value, isNull,
436 &nentries, &categories);
438
440 entries, categories, nentries);
441
442 buildstate->indtuples += nentries;
443
445}
446
447static void
449 bool *isnull, bool tupleIsAlive, void *state)
450{
453 int i;
454
456
457 for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
459 values[i], isnull[i], tid);
460
461 /* If we've maxed out our available memory, dump everything to the index */
462 if (buildstate->accum.allocatedMemory >= maintenance_work_mem * (Size) 1024)
463 {
464 ItemPointerData *list;
465 Datum key;
466 GinNullCategory category;
467 uint32 nlist;
469
470 ginBeginBAScan(&buildstate->accum);
471 while ((list = ginGetBAEntry(&buildstate->accum,
472 &attnum, &key, &category, &nlist)) != NULL)
473 {
474 /* there could be many entries, so be willing to abort here */
476 ginEntryInsert(&buildstate->ginstate, attnum, key, category,
477 list, nlist, &buildstate->buildStats);
478 }
479
481 ginInitBA(&buildstate->accum);
482 }
483
485}
486
487/*
488 * ginFlushBuildState
489 * Write all data from BuildAccumulator into the tuplesort.
490 *
491 * The number of TIDs written to the tuplesort at once is limited, to reduce
492 * the amount of memory needed when merging the intermediate results later.
493 * The leader will see up to two chunks per worker, so calculate the limit to
494 * not need more than MaxAllocSize overall.
495 *
496 * We don't need to worry about overflowing maintenance_work_mem. We can't
497 * build chunks larger than work_mem, and that limit was set so that workers
498 * produce sufficiently small chunks.
499 */
500static void
502{
503 ItemPointerData *list;
504 Datum key;
505 GinNullCategory category;
506 uint32 nlist;
509 uint32 maxlen;
510
511 /* maximum number of TIDs per chunk (two chunks per worker) */
512 maxlen = MaxAllocSize / sizeof(ItemPointerData);
513 maxlen /= (2 * buildstate->bs_num_workers);
514
515 ginBeginBAScan(&buildstate->accum);
516 while ((list = ginGetBAEntry(&buildstate->accum,
517 &attnum, &key, &category, &nlist)) != NULL)
518 {
519 /* information about the key */
521
522 /* start of the chunk */
523 uint32 offset = 0;
524
525 /* split the entry into smaller chunk with up to maxlen items */
526 while (offset < nlist)
527 {
528 /* GIN tuple and tuple length */
529 GinTuple *tup;
530 Size tuplen;
531 uint32 len = Min(maxlen, nlist - offset);
532
533 /* there could be many entries, so be willing to abort here */
535
536 tup = _gin_build_tuple(attnum, category,
537 key, attr->attlen, attr->attbyval,
538 &list[offset], len,
539 &tuplen);
540
541 offset += len;
542
543 tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
544
545 pfree(tup);
546 }
547 }
548
550 ginInitBA(&buildstate->accum);
551}
552
553/*
554 * ginBuildCallbackParallel
555 * Callback for the parallel index build.
556 *
557 * This is similar to the serial build callback ginBuildCallback, but
558 * instead of writing the accumulated entries into the index, each worker
559 * writes them into a (local) tuplesort.
560 *
561 * The worker then sorts and combines these entries, before writing them
562 * into a shared tuplesort for the leader (see _gin_parallel_scan_and_build
563 * for the whole process).
564 */
565static void
567 bool *isnull, bool tupleIsAlive, void *state)
568{
571 int i;
572
574
575 /*
576 * if scan wrapped around - flush accumulated entries and start anew
577 *
578 * With parallel scans, we don't have a guarantee the scan does not start
579 * half-way through the relation (serial builds disable sync scans and
580 * always start from block 0, parallel scans require allow_sync=true).
581 *
582 * Building the posting lists assumes the TIDs are monotonic and never go
583 * back, and the wrap around would break that. We handle that by detecting
584 * the wraparound, and flushing all entries. This means we'll later see
585 * two separate entries with non-overlapping TID lists (which can be
586 * combined by merge sort).
587 *
588 * To detect a wraparound, we remember the last TID seen by each worker
589 * (for any key). If the next TID seen by the worker is lower, the scan
590 * must have wrapped around.
591 */
592 if (ItemPointerCompare(tid, &buildstate->tid) < 0)
594
595 /* remember the TID we're about to process */
596 buildstate->tid = *tid;
597
598 for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
600 values[i], isnull[i], tid);
601
602 /*
603 * If we've maxed out our available memory, dump everything to the
604 * tuplesort. We use half the per-worker fraction of maintenance_work_mem,
605 * the other half is used for the tuplesort.
606 */
607 if (buildstate->accum.allocatedMemory >= buildstate->work_mem * (Size) 1024)
609
611}
612
615{
616 IndexBuildResult *result;
617 double reltuples;
622 ItemPointerData *list;
623 Datum key;
624 GinNullCategory category;
625 uint32 nlist;
628
630 elog(ERROR, "index \"%s\" already contains data",
632
633 initGinState(&buildstate.ginstate, index);
634 buildstate.indtuples = 0;
635 memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
636
637 /* Initialize fields for parallel build too. */
638 buildstate.bs_numtuples = 0;
639 buildstate.bs_reltuples = 0;
640 buildstate.bs_leader = NULL;
641 memset(&buildstate.tid, 0, sizeof(ItemPointerData));
642
643 /* initialize the meta page */
645
646 /* initialize the root page */
648
654
655
659
660 /* count the root as first entry page */
661 buildstate.buildStats.nEntryPages++;
662
663 /*
664 * create a temporary memory context that is used to hold data not yet
665 * dumped out to the index
666 */
668 "Gin build temporary context",
670
671 /*
672 * create a temporary memory context that is used for calling
673 * ginExtractEntries(), and can be reset after each tuple
674 */
676 "Gin build temporary context for user-defined function",
678
679 buildstate.accum.ginstate = &buildstate.ginstate;
680 ginInitBA(&buildstate.accum);
681
682 /* Report table scan phase started */
685
686 /*
687 * Attempt to launch parallel worker scan when required
688 *
689 * XXX plan_create_index_workers makes the number of workers dependent on
690 * maintenance_work_mem, requiring 32MB for each worker. For GIN that's
691 * reasonable too, because we sort the data just like btree. It does
692 * ignore the memory used to accumulate data in memory (set by work_mem),
693 * but there is no way to communicate that to plan_create_index_workers.
694 */
695 if (indexInfo->ii_ParallelWorkers > 0)
696 _gin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
697 indexInfo->ii_ParallelWorkers);
698
699 /*
700 * If parallel build requested and at least one worker process was
701 * successfully launched, set up coordination state, wait for workers to
702 * complete. Then read all tuples from the shared tuplesort and insert
703 * them into the index.
704 *
705 * In serial mode, simply scan the table and build the index one index
706 * tuple at a time.
707 */
708 if (state->bs_leader)
709 {
711
713 coordinate->isWorker = false;
714 coordinate->nParticipants =
715 state->bs_leader->nparticipanttuplesorts;
716 coordinate->sharedsort = state->bs_leader->sharedsort;
717
718 /*
719 * Begin leader tuplesort.
720 *
721 * In cases where parallelism is involved, the leader receives the
722 * same share of maintenance_work_mem as a serial sort (it is
723 * generally treated in the same way as a serial sort once we return).
724 * Parallel worker Tuplesortstates will have received only a fraction
725 * of maintenance_work_mem, though.
726 *
727 * We rely on the lifetime of the Leader Tuplesortstate almost not
728 * overlapping with any worker Tuplesortstate's lifetime. There may
729 * be some small overlap, but that's okay because we rely on leader
730 * Tuplesortstate only allocating a small, fixed amount of memory
731 * here. When its tuplesort_performsort() is called (by our caller),
732 * and significant amounts of memory are likely to be used, all
733 * workers must have already freed almost all memory held by their
734 * Tuplesortstates (they are about to go away completely, too). The
735 * overall effect is that maintenance_work_mem always represents an
736 * absolute high watermark on the amount of memory used by a CREATE
737 * INDEX operation, regardless of the use of parallelism or any other
738 * factor.
739 */
740 state->bs_sortstate =
744
745 /* scan the relation in parallel and merge per-worker results */
746 reltuples = _gin_parallel_merge(state);
747
748 _gin_end_parallel(state->bs_leader, state);
749 }
750 else /* no parallel index build */
751 {
752 /*
753 * Do the heap scan. We disallow sync scan here because
754 * dataPlaceToPage prefers to receive tuples in TID order.
755 */
756 reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
758
759 /* dump remaining entries to the index */
762 while ((list = ginGetBAEntry(&buildstate.accum,
763 &attnum, &key, &category, &nlist)) != NULL)
764 {
765 /* there could be many entries, so be willing to abort here */
767 ginEntryInsert(&buildstate.ginstate, attnum, key, category,
768 list, nlist, &buildstate.buildStats);
769 }
771 }
772
775
776 /*
777 * Update metapage stats
778 */
779 buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
780 ginUpdateStats(index, &buildstate.buildStats, true);
781
782 /*
783 * We didn't write WAL records as we built the index, so if WAL-logging is
784 * required, write all pages to the WAL now.
785 */
787 {
790 true);
791 }
792
793 /*
794 * Return statistics
795 */
797
798 result->heap_tuples = reltuples;
799 result->index_tuples = buildstate.indtuples;
800
801 return result;
802}
803
804/*
805 * ginbuildempty() -- build an empty gin index in the initialization fork
806 */
807void
809{
812
813 /* An empty GIN index has two pages. */
818
819 /* Initialize and xlog metabuffer and root buffer. */
828
829 /* Unlock and release the buffers. */
832}
833
834/*
835 * Insert index entries for a single indexable item during "normal"
836 * (non-fast-update) insertion
837 */
838static void
840 Datum value, bool isNull,
841 ItemPointer item)
842{
843 Datum *entries;
844 GinNullCategory *categories;
845 int32 i,
846 nentries;
847
848 entries = ginExtractEntries(ginstate, attnum, value, isNull,
849 &nentries, &categories);
850
851 for (i = 0; i < nentries; i++)
852 ginEntryInsert(ginstate, attnum, entries[i], categories[i],
853 item, 1, NULL);
854}
855
856bool
860 bool indexUnchanged,
861 IndexInfo *indexInfo)
862{
863 GinState *ginstate = (GinState *) indexInfo->ii_AmCache;
866 int i;
867
868 /* Initialize GinState cache if first call in this statement */
869 if (ginstate == NULL)
870 {
872 ginstate = palloc_object(GinState);
873 initGinState(ginstate, index);
874 indexInfo->ii_AmCache = ginstate;
876 }
877
879 "Gin insert temporary context",
881
883
885 {
887
888 memset(&collector, 0, sizeof(GinTupleCollector));
889
890 for (i = 0; i < ginstate->origTupdesc->natts; i++)
892 (OffsetNumber) (i + 1),
893 values[i], isnull[i],
894 ht_ctid);
895
897 }
898 else
899 {
900 for (i = 0; i < ginstate->origTupdesc->natts; i++)
901 ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1),
902 values[i], isnull[i],
903 ht_ctid);
904 }
905
908
909 return false;
910}
911
912/*
913 * Create parallel context, and launch workers for leader.
914 *
915 * buildstate argument should be initialized (with the exception of the
916 * tuplesort states, which may later be created based on shared
917 * state initially set up here).
918 *
919 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
920 *
921 * request is the target number of parallel worker processes to launch.
922 *
923 * Sets buildstate's GinLeader, which caller must use to shut down parallel
924 * mode by passing it to _gin_end_parallel() at the very end of its index
925 * build. If not even a single worker process can be launched, this is
926 * never set, and caller should proceed with a serial index build.
927 */
928static void
930 bool isconcurrent, int request)
931{
932 ParallelContext *pcxt;
933 int scantuplesortstates;
934 Snapshot snapshot;
937 GinBuildShared *ginshared;
938 Sharedsort *sharedsort;
940 WalUsage *walusage;
941 BufferUsage *bufferusage;
942 bool leaderparticipates = true;
943 int querylen;
944
945#ifdef DISABLE_LEADER_PARTICIPATION
946 leaderparticipates = false;
947#endif
948
949 /*
950 * Enter parallel mode, and create context for parallel build of gin index
951 */
953 Assert(request > 0);
954 pcxt = CreateParallelContext("postgres", "_gin_parallel_build_main",
955 request);
956
957 scantuplesortstates = leaderparticipates ? request + 1 : request;
958
959 /*
960 * Prepare for scan of the base relation. In a normal index build, we use
961 * SnapshotAny because we must retrieve all tuples and do our own time
962 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
963 * concurrent build, we take a regular MVCC snapshot and index whatever's
964 * live according to that.
965 */
966 if (!isconcurrent)
967 snapshot = SnapshotAny;
968 else
970
971 /*
972 * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace.
973 */
976 estsort = tuplesort_estimate_shared(scantuplesortstates);
978
980
981 /*
982 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
983 * and PARALLEL_KEY_BUFFER_USAGE.
984 *
985 * If there are no extensions loaded that care, we could skip this. We
986 * have no way of knowing whether anyone's looking at pgWalUsage or
987 * pgBufferUsage, so do it unconditionally.
988 */
990 mul_size(sizeof(WalUsage), pcxt->nworkers));
993 mul_size(sizeof(BufferUsage), pcxt->nworkers));
995
996 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
998 {
1002 }
1003 else
1004 querylen = 0; /* keep compiler quiet */
1005
1006 /* Everyone's had a chance to ask for space, so now create the DSM */
1008
1009 /* If no DSM segment was available, back out (do serial build) */
1010 if (pcxt->seg == NULL)
1011 {
1012 if (IsMVCCSnapshot(snapshot))
1013 UnregisterSnapshot(snapshot);
1016 return;
1017 }
1018
1019 /* Store shared build state, for which we reserved space */
1020 ginshared = (GinBuildShared *) shm_toc_allocate(pcxt->toc, estginshared);
1021 /* Initialize immutable state */
1022 ginshared->heaprelid = RelationGetRelid(heap);
1023 ginshared->indexrelid = RelationGetRelid(index);
1024 ginshared->isconcurrent = isconcurrent;
1025 ginshared->scantuplesortstates = scantuplesortstates;
1026
1028 SpinLockInit(&ginshared->mutex);
1029
1030 /* Initialize mutable state */
1031 ginshared->nparticipantsdone = 0;
1032 ginshared->reltuples = 0.0;
1033 ginshared->indtuples = 0.0;
1034
1037 snapshot);
1038
1039 /*
1040 * Store shared tuplesort-private state, for which we reserved space.
1041 * Then, initialize opaque state using tuplesort routine.
1042 */
1043 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1044 tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1045 pcxt->seg);
1046
1047 shm_toc_insert(pcxt->toc, PARALLEL_KEY_GIN_SHARED, ginshared);
1048 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1049
1050 /* Store query string for workers */
1052 {
1053 char *sharedquery;
1054
1055 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1058 }
1059
1060 /*
1061 * Allocate space for each worker's WalUsage and BufferUsage; no need to
1062 * initialize.
1063 */
1064 walusage = shm_toc_allocate(pcxt->toc,
1065 mul_size(sizeof(WalUsage), pcxt->nworkers));
1066 shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1067 bufferusage = shm_toc_allocate(pcxt->toc,
1068 mul_size(sizeof(BufferUsage), pcxt->nworkers));
1069 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1070
1071 /* Launch workers, saving status for leader/caller */
1073 ginleader->pcxt = pcxt;
1074 ginleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1076 ginleader->nparticipanttuplesorts++;
1077 ginleader->ginshared = ginshared;
1078 ginleader->sharedsort = sharedsort;
1079 ginleader->snapshot = snapshot;
1080 ginleader->walusage = walusage;
1081 ginleader->bufferusage = bufferusage;
1082
1083 /* If no workers were successfully launched, back out (do serial build) */
1084 if (pcxt->nworkers_launched == 0)
1085 {
1087 return;
1088 }
1089
1090 /* Save leader state now that it's clear build will be parallel */
1091 buildstate->bs_leader = ginleader;
1092
1093 /* Join heap scan ourselves */
1096
1097 /*
1098 * Caller needs to wait for all launched workers when we return. Make
1099 * sure that the failure-to-start case will not hang forever.
1100 */
1102}
1103
1104/*
1105 * Shut down workers, destroy parallel context, and end parallel mode.
1106 */
1107static void
1109{
1110 int i;
1111
1112 /* Shutdown worker processes */
1114
1115 /*
1116 * Next, accumulate WAL usage. (This must wait for the workers to finish,
1117 * or we might get incomplete data.)
1118 */
1119 for (i = 0; i < ginleader->pcxt->nworkers_launched; i++)
1120 InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]);
1121
1122 /* Free last reference to MVCC snapshot, if one was used */
1123 if (IsMVCCSnapshot(ginleader->snapshot))
1124 UnregisterSnapshot(ginleader->snapshot);
1127}
1128
1129/*
1130 * Within leader, wait for end of heap scan.
1131 *
1132 * When called, parallel heap scan started by _gin_begin_parallel() will
1133 * already be underway within worker processes (when leader participates
1134 * as a worker, we should end up here just as workers are finishing).
1135 *
1136 * Returns the total number of heap tuples scanned.
1137 */
1138static double
1140{
1141 GinBuildShared *ginshared = state->bs_leader->ginshared;
1142 int nparticipanttuplesorts;
1143
1144 nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
1145 for (;;)
1146 {
1147 SpinLockAcquire(&ginshared->mutex);
1148 if (ginshared->nparticipantsdone == nparticipanttuplesorts)
1149 {
1150 /* copy the data into leader state */
1151 state->bs_reltuples = ginshared->reltuples;
1152 state->bs_numtuples = ginshared->indtuples;
1153
1154 SpinLockRelease(&ginshared->mutex);
1155 break;
1156 }
1157 SpinLockRelease(&ginshared->mutex);
1158
1161 }
1162
1164
1165 return state->bs_reltuples;
1166}
1167
1168/*
1169 * Buffer used to accumulate TIDs from multiple GinTuples for the same key
1170 * (we read these from the tuplesort, sorted by the key).
1171 *
1172 * This is similar to BuildAccumulator in that it's used to collect TIDs
1173 * in memory before inserting them into the index, but it's much simpler
1174 * as it only deals with a single index key at a time.
1175 *
1176 * When adding TIDs to the buffer, we make sure to keep them sorted, both
1177 * during the initial table scan (and detecting when the scan wraps around),
1178 * and during merging (where we do mergesort).
1179 */
1180typedef struct GinBuffer
1181{
1184 Datum key; /* 0 if no key (and keylen == 0) */
1185 Size keylen; /* number of bytes (not typlen) */
1186
1187 /* type info */
1190
1191 /* Number of TIDs to collect before attempt to write some out. */
1193
1194 /* array of TID values */
1197 SortSupport ssup; /* for sorting/comparing keys */
1200
1201/*
1202 * Check that TID array contains valid values, and that it's sorted (if we
1203 * expect it to be).
1204 */
1205static void
1207{
1208#ifdef USE_ASSERT_CHECKING
1209 /* we should not have a buffer with no TIDs to sort */
1210 Assert(buffer->items != NULL);
1211 Assert(buffer->nitems > 0);
1212
1213 for (int i = 0; i < buffer->nitems; i++)
1214 {
1215 Assert(ItemPointerIsValid(&buffer->items[i]));
1216
1217 /* don't check ordering for the first TID item */
1218 if (i == 0)
1219 continue;
1220
1221 Assert(ItemPointerCompare(&buffer->items[i - 1], &buffer->items[i]) < 0);
1222 }
1223#endif
1224}
1225
1226/*
1227 * GinBuffer checks
1228 *
1229 * Make sure the nitems/items fields are consistent (either the array is empty
1230 * or not empty, the fields need to agree). If there are items, check ordering.
1231 */
1232static void
1234{
1235#ifdef USE_ASSERT_CHECKING
1236 /* if we have any items, the array must exist */
1237 Assert(!((buffer->nitems > 0) && (buffer->items == NULL)));
1238
1239 /*
1240 * The buffer may be empty, in which case we must not call the check of
1241 * item pointers, because that assumes non-emptiness.
1242 */
1243 if (buffer->nitems == 0)
1244 return;
1245
1246 /* Make sure the item pointers are valid and sorted. */
1248#endif
1249}
1250
1251/*
1252 * GinBufferInit
1253 * Initialize buffer to store tuples for a GIN index.
1254 *
1255 * Initialize the buffer used to accumulate TID for a single key at a time
1256 * (we process the data sorted), so we know when we received all data for
1257 * a given key.
1258 *
1259 * Initializes sort support procedures for all index attributes.
1260 */
1261static GinBuffer *
1263{
1265 int i,
1266 nKeys;
1268
1269 /*
1270 * How many items can we fit into the memory limit? We don't want to end
1271 * with too many TIDs. and 64kB seems more than enough. But maybe this
1272 * should be tied to maintenance_work_mem or something like that?
1273 */
1274 buffer->maxitems = (64 * 1024L) / sizeof(ItemPointerData);
1275
1277
1278 buffer->ssup = palloc0_array(SortSupportData, nKeys);
1279
1280 /*
1281 * Lookup ordering operator for the index key data type, and initialize
1282 * the sort support function.
1283 */
1284 for (i = 0; i < nKeys; i++)
1285 {
1286 Oid cmpFunc;
1287 SortSupport sortKey = &buffer->ssup[i];
1289
1290 sortKey->ssup_cxt = CurrentMemoryContext;
1291 sortKey->ssup_collation = index->rd_indcollation[i];
1292
1293 if (!OidIsValid(sortKey->ssup_collation))
1294 sortKey->ssup_collation = DEFAULT_COLLATION_OID;
1295
1296 sortKey->ssup_nulls_first = false;
1297 sortKey->ssup_attno = i + 1;
1298 sortKey->abbreviate = false;
1299
1300 Assert(sortKey->ssup_attno != 0);
1301
1302 /*
1303 * If the compare proc isn't specified in the opclass definition, look
1304 * up the index key type's default btree comparator.
1305 */
1307 if (cmpFunc == InvalidOid)
1308 {
1309 TypeCacheEntry *typentry;
1310
1311 typentry = lookup_type_cache(att->atttypid,
1313 if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
1314 ereport(ERROR,
1316 errmsg("could not identify a comparison function for type %s",
1317 format_type_be(att->atttypid))));
1318
1319 cmpFunc = typentry->cmp_proc_finfo.fn_oid;
1320 }
1321
1323 }
1324
1325 return buffer;
1326}
1327
1328/* Is the buffer empty, i.e. has no TID values in the array? */
1329static bool
1331{
1332 return (buffer->nitems == 0);
1333}
1334
1335/*
1336 * GinBufferKeyEquals
1337 * Can the buffer store TIDs for the provided GIN tuple (same key)?
1338 *
1339 * Compare if the tuple matches the already accumulated data in the GIN
1340 * buffer. Compare scalar fields first, before the actual key.
1341 *
1342 * Returns true if the key matches, and the TID belongs to the buffer, or
1343 * false if the key does not match.
1344 */
1345static bool
1347{
1348 int r;
1349 Datum tupkey;
1350
1351 AssertCheckGinBuffer(buffer);
1352
1353 if (tup->attrnum != buffer->attnum)
1354 return false;
1355
1356 /* same attribute should have the same type info */
1357 Assert(tup->typbyval == buffer->typbyval);
1358 Assert(tup->typlen == buffer->typlen);
1359
1360 if (tup->category != buffer->category)
1361 return false;
1362
1363 /*
1364 * For NULL/empty keys, this means equality, for normal keys we need to
1365 * compare the actual key value.
1366 */
1367 if (buffer->category != GIN_CAT_NORM_KEY)
1368 return true;
1369
1370 /*
1371 * For the tuple, get either the first sizeof(Datum) bytes for byval
1372 * types, or a pointer to the beginning of the data array.
1373 */
1374 tupkey = (buffer->typbyval) ? *(Datum *) tup->data : PointerGetDatum(tup->data);
1375
1376 r = ApplySortComparator(buffer->key, false,
1377 tupkey, false,
1378 &buffer->ssup[buffer->attnum - 1]);
1379
1380 return (r == 0);
1381}
1382
1383/*
1384 * GinBufferShouldTrim
1385 * Should we trim the list of item pointers?
1386 *
1387 * By trimming we understand writing out and removing the tuple IDs that
1388 * we know can't change by future merges. We can deduce the TID up to which
1389 * this is guaranteed from the "first" TID in each GIN tuple, which provides
1390 * a "horizon" (for a given key) thanks to the sort.
1391 *
1392 * We don't want to do this too often - compressing longer TID lists is more
1393 * efficient. But we also don't want to accumulate too many TIDs, for two
1394 * reasons. First, it consumes memory and we might exceed maintenance_work_mem
1395 * (or whatever limit applies), even if that's unlikely because TIDs are very
1396 * small so we can fit a lot of them. Second, and more importantly, long TID
1397 * lists are an issue if the scan wraps around, because a key may get a very
1398 * wide list (with min/max TID for that key), forcing "full" mergesorts for
1399 * every list merged into it (instead of the efficient append).
1400 *
1401 * So we look at two things when deciding if to trim - if the resulting list
1402 * (after adding TIDs from the new tuple) would be too long, and if there is
1403 * enough TIDs to trim (with values less than "first" TID from the new tuple),
 1404 * we do the trim. By enough we mean at least 1024 TIDs (mostly an arbitrary
1405 * number).
1406 *
1407 * We try freezing TIDs at the beginning of the list first, before attempting
1408 * to trim the buffer. This may allow trimming the data earlier, reducing the
1409 * memory usage and excluding it from the mergesort.
1410 */
1411static bool
1413{
1414 /*
1415 * Check if the last TID in the current list is frozen. This is the case
1416 * when merging non-overlapping lists, e.g. in each parallel worker.
1417 */
1418 if ((buffer->nitems > 0) &&
1419 (ItemPointerCompare(&buffer->items[buffer->nitems - 1],
1420 GinTupleGetFirst(tup)) == 0))
1421 buffer->nfrozen = buffer->nitems;
1422
1423 /*
1424 * Now find the last TID we know to be frozen, i.e. the last TID right
1425 * before the new GIN tuple.
1426 *
1427 * Start with the first not-yet-frozen tuple, and walk until we find the
1428 * first TID that's higher. If we already know the whole list is frozen
1429 * (i.e. nfrozen == nitems), this does nothing.
1430 *
1431 * XXX This might do a binary search for sufficiently long lists, but it
1432 * does not seem worth the complexity. Overlapping lists should be rare
1433 * common, TID comparisons are cheap, and we should quickly freeze most of
1434 * the list.
1435 */
1436 for (int i = buffer->nfrozen; i < buffer->nitems; i++)
1437 {
1438 /* Is the TID after the first TID of the new tuple? Can't freeze. */
1439 if (ItemPointerCompare(&buffer->items[i],
1440 GinTupleGetFirst(tup)) > 0)
1441 break;
1442
1443 buffer->nfrozen++;
1444 }
1445
1446 /* not enough TIDs to trim (1024 is somewhat arbitrary number) */
1447 if (buffer->nfrozen < 1024)
1448 return false;
1449
1450 /* no need to trim if we have not hit the memory limit yet */
1451 if ((buffer->nitems + tup->nitems) < buffer->maxitems)
1452 return false;
1453
1454 /*
1455 * OK, we have enough frozen TIDs to flush, and we have hit the memory
1456 * limit, so it's time to write it out.
1457 */
1458 return true;
1459}
1460
1461/*
1462 * GinBufferStoreTuple
1463 * Add data (especially TID list) from a GIN tuple to the buffer.
1464 *
1465 * The buffer is expected to be empty (in which case it's initialized), or
1466 * having the same key. The TID values from the tuple are combined with the
1467 * stored values using a merge sort.
1468 *
1469 * The tuples (for the same key) are expected to be sorted by first TID. But
1470 * this does not guarantee the lists do not overlap, especially in the leader,
1471 * because the workers process interleaving data. There should be no overlaps
1472 * in a single worker - it could happen when the parallel scan wraps around,
1473 * but we detect that and flush the data (see ginBuildCallbackParallel).
1474 *
1475 * By sorting the GinTuple not only by key, but also by the first TID, we make
 1476 * it less likely the lists will overlap during merge. We merge them using
1477 * mergesort, but it's cheaper to just append one list to the other.
1478 *
1479 * How often can the lists overlap? There should be no overlaps in workers,
1480 * and in the leader we can see overlaps between lists built by different
1481 * workers. But the workers merge the items as much as possible, so there
1482 * should not be too many.
1483 */
1484static void
1486{
1488 Datum key;
1489
1490 AssertCheckGinBuffer(buffer);
1491
1494
1495 /* if the buffer is empty, set the fields (and copy the key) */
1496 if (GinBufferIsEmpty(buffer))
1497 {
1498 buffer->category = tup->category;
1499 buffer->keylen = tup->keylen;
1500 buffer->attnum = tup->attrnum;
1501
1502 buffer->typlen = tup->typlen;
1503 buffer->typbyval = tup->typbyval;
1504
1505 if (tup->category == GIN_CAT_NORM_KEY)
1506 buffer->key = datumCopy(key, buffer->typbyval, buffer->typlen);
1507 else
1508 buffer->key = (Datum) 0;
1509 }
1510
1511 /* add the new TIDs into the buffer, combine using merge-sort */
1512 {
1513 int nnew;
1514 ItemPointer new;
1515
1516 /*
1517 * Resize the array - we do this first, because we'll dereference the
1518 * first unfrozen TID, which would fail if the array is NULL. We'll
1519 * still pass 0 as number of elements in that array though.
1520 */
1521 if (buffer->items == NULL)
1522 buffer->items = palloc((buffer->nitems + tup->nitems) * sizeof(ItemPointerData));
1523 else
1524 buffer->items = repalloc(buffer->items,
1525 (buffer->nitems + tup->nitems) * sizeof(ItemPointerData));
1526
1527 new = ginMergeItemPointers(&buffer->items[buffer->nfrozen], /* first unfrozen */
1528 (buffer->nitems - buffer->nfrozen), /* num of unfrozen */
1529 items, tup->nitems, &nnew);
1530
1531 Assert(nnew == (tup->nitems + (buffer->nitems - buffer->nfrozen)));
1532
1533 memcpy(&buffer->items[buffer->nfrozen], new,
1534 nnew * sizeof(ItemPointerData));
1535
1536 pfree(new);
1537
1538 buffer->nitems += tup->nitems;
1539
1541 }
1542
1543 /* free the decompressed TID list */
1544 pfree(items);
1545}
1546
1547/*
1548 * GinBufferReset
1549 * Reset the buffer into a state as if it contains no data.
1550 */
1551static void
1553{
1554 Assert(!GinBufferIsEmpty(buffer));
1555
1556 /* release byref values, do nothing for by-val ones */
1557 if ((buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval)
1558 pfree(DatumGetPointer(buffer->key));
1559
1560 /*
1561 * Not required, but makes it more likely to trigger NULL dereference if
1562 * using the value incorrectly, etc.
1563 */
1564 buffer->key = (Datum) 0;
1565
1566 buffer->attnum = 0;
1567 buffer->category = 0;
1568 buffer->keylen = 0;
1569 buffer->nitems = 0;
1570 buffer->nfrozen = 0;
1571
1572 buffer->typlen = 0;
1573 buffer->typbyval = 0;
1574}
1575
1576/*
1577 * GinBufferTrim
1578 * Discard the "frozen" part of the TID list (which should have been
1579 * written to disk/index before this call).
1580 */
1581static void
1583{
1584 Assert((buffer->nfrozen > 0) && (buffer->nfrozen <= buffer->nitems));
1585
1586 memmove(&buffer->items[0], &buffer->items[buffer->nfrozen],
1587 sizeof(ItemPointerData) * (buffer->nitems - buffer->nfrozen));
1588
1589 buffer->nitems -= buffer->nfrozen;
1590 buffer->nfrozen = 0;
1591}
1592
1593/*
1594 * GinBufferFree
1595 * Release memory associated with the GinBuffer (including TID array).
1596 */
1597static void
1599{
1600 if (buffer->items)
1601 pfree(buffer->items);
1602
1603 /* release byref values, do nothing for by-val ones */
1604 if (!GinBufferIsEmpty(buffer) &&
1605 (buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval)
1606 pfree(DatumGetPointer(buffer->key));
1607
1608 pfree(buffer);
1609}
1610
1611/*
1612 * GinBufferCanAddKey
1613 * Check if a given GIN tuple can be added to the current buffer.
1614 *
1615 * Returns true if the buffer is either empty or for the same index key.
1616 */
1617static bool
1619{
1620 /* empty buffer can accept data for any key */
1621 if (GinBufferIsEmpty(buffer))
1622 return true;
1623
1624 /* otherwise just data for the same key */
1625 return GinBufferKeyEquals(buffer, tup);
1626}
1627
1628/*
1629 * Within leader, wait for end of heap scan and merge per-worker results.
1630 *
1631 * After waiting for all workers to finish, merge the per-worker results into
1632 * the complete index. The results from each worker are sorted by block number
1633 * (start of the page range). While combining the per-worker results we merge
1634 * summaries for the same page range, and also fill-in empty summaries for
1635 * ranges without any tuples.
1636 *
1637 * Returns the total number of heap tuples scanned.
1638 */
1639static double
1641{
1642 GinTuple *tup;
1643 Size tuplen;
1644 double reltuples = 0;
1645 GinBuffer *buffer;
1646
1647 /* GIN tuples from workers, merged by leader */
1648 double numtuples = 0;
1649
1650 /* wait for workers to scan table and produce partial results */
1651 reltuples = _gin_parallel_heapscan(state);
1652
1653 /* Execute the sort */
1656
1657 /* do the actual sort in the leader */
1658 tuplesort_performsort(state->bs_sortstate);
1659
1660 /*
1661 * Initialize buffer to combine entries for the same key.
1662 *
1663 * The leader is allowed to use the whole maintenance_work_mem buffer to
1664 * combine data. The parallel workers already completed.
1665 */
1666 buffer = GinBufferInit(state->ginstate.index);
1667
1668 /*
1669 * Set the progress target for the next phase. Reset the block number
1670 * values set by table_index_build_scan
1671 */
1672 {
1673 const int progress_index[] = {
1678 };
1679 const int64 progress_vals[] = {
1681 state->bs_numtuples,
1682 0, 0
1683 };
1684
1686 }
1687
1688 /*
1689 * Read the GIN tuples from the shared tuplesort, sorted by category and
1690 * key. That probably gives us order matching how data is organized in the
1691 * index.
1692 *
1693 * We don't insert the GIN tuples right away, but instead accumulate as
1694 * many TIDs for the same key as possible, and then insert that at once.
1695 * This way we don't need to decompress/recompress the posting lists, etc.
1696 */
1697 while ((tup = tuplesort_getgintuple(state->bs_sortstate, &tuplen, true)) != NULL)
1698 {
1700
1702
1703 /*
1704 * If the buffer can accept the new GIN tuple, just store it there and
1705 * we're done. If it's a different key (or maybe too much data) flush
1706 * the current contents into the index first.
1707 */
1708 if (!GinBufferCanAddKey(buffer, tup))
1709 {
1710 /*
1711 * Buffer is not empty and it's storing a different key - flush
1712 * the data into the insert, and start a new entry for current
1713 * GinTuple.
1714 */
1716
1718
1719 ginEntryInsert(&state->ginstate,
1720 buffer->attnum, buffer->key, buffer->category,
1721 buffer->items, buffer->nitems, &state->buildStats);
1722
1724 MemoryContextReset(state->tmpCtx);
1725
1726 /* discard the existing data */
1727 GinBufferReset(buffer);
1728 }
1729
1730 /*
1731 * We're about to add a GIN tuple to the buffer - check the memory
1732 * limit first, and maybe write out some of the data into the index
1733 * first, if needed (and possible). We only flush the part of the TID
1734 * list that we know won't change, and only if there's enough data for
1735 * compression to work well.
1736 */
1737 if (GinBufferShouldTrim(buffer, tup))
1738 {
1739 Assert(buffer->nfrozen > 0);
1740
1741 /*
1742 * Buffer is not empty and it's storing a different key - flush
1743 * the data into the insert, and start a new entry for current
1744 * GinTuple.
1745 */
1747
1749
1750 ginEntryInsert(&state->ginstate,
1751 buffer->attnum, buffer->key, buffer->category,
1752 buffer->items, buffer->nfrozen, &state->buildStats);
1753
1755 MemoryContextReset(state->tmpCtx);
1756
1757 /* truncate the data we've just discarded */
1758 GinBufferTrim(buffer);
1759 }
1760
1761 /*
1762 * Remember data for the current tuple (either remember the new key,
1763 * or append if to the existing data).
1764 */
1765 GinBufferStoreTuple(buffer, tup);
1766
1767 /* Report progress */
1769 ++numtuples);
1770 }
1771
1772 /* flush data remaining in the buffer (for the last key) */
1773 if (!GinBufferIsEmpty(buffer))
1774 {
1776
1777 ginEntryInsert(&state->ginstate,
1778 buffer->attnum, buffer->key, buffer->category,
1779 buffer->items, buffer->nitems, &state->buildStats);
1780
1781 /* discard the existing data */
1782 GinBufferReset(buffer);
1783
1784 /* Report progress */
1786 ++numtuples);
1787 }
1788
1789 /* release all the memory */
1790 GinBufferFree(buffer);
1791
1792 tuplesort_end(state->bs_sortstate);
1793
1794 return reltuples;
1795}
1796
1797/*
1798 * Returns size of shared memory required to store state for a parallel
1799 * gin index build based on the snapshot its parallel scan will use.
1800 */
1801static Size
1803{
1804 /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1805 return add_size(BUFFERALIGN(sizeof(GinBuildShared)),
1806 table_parallelscan_estimate(heap, snapshot));
1807}
1808
1809/*
1810 * Within leader, participate as a parallel worker.
1811 */
1812static void
1814{
1815 GinLeader *ginleader = buildstate->bs_leader;
1816 int sortmem;
1817
1818 /*
1819 * Might as well use reliable figure when doling out maintenance_work_mem
1820 * (when requested number of workers were not launched, this will be
1821 * somewhat higher than it is for other workers).
1822 */
1824
1825 /* Perform work common to all participants */
1827 ginleader->sharedsort, heap, index,
1828 sortmem, true);
1829}
1830
1831/*
1832 * _gin_process_worker_data
1833 * First phase of the key merging, happening in the worker.
1834 *
1835 * Depending on the number of distinct keys, the TID lists produced by the
1836 * callback may be very short (due to frequent evictions in the callback).
1837 * But combining many tiny lists is expensive, so we try to do as much as
1838 * possible in the workers and only then pass the results to the leader.
1839 *
1840 * We read the tuples sorted by the key, and merge them into larger lists.
1841 * At the moment there's no memory limit, so this will just produce one
1842 * huge (sorted) list per key in each worker. Which means the leader will
1843 * do a very limited number of mergesorts, which is good.
1844 */
1845static void
1847 bool progress)
1848{
1849 GinTuple *tup;
1850 Size tuplen;
1851
1852 GinBuffer *buffer;
1853
1854 /*
1855 * Initialize buffer to combine entries for the same key.
1856 *
1857 * The workers are limited to the same amount of memory as during the sort
1858 * in ginBuildCallbackParallel. But this probably should be the 32MB used
1859 * during planning, just like there.
1860 */
1861 buffer = GinBufferInit(state->ginstate.index);
1862
1863 /* sort the raw per-worker data */
1864 if (progress)
1867
1868 tuplesort_performsort(state->bs_worker_sort);
1869
1870 /* reset the number of GIN tuples produced by this worker */
1871 state->bs_numtuples = 0;
1872
1873 if (progress)
1876
1877 /*
1878 * Read the GIN tuples from the shared tuplesort, sorted by the key, and
1879 * merge them into larger chunks for the leader to combine.
1880 */
1881 while ((tup = tuplesort_getgintuple(worker_sort, &tuplen, true)) != NULL)
1882 {
1883
1885
1886 /*
1887 * If the buffer can accept the new GIN tuple, just store it there and
1888 * we're done. If it's a different key (or maybe too much data) flush
1889 * the current contents into the index first.
1890 */
1891 if (!GinBufferCanAddKey(buffer, tup))
1892 {
1893 GinTuple *ntup;
1894 Size ntuplen;
1895
1896 /*
1897 * Buffer is not empty and it's storing a different key - flush
1898 * the data into the insert, and start a new entry for current
1899 * GinTuple.
1900 */
1902
1903 ntup = _gin_build_tuple(buffer->attnum, buffer->category,
1904 buffer->key, buffer->typlen, buffer->typbyval,
1905 buffer->items, buffer->nitems, &ntuplen);
1906
1907 tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
1908 state->bs_numtuples++;
1909
1910 pfree(ntup);
1911
1912 /* discard the existing data */
1913 GinBufferReset(buffer);
1914 }
1915
1916 /*
1917 * We're about to add a GIN tuple to the buffer - check the memory
1918 * limit first, and maybe write out some of the data into the index
1919 * first, if needed (and possible). We only flush the part of the TID
1920 * list that we know won't change, and only if there's enough data for
1921 * compression to work well.
1922 */
1923 if (GinBufferShouldTrim(buffer, tup))
1924 {
1925 GinTuple *ntup;
1926 Size ntuplen;
1927
1928 Assert(buffer->nfrozen > 0);
1929
1930 /*
1931 * Buffer is not empty and it's storing a different key - flush
1932 * the data into the insert, and start a new entry for current
1933 * GinTuple.
1934 */
1936
1937 ntup = _gin_build_tuple(buffer->attnum, buffer->category,
1938 buffer->key, buffer->typlen, buffer->typbyval,
1939 buffer->items, buffer->nfrozen, &ntuplen);
1940
1941 tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
1942
1943 pfree(ntup);
1944
1945 /* truncate the data we've just discarded */
1946 GinBufferTrim(buffer);
1947 }
1948
1949 /*
1950 * Remember data for the current tuple (either remember the new key,
1951 * or append if to the existing data).
1952 */
1953 GinBufferStoreTuple(buffer, tup);
1954 }
1955
1956 /* flush data remaining in the buffer (for the last key) */
1957 if (!GinBufferIsEmpty(buffer))
1958 {
1959 GinTuple *ntup;
1960 Size ntuplen;
1961
1963
1964 ntup = _gin_build_tuple(buffer->attnum, buffer->category,
1965 buffer->key, buffer->typlen, buffer->typbyval,
1966 buffer->items, buffer->nitems, &ntuplen);
1967
1968 tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
1969 state->bs_numtuples++;
1970
1971 pfree(ntup);
1972
1973 /* discard the existing data */
1974 GinBufferReset(buffer);
1975 }
1976
1977 /* release all the memory */
1978 GinBufferFree(buffer);
1979
1981}
1982
1983/*
1984 * Perform a worker's portion of a parallel GIN index build sort.
1985 *
1986 * This generates a tuplesort for the worker portion of the table.
1987 *
1988 * sortmem is the amount of working memory to use within each worker,
1989 * expressed in KBs.
1990 *
1991 * When this returns, workers are done, and need only release resources.
1992 *
1993 * Before feeding data into a shared tuplesort (for the leader process),
1994 * the workers process data in two phases.
1995 *
1996 * 1) A worker reads a portion of rows from the table, accumulates entries
1997 * in memory, and flushes them into a private tuplesort (e.g. because of
1998 * using too much memory).
1999 *
2000 * 2) The private tuplesort gets sorted (by key and TID), the worker reads
2001 * the data again, and combines the entries as much as possible. This has
2002 * to happen eventually, and this way it's done in workers in parallel.
2003 *
2004 * Finally, the combined entries are written into the shared tuplesort, so
2005 * that the leader can process them.
2006 *
2007 * How well this works (compared to just writing entries into the shared
2008 * tuplesort) depends on the data set. For large tables with many distinct
2009 * keys this helps a lot. With many distinct keys it's likely the buffers has
2010 * to be flushed often, generating many entries with the same key and short
2011 * TID lists. These entries need to be sorted and merged at some point,
2012 * before writing them to the index. The merging is quite expensive, it can
2013 * easily be ~50% of a serial build, and doing as much of it in the workers
2014 * means it's parallelized. The leader still has to merge results from the
2015 * workers, but it's much more efficient to merge few large entries than
2016 * many tiny ones.
2017 *
2018 * This also reduces the amount of data the workers pass to the leader through
2019 * the shared tuplesort. OTOH the workers need more space for the private sort,
2020 * possibly up to 2x of the data, if no entries be merged in a worker. But this
2021 * is very unlikely, and the only consequence is inefficiency, so we ignore it.
2022 */
2023static void
2025 GinBuildShared *ginshared, Sharedsort *sharedsort,
2026 Relation heap, Relation index,
2027 int sortmem, bool progress)
2028{
2030 TableScanDesc scan;
2031 double reltuples;
2032 IndexInfo *indexInfo;
2033
2034 /* Initialize local tuplesort coordination state */
2036 coordinate->isWorker = true;
2037 coordinate->nParticipants = -1;
2038 coordinate->sharedsort = sharedsort;
2039
2040 /* remember how much space is allowed for the accumulated entries */
2041 state->work_mem = (sortmem / 2);
2042
2043 /* remember how many workers participate in the build */
2044 state->bs_num_workers = ginshared->scantuplesortstates;
2045
2046 /* Begin "partial" tuplesort */
2047 state->bs_sortstate = tuplesort_begin_index_gin(heap, index,
2048 state->work_mem,
2049 coordinate,
2051
2052 /* Local per-worker sort of raw-data */
2053 state->bs_worker_sort = tuplesort_begin_index_gin(heap, index,
2054 state->work_mem,
2055 NULL,
2057
2058 /* Join parallel scan */
2059 indexInfo = BuildIndexInfo(index);
2060 indexInfo->ii_Concurrent = ginshared->isconcurrent;
2061
2062 scan = table_beginscan_parallel(heap,
2064
2065 reltuples = table_index_build_scan(heap, index, indexInfo, true, progress,
2067
2068 /* write remaining accumulated entries */
2070
2071 /*
2072 * Do the first phase of in-worker processing - sort the data produced by
2073 * the callback, and combine them into much larger chunks and place that
2074 * into the shared tuplestore for leader to process.
2075 */
2076 _gin_process_worker_data(state, state->bs_worker_sort, progress);
2077
2078 /* sort the GIN tuples built by this worker */
2079 tuplesort_performsort(state->bs_sortstate);
2080
2081 state->bs_reltuples += reltuples;
2082
2083 /*
2084 * Done. Record ambuild statistics.
2085 */
2086 SpinLockAcquire(&ginshared->mutex);
2087 ginshared->nparticipantsdone++;
2088 ginshared->reltuples += state->bs_reltuples;
2089 ginshared->indtuples += state->bs_numtuples;
2090 SpinLockRelease(&ginshared->mutex);
2091
2092 /* Notify leader */
2094
2095 tuplesort_end(state->bs_sortstate);
2096}
2097
2098/*
2099 * Perform work within a launched parallel process.
2100 */
2101void
2103{
2104 char *sharedquery;
2105 GinBuildShared *ginshared;
2106 Sharedsort *sharedsort;
2108 Relation heapRel;
2109 Relation indexRel;
2112 WalUsage *walusage;
2113 BufferUsage *bufferusage;
2114 int sortmem;
2115
2116 /*
2117 * The only possible status flag that can be set to the parallel worker is
2118 * PROC_IN_SAFE_IC.
2119 */
2120 Assert((MyProc->statusFlags == 0) ||
2122
2123 /* Set debug_query_string for individual workers first */
2126
2127 /* Report the query string from leader */
2129
2130 /* Look up gin shared state */
2131 ginshared = shm_toc_lookup(toc, PARALLEL_KEY_GIN_SHARED, false);
2132
2133 /* Open relations using lock modes known to be obtained by index.c */
2134 if (!ginshared->isconcurrent)
2135 {
2138 }
2139 else
2140 {
2143 }
2144
2145 /* Open relations within worker */
2146 heapRel = table_open(ginshared->heaprelid, heapLockmode);
2147 indexRel = index_open(ginshared->indexrelid, indexLockmode);
2148
2149 /* initialize the GIN build state */
2150 initGinState(&buildstate.ginstate, indexRel);
2151 buildstate.indtuples = 0;
2152 memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
2153 memset(&buildstate.tid, 0, sizeof(ItemPointerData));
2154
2155 /*
2156 * create a temporary memory context that is used to hold data not yet
2157 * dumped out to the index
2158 */
2160 "Gin build temporary context",
2162
2163 /*
2164 * create a temporary memory context that is used for calling
2165 * ginExtractEntries(), and can be reset after each tuple
2166 */
2168 "Gin build temporary context for user-defined function",
2170
2171 buildstate.accum.ginstate = &buildstate.ginstate;
2172 ginInitBA(&buildstate.accum);
2173
2174
2175 /* Look up shared state private to tuplesort.c */
2176 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
2177 tuplesort_attach_shared(sharedsort, seg);
2178
2179 /* Prepare to track buffer usage during parallel execution */
2181
2182 /*
2183 * Might as well use reliable figure when doling out maintenance_work_mem
2184 * (when requested number of workers were not launched, this will be
2185 * somewhat higher than it is for other workers).
2186 */
2188
2189 _gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort,
2190 heapRel, indexRel, sortmem, false);
2191
2192 /* Report WAL/buffer usage during parallel execution */
2193 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
2194 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
2196 &walusage[ParallelWorkerNumber]);
2197
2198 index_close(indexRel, indexLockmode);
2199 table_close(heapRel, heapLockmode);
2200}
2201
2202/*
2203 * Used to keep track of compressed TID lists when building a GIN tuple.
2204 */
2205typedef struct
2206{
2207 dlist_node node; /* linked list pointers */
2210
2211/*
2212 * _gin_build_tuple
2213 * Serialize the state for an index key into a tuple for tuplesort.
2214 *
2215 * The tuple has a number of scalar fields (mostly matching the build state),
2216 * and then a data array that stores the key first, and then the TID list.
2217 *
2218 * For by-reference data types, we store the actual data. For by-val types
2219 * we simply copy the whole Datum, so that we don't have to care about stuff
2220 * like endianness etc. We could make it a little bit smaller, but it's not
2221 * worth it - it's a tiny fraction of the data, and we need to MAXALIGN the
2222 * start of the TID list anyway. So we wouldn't save anything. (This would
2223 * not be a good idea for the permanent in-index data, since we'd prefer
2224 * that that not depend on sizeof(Datum). But this is just a transient
2225 * representation to use while sorting the data.)
2226 *
2227 * The TID list is serialized as compressed - it's highly compressible, and
2228 * we already have ginCompressPostingList for this purpose. The list may be
2229 * pretty long, so we compress it into multiple segments and then copy all
2230 * of that into the GIN tuple.
2231 */
2232static GinTuple *
2233_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
2234 Datum key, int16 typlen, bool typbyval,
2236 Size *len)
2237{
2238 GinTuple *tuple;
2239 char *ptr;
2240
2241 Size tuplen;
2242 int keylen;
2243
2244 dlist_mutable_iter iter;
2245 dlist_head segments;
2246 int ncompressed;
2248
2249 /*
2250 * Calculate how long is the key value. Only keys with GIN_CAT_NORM_KEY
2251 * have actual non-empty key. We include varlena headers and \0 bytes for
2252 * strings, to make it easier to access the data in-line.
2253 *
2254 * For byval types we simply copy the whole Datum. We could store just the
2255 * necessary bytes, but this is simpler to work with and not worth the
2256 * extra complexity. Moreover we still need to do the MAXALIGN to allow
2257 * direct access to items pointers.
2258 *
2259 * XXX Note that for byval types we store the whole datum, no matter what
2260 * the typlen value is.
2261 */
2262 if (category != GIN_CAT_NORM_KEY)
2263 keylen = 0;
2264 else if (typbyval)
2265 keylen = sizeof(Datum);
2266 else if (typlen > 0)
2267 keylen = typlen;
2268 else if (typlen == -1)
2269 keylen = VARSIZE_ANY(DatumGetPointer(key));
2270 else if (typlen == -2)
2271 keylen = strlen(DatumGetPointer(key)) + 1;
2272 else
2273 elog(ERROR, "unexpected typlen value (%d)", typlen);
2274
2275 /* compress the item pointers */
2276 ncompressed = 0;
2277 compresslen = 0;
2278 dlist_init(&segments);
2279
2280 /* generate compressed segments of TID list chunks */
2281 while (ncompressed < nitems)
2282 {
2283 int cnt;
2285
2287 (nitems - ncompressed),
2288 UINT16_MAX,
2289 &cnt);
2290
2291 ncompressed += cnt;
2293
2294 dlist_push_tail(&segments, &seginfo->node);
2295 }
2296
2297 /*
2298 * Determine GIN tuple length with all the data included. Be careful about
2299 * alignment, to allow direct access to compressed segments (those require
2300 * only SHORTALIGN).
2301 */
2302 tuplen = SHORTALIGN(offsetof(GinTuple, data) + keylen) + compresslen;
2303
2304 *len = tuplen;
2305
2306 /*
2307 * Allocate space for the whole GIN tuple.
2308 *
2309 * The palloc0 is needed - writetup_index_gin will write the whole tuple
2310 * to disk, so we need to make sure the padding bytes are defined
2311 * (otherwise valgrind would report this).
2312 */
2313 tuple = palloc0(tuplen);
2314
2315 tuple->tuplen = tuplen;
2316 tuple->attrnum = attrnum;
2317 tuple->category = category;
2318 tuple->keylen = keylen;
2319 tuple->nitems = nitems;
2320
2321 /* key type info */
2322 tuple->typlen = typlen;
2323 tuple->typbyval = typbyval;
2324
2325 /*
2326 * Copy the key and items into the tuple. First the key value, which we
2327 * can simply copy right at the beginning of the data array.
2328 */
2329 if (category == GIN_CAT_NORM_KEY)
2330 {
2331 if (typbyval)
2332 {
2333 memcpy(tuple->data, &key, sizeof(Datum));
2334 }
2335 else if (typlen > 0) /* byref, fixed length */
2336 {
2337 memcpy(tuple->data, DatumGetPointer(key), typlen);
2338 }
2339 else if (typlen == -1)
2340 {
2341 memcpy(tuple->data, DatumGetPointer(key), keylen);
2342 }
2343 else if (typlen == -2)
2344 {
2345 memcpy(tuple->data, DatumGetPointer(key), keylen);
2346 }
2347 }
2348
2349 /* finally, copy the TIDs into the array */
2350 ptr = (char *) tuple + SHORTALIGN(offsetof(GinTuple, data) + keylen);
2351
2352 /* copy in the compressed data, and free the segments */
2353 dlist_foreach_modify(iter, &segments)
2354 {
2356
2357 memcpy(ptr, seginfo->seg, SizeOfGinPostingList(seginfo->seg));
2358
2359 ptr += SizeOfGinPostingList(seginfo->seg);
2360
2361 dlist_delete(&seginfo->node);
2362
2363 pfree(seginfo->seg);
2364 pfree(seginfo);
2365 }
2366
2367 return tuple;
2368}
2369
2370/*
2371 * _gin_parse_tuple_key
2372 * Return a Datum representing the key stored in the tuple.
2373 *
2374 * Most of the tuple fields are directly accessible, the only thing that
2375 * needs more care is the key and the TID list.
2376 *
2377 * For the key, this returns a regular Datum representing it. It's either the
2378 * actual key value, or a pointer to the beginning of the data array (which is
2379 * where the data was copied by _gin_build_tuple).
2380 */
2381static Datum
2383{
2384 Datum key;
2385
2386 if (a->category != GIN_CAT_NORM_KEY)
2387 return (Datum) 0;
2388
2389 if (a->typbyval)
2390 {
2391 memcpy(&key, a->data, a->keylen);
2392 return key;
2393 }
2394
2395 return PointerGetDatum(a->data);
2396}
2397
2398/*
2399* _gin_parse_tuple_items
2400 * Return a pointer to a palloc'd array of decompressed TID array.
2401 */
2402static ItemPointer
2404{
2405 int len;
2406 char *ptr;
2407 int ndecoded;
2409
2410 len = a->tuplen - SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
2411 ptr = (char *) a + SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
2412
2414
2415 Assert(ndecoded == a->nitems);
2416
2417 return items;
2418}
2419
2420/*
2421 * _gin_compare_tuples
2422 * Compare GIN tuples, used by tuplesort during parallel index build.
2423 *
2424 * The scalar fields (attrnum, category) are compared first, the key value is
2425 * compared last. The comparisons are done using type-specific sort support
2426 * functions.
2427 *
2428 * If the key value matches, we compare the first TID value in the TID list,
2429 * which means the tuples are merged in an order in which they are most
2430 * likely to be simply concatenated. (This "first" TID will also allow us
2431 * to determine a point up to which the list is fully determined and can be
2432 * written into the index to enforce a memory limit etc.)
2433 */
2434int
2436{
2437 int r;
2438 Datum keya,
2439 keyb;
2440
2441 if (a->attrnum < b->attrnum)
2442 return -1;
2443
2444 if (a->attrnum > b->attrnum)
2445 return 1;
2446
2447 if (a->category < b->category)
2448 return -1;
2449
2450 if (a->category > b->category)
2451 return 1;
2452
2453 if (a->category == GIN_CAT_NORM_KEY)
2454 {
2457
2458 r = ApplySortComparator(keya, false,
2459 keyb, false,
2460 &ssup[a->attrnum - 1]);
2461
2462 /* if the key is the same, consider the first TID in the array */
2463 return (r != 0) ? r : ItemPointerCompare(GinTupleGetFirst(a),
2465 }
2466
2469}
int ParallelWorkerNumber
Definition parallel.c:116
void InitializeParallelDSM(ParallelContext *pcxt)
Definition parallel.c:212
void WaitForParallelWorkersToFinish(ParallelContext *pcxt)
Definition parallel.c:804
void LaunchParallelWorkers(ParallelContext *pcxt)
Definition parallel.c:582
void DestroyParallelContext(ParallelContext *pcxt)
Definition parallel.c:958
ParallelContext * CreateParallelContext(const char *library_name, const char *function_name, int nworkers)
Definition parallel.c:174
void WaitForParallelWorkersToAttach(ParallelContext *pcxt)
Definition parallel.c:701
void pgstat_progress_update_param(int index, int64 val)
void pgstat_progress_update_multi_param(int nparam, const int *index, const int64 *val)
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
uint32 BlockNumber
Definition block.h:31
static Datum values[MAXATTR]
Definition bootstrap.c:147
int Buffer
Definition buf.h:23
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4357
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition bufmgr.c:965
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5519
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3057
#define RelationGetNumberOfBlocks(reln)
Definition bufmgr.h:307
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:466
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:328
@ EB_SKIP_EXTENSION_LOCK
Definition bufmgr.h:75
@ EB_LOCK_FIRST
Definition bufmgr.h:87
#define BMR_REL(p_rel)
Definition bufmgr.h:114
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:243
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:353
PageData * Page
Definition bufpage.h:81
#define Min(x, y)
Definition c.h:1019
#define BUFFERALIGN(LEN)
Definition c.h:840
#define Assert(condition)
Definition c.h:885
int64_t int64
Definition c.h:555
int16_t int16
Definition c.h:553
#define SHORTALIGN(LEN)
Definition c.h:834
int32_t int32
Definition c.h:554
uint32_t uint32
Definition c.h:558
#define OidIsValid(objectId)
Definition c.h:800
size_t Size
Definition c.h:631
bool ConditionVariableCancelSleep(void)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void ConditionVariableSignal(ConditionVariable *cv)
Datum datumCopy(Datum value, bool typByVal, int typLen)
Definition datum.c:132
int errcode(int sqlerrcode)
Definition elog.c:874
int errmsg(const char *fmt,...)
Definition elog.c:1093
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
#define palloc_object(type)
Definition fe_memutils.h:74
#define MaxAllocSize
Definition fe_memutils.h:22
#define palloc0_array(type, count)
Definition fe_memutils.h:77
#define palloc0_object(type)
Definition fe_memutils.h:75
char * format_type_be(Oid type_oid)
IndexUniqueCheck
Definition genam.h:122
#define GIN_COMPARE_PROC
Definition gin.h:24
#define PROGRESS_GIN_PHASE_PERFORMSORT_2
Definition gin.h:49
#define PROGRESS_GIN_PHASE_MERGE_1
Definition gin.h:48
#define PROGRESS_GIN_PHASE_PERFORMSORT_1
Definition gin.h:47
#define PROGRESS_GIN_PHASE_MERGE_2
Definition gin.h:50
#define PROGRESS_GIN_PHASE_INDEXBUILD_TABLESCAN
Definition gin.h:46
#define GIN_UNLOCK
Definition gin_private.h:50
#define GinGetUseFastUpdate(relation)
Definition gin_private.h:35
static ItemPointer GinTupleGetFirst(GinTuple *tup)
Definition gin_tuple.h:35
#define GinIsPostingTree(itup)
Definition ginblock.h:231
#define GIN_CAT_NORM_KEY
Definition ginblock.h:208
#define SizeOfGinPostingList(plist)
Definition ginblock.h:342
#define GIN_LEAF
Definition ginblock.h:42
#define GinGetPostingTree(itup)
Definition ginblock.h:233
signed char GinNullCategory
Definition ginblock.h:206
#define GinSetPostingTree(itup, blkno)
Definition ginblock.h:232
#define GinMaxItemSize
Definition ginblock.h:248
void freeGinBtreeStack(GinBtreeStack *stack)
Definition ginbtree.c:198
void ginInsertValue(GinBtree btree, GinBtreeStack *stack, void *insertdata, GinStatsData *buildStats)
Definition ginbtree.c:816
GinBtreeStack * ginFindLeafPage(GinBtree btree, bool searchMode, bool rootConflictCheck)
Definition ginbtree.c:83
void ginBeginBAScan(BuildAccumulator *accum)
Definition ginbulk.c:256
ItemPointerData * ginGetBAEntry(BuildAccumulator *accum, OffsetNumber *attnum, Datum *key, GinNullCategory *category, uint32 *n)
Definition ginbulk.c:267
void ginInsertBAEntries(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber attnum, Datum *entries, GinNullCategory *categories, int32 nentries)
Definition ginbulk.c:209
void ginInitBA(BuildAccumulator *accum)
Definition ginbulk.c:109
BlockNumber createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, GinStatsData *buildStats, Buffer entrybuffer)
void ginInsertItemPointers(Relation index, BlockNumber rootBlkno, ItemPointerData *items, uint32 nitem, GinStatsData *buildStats)
ItemPointer ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup, int *nitems)
IndexTuple GinFormTuple(GinState *ginstate, OffsetNumber attnum, Datum key, GinNullCategory category, Pointer data, Size dataSize, int nipd, bool errorTooBig)
void ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum, Datum key, GinNullCategory category, GinState *ginstate)
void ginHeapTupleFastCollect(GinState *ginstate, GinTupleCollector *collector, OffsetNumber attnum, Datum value, bool isNull, ItemPointer ht_ctid)
Definition ginfast.c:483
void ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
Definition ginfast.c:219
static void ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition gininsert.c:566
#define PARALLEL_KEY_BUFFER_USAGE
Definition gininsert.c:45
static void AssertCheckItemPointers(GinBuffer *buffer)
Definition gininsert.c:1206
int _gin_compare_tuples(GinTuple *a, GinTuple *b, SortSupport ssup)
Definition gininsert.c:2435
static IndexTuple addItemPointersToLeafTuple(GinState *ginstate, IndexTuple old, ItemPointerData *items, uint32 nitem, GinStatsData *buildStats, Buffer buffer)
Definition gininsert.c:213
static bool GinBufferIsEmpty(GinBuffer *buffer)
Definition gininsert.c:1330
#define PARALLEL_KEY_GIN_SHARED
Definition gininsert.c:41
IndexBuildResult * ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition gininsert.c:614
static GinBuffer * GinBufferInit(Relation index)
Definition gininsert.c:1262
static IndexTuple buildFreshLeafTuple(GinState *ginstate, OffsetNumber attnum, Datum key, GinNullCategory category, ItemPointerData *items, uint32 nitem, GinStatsData *buildStats, Buffer buffer)
Definition gininsert.c:295
static void GinBufferReset(GinBuffer *buffer)
Definition gininsert.c:1552
static void ginHeapTupleBulkInsert(GinBuildState *buildstate, OffsetNumber attnum, Datum value, bool isNull, ItemPointer heapptr)
Definition gininsert.c:424
static void AssertCheckGinBuffer(GinBuffer *buffer)
Definition gininsert.c:1233
static void _gin_parallel_scan_and_build(GinBuildState *state, GinBuildShared *ginshared, Sharedsort *sharedsort, Relation heap, Relation index, int sortmem, bool progress)
Definition gininsert.c:2024
void ginEntryInsert(GinState *ginstate, OffsetNumber attnum, Datum key, GinNullCategory category, ItemPointerData *items, uint32 nitem, GinStatsData *buildStats)
Definition gininsert.c:347
static void GinBufferTrim(GinBuffer *buffer)
Definition gininsert.c:1582
static Size _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
Definition gininsert.c:1802
static void _gin_end_parallel(GinLeader *ginleader, GinBuildState *state)
Definition gininsert.c:1108
static void _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request)
Definition gininsert.c:929
static void ginFlushBuildState(GinBuildState *buildstate, Relation index)
Definition gininsert.c:501
static void _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort, bool progress)
Definition gininsert.c:1846
static bool GinBufferKeyEquals(GinBuffer *buffer, GinTuple *tup)
Definition gininsert.c:1346
static double _gin_parallel_merge(GinBuildState *state)
Definition gininsert.c:1640
static void ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum, Datum value, bool isNull, ItemPointer item)
Definition gininsert.c:839
static bool GinBufferCanAddKey(GinBuffer *buffer, GinTuple *tup)
Definition gininsert.c:1618
void _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Definition gininsert.c:2102
void ginbuildempty(Relation index)
Definition gininsert.c:808
bool gininsert(Relation index, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition gininsert.c:857
static void _gin_leader_participate_as_worker(GinBuildState *buildstate, Relation heap, Relation index)
Definition gininsert.c:1813
static GinTuple * _gin_build_tuple(OffsetNumber attrnum, unsigned char category, Datum key, int16 typlen, bool typbyval, ItemPointerData *items, uint32 nitems, Size *len)
Definition gininsert.c:2233
static Datum _gin_parse_tuple_key(GinTuple *a)
Definition gininsert.c:2382
#define PARALLEL_KEY_TUPLESORT
Definition gininsert.c:42
static bool GinBufferShouldTrim(GinBuffer *buffer, GinTuple *tup)
Definition gininsert.c:1412
#define PARALLEL_KEY_QUERY_TEXT
Definition gininsert.c:43
static void GinBufferFree(GinBuffer *buffer)
Definition gininsert.c:1598
static ItemPointer _gin_parse_tuple_items(GinTuple *a)
Definition gininsert.c:2403
#define ParallelTableScanFromGinBuildShared(shared)
Definition gininsert.c:106
#define PARALLEL_KEY_WAL_USAGE
Definition gininsert.c:44
static void GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
Definition gininsert.c:1485
static void ginBuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition gininsert.c:448
static double _gin_parallel_heapscan(GinBuildState *state)
Definition gininsert.c:1139
ItemPointer ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
GinPostingList * ginCompressPostingList(const ItemPointerData *ipd, int nipd, int maxsize, int *nwritten)
ItemPointer ginMergeItemPointers(ItemPointerData *a, uint32 na, ItemPointerData *b, uint32 nb, int *nmerged)
OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple)
Definition ginutil.c:232
Buffer GinNewBuffer(Relation index)
Definition ginutil.c:306
void GinInitBuffer(Buffer b, uint32 f)
Definition ginutil.c:356
Datum * ginExtractEntries(GinState *ginstate, OffsetNumber attnum, Datum value, bool isNull, int32 *nentries, GinNullCategory **categories)
Definition ginutil.c:451
Datum gintuple_get_key(GinState *ginstate, IndexTuple tuple, GinNullCategory *category)
Definition ginutil.c:265
void GinInitMetabuffer(Buffer b)
Definition ginutil.c:362
void initGinState(GinState *state, Relation index)
Definition ginutil.c:103
void ginUpdateStats(Relation index, const GinStatsData *stats, bool is_build)
Definition ginutil.c:618
int maintenance_work_mem
Definition globals.c:133
static void dlist_init(dlist_head *head)
Definition ilist.h:314
static void dlist_delete(dlist_node *node)
Definition ilist.h:405
#define dlist_foreach_modify(iter, lhead)
Definition ilist.h:640
static void dlist_push_tail(dlist_head *head, dlist_node *node)
Definition ilist.h:364
#define dlist_container(type, membername, ptr)
Definition ilist.h:593
#define nitems(x)
Definition indent.h:31
IndexInfo * BuildIndexInfo(Relation index)
Definition index.c:2426
void index_close(Relation relation, LOCKMODE lockmode)
Definition indexam.c:177
RegProcedure index_getprocid(Relation irel, AttrNumber attnum, uint16 procnum)
Definition indexam.c:883
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition indexam.c:133
static struct @174 value
void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition instrument.c:219
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition instrument.c:209
void InstrStartParallelQuery(void)
Definition instrument.c:201
int b
Definition isn.c:74
int a
Definition isn.c:73
int i
Definition isn.c:77
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition itemptr.c:51
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition itemptr.h:83
IndexTupleData * IndexTuple
Definition itup.h:53
int LOCKMODE
Definition lockdefs.h:26
#define AccessExclusiveLock
Definition lockdefs.h:43
#define ShareUpdateExclusiveLock
Definition lockdefs.h:39
#define ShareLock
Definition lockdefs.h:40
#define RowExclusiveLock
Definition lockdefs.h:38
void MemoryContextReset(MemoryContext context)
Definition mcxt.c:403
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc0(Size size)
Definition mcxt.c:1417
void * palloc(Size size)
Definition mcxt.c:1387
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
void MemoryContextDelete(MemoryContext context)
Definition mcxt.c:472
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
#define END_CRIT_SECTION()
Definition miscadmin.h:152
uint16 OffsetNumber
Definition off.h:24
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition palloc.h:124
int16 attnum
FormData_pg_attribute * Form_pg_attribute
const void size_t len
const void * data
static int progress
Definition pgbench.c:262
const char * debug_query_string
Definition postgres.c:90
static Datum PointerGetDatum(const void *X)
Definition postgres.h:352
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:342
#define InvalidOid
unsigned int Oid
void CheckForSerializableConflictIn(Relation relation, const ItemPointerData *tid, BlockNumber blkno)
Definition predicate.c:4334
static int fb(int x)
#define PROC_IN_SAFE_IC
Definition proc.h:60
#define PROGRESS_CREATEIDX_TUPLES_TOTAL
Definition progress.h:106
#define PROGRESS_SCAN_BLOCKS_DONE
Definition progress.h:142
#define PROGRESS_CREATEIDX_TUPLES_DONE
Definition progress.h:107
#define PROGRESS_CREATEIDX_SUBPHASE
Definition progress.h:105
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition progress.h:141
#define RelationGetRelid(relation)
Definition rel.h:514
#define RelationGetDescr(relation)
Definition rel.h:540
#define RelationGetRelationName(relation)
Definition rel.h:548
#define RelationNeedsWAL(relation)
Definition rel.h:637
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition rel.h:533
@ MAIN_FORKNUM
Definition relpath.h:58
@ INIT_FORKNUM
Definition relpath.h:61
void * shm_toc_allocate(shm_toc *toc, Size nbytes)
Definition shm_toc.c:88
void shm_toc_insert(shm_toc *toc, uint64 key, void *address)
Definition shm_toc.c:171
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition shm_toc.c:232
#define shm_toc_estimate_chunk(e, sz)
Definition shm_toc.h:51
#define shm_toc_estimate_keys(e, cnt)
Definition shm_toc.h:53
Size add_size(Size s1, Size s2)
Definition shmem.c:482
Size mul_size(Size s1, Size s2)
Definition shmem.c:497
Snapshot GetTransactionSnapshot(void)
Definition snapmgr.c:272
void UnregisterSnapshot(Snapshot snapshot)
Definition snapmgr.c:866
Snapshot RegisterSnapshot(Snapshot snapshot)
Definition snapmgr.c:824
#define SnapshotAny
Definition snapmgr.h:33
#define IsMVCCSnapshot(snapshot)
Definition snapmgr.h:55
void PrepareSortSupportComparisonShim(Oid cmpFunc, SortSupport ssup)
Definition sortsupport.c:68
static int ApplySortComparator(Datum datum1, bool isNull1, Datum datum2, bool isNull2, SortSupport ssup)
static void SpinLockRelease(volatile slock_t *lock)
Definition spin.h:62
static void SpinLockAcquire(volatile slock_t *lock)
Definition spin.h:56
static void SpinLockInit(volatile slock_t *lock)
Definition spin.h:50
PGPROC * MyProc
Definition proc.c:67
Oid fn_oid
Definition fmgr.h:59
bool(* findItem)(GinBtree, GinBtreeStack *)
OffsetNumber off
Size keylen
Definition gininsert.c:1185
GinNullCategory category
Definition gininsert.c:1183
OffsetNumber attnum
Definition gininsert.c:1182
Datum key
Definition gininsert.c:1184
SortSupport ssup
Definition gininsert.c:1197
ItemPointerData * items
Definition gininsert.c:1198
int16 typlen
Definition gininsert.c:1188
bool typbyval
Definition gininsert.c:1189
double reltuples
Definition gininsert.c:90
int scantuplesortstates
Definition gininsert.c:61
double indtuples
Definition gininsert.c:91
bool isconcurrent
Definition gininsert.c:60
slock_t mutex
Definition gininsert.c:77
ConditionVariable workersdonecv
Definition gininsert.c:69
int nparticipantsdone
Definition gininsert.c:89
double indtuples
Definition gininsert.c:144
GinState ginstate
Definition gininsert.c:143
double bs_reltuples
Definition gininsert.c:163
MemoryContext tmpCtx
Definition gininsert.c:146
GinLeader * bs_leader
Definition gininsert.c:156
GinStatsData buildStats
Definition gininsert.c:145
Tuplesortstate * bs_worker_sort
Definition gininsert.c:178
ItemPointerData tid
Definition gininsert.c:149
double bs_numtuples
Definition gininsert.c:162
Tuplesortstate * bs_sortstate
Definition gininsert.c:170
MemoryContext funcCtx
Definition gininsert.c:147
BuildAccumulator accum
Definition gininsert.c:148
Sharedsort * sharedsort
Definition gininsert.c:135
int nparticipanttuplesorts
Definition gininsert.c:123
ParallelContext * pcxt
Definition gininsert.c:115
BufferUsage * bufferusage
Definition gininsert.c:138
Snapshot snapshot
Definition gininsert.c:136
GinBuildShared * ginshared
Definition gininsert.c:134
WalUsage * walusage
Definition gininsert.c:137
dlist_node node
Definition gininsert.c:2207
GinPostingList * seg
Definition gininsert.c:2208
TupleDesc origTupdesc
Definition gin_private.h:74
Relation index
Definition gin_private.h:60
int64 nEntries
Definition gin.h:61
char data[FLEXIBLE_ARRAY_MEMBER]
Definition gin_tuple.h:31
int nitems
Definition gin_tuple.h:30
int16 typlen
Definition gin_tuple.h:27
bool typbyval
Definition gin_tuple.h:28
signed char category
Definition gin_tuple.h:29
int tuplen
Definition gin_tuple.h:24
uint16 keylen
Definition gin_tuple.h:26
OffsetNumber attrnum
Definition gin_tuple.h:25
double heap_tuples
Definition genam.h:38
double index_tuples
Definition genam.h:39
void * ii_AmCache
Definition execnodes.h:225
int ii_ParallelWorkers
Definition execnodes.h:220
bool ii_Concurrent
Definition execnodes.h:212
MemoryContext ii_Context
Definition execnodes.h:228
uint8 statusFlags
Definition proc.h:202
dsm_segment * seg
Definition parallel.h:44
shm_toc_estimator estimator
Definition parallel.h:43
shm_toc * toc
Definition parallel.h:46
int nworkers_launched
Definition parallel.h:39
FmgrInfo cmp_proc_finfo
Definition typcache.h:77
dlist_node * cur
Definition ilist.h:200
Definition type.h:96
void table_close(Relation relation, LOCKMODE lockmode)
Definition table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition table.c:40
TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
Definition tableam.c:166
Size table_parallelscan_estimate(Relation rel, Snapshot snapshot)
Definition tableam.c:131
void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, Snapshot snapshot)
Definition tableam.c:146
static double table_index_build_scan(Relation table_rel, Relation index_rel, IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition tableam.h:1765
static ItemArray items
static FormData_pg_attribute * TupleDescAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:160
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:175
void tuplesort_performsort(Tuplesortstate *state)
Definition tuplesort.c:1259
void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
Definition tuplesort.c:3210
Size tuplesort_estimate_shared(int nWorkers)
Definition tuplesort.c:3189
void tuplesort_end(Tuplesortstate *state)
Definition tuplesort.c:847
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition tuplesort.c:3233
#define TUPLESORT_NONE
Definition tuplesort.h:67
Tuplesortstate * tuplesort_begin_index_gin(Relation heapRel, Relation indexRel, int workMem, SortCoordinate coordinate, int sortopt)
GinTuple * tuplesort_getgintuple(Tuplesortstate *state, Size *len, bool forward)
void tuplesort_putgintuple(Tuplesortstate *state, GinTuple *tuple, Size size)
TypeCacheEntry * lookup_type_cache(Oid type_id, int flags)
Definition typcache.c:389
#define TYPECACHE_CMP_PROC_FINFO
Definition typcache.h:144
static Size VARSIZE_ANY(const void *PTR)
Definition varatt.h:460
void ExitParallelMode(void)
Definition xact.c:1065
void EnterParallelMode(void)
Definition xact.c:1052
void log_newpage_range(Relation rel, ForkNumber forknum, BlockNumber startblk, BlockNumber endblk, bool page_std)
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)