/*-------------------------------------------------------------------------
 *
 * hash.c
 *	  Implementation of Margo Seltzer's Hashing package for postgres.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hash.c
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "optimizer/plancat.h"
#include "pgstat.h"
#include "utils/fmgrprotos.h"
#include "utils/index_selfuncs.h"
#include "utils/rel.h"

/* Working state for hashbuild and its callback */
typedef struct
{
	HSpool	   *spool;			/* NULL if not using spooling */
	double		indtuples;		/* # tuples accepted into index */
	Relation	heapRel;		/* heap relation descriptor */
} HashBuildState;

static void hashbuildCallback(Relation index,
							  ItemPointer tid,
							  Datum *values,
							  bool *isnull,
							  bool tupleIsAlive,
							  void *state);


/*
 * Hash handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
hashhandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = HTMaxStrategyNumber;
	amroutine->amsupport = HASHNProcs;
	amroutine->amoptsprocnum = HASHOPTIONS_PROC;
	amroutine->amcanorder = false;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = true;
	amroutine->amcanunique = false;
	amroutine->amcanmulticol = false;
	amroutine->amoptionalkey = false;
	amroutine->amsearcharray = false;
	amroutine->amsearchnulls = false;
	amroutine->amstorage = false;
	amroutine->amclusterable = false;
	amroutine->ampredlocks = true;
	amroutine->amcanparallel = false;
	amroutine->amcanbuildparallel = false;
	amroutine->amcaninclude = false;
	amroutine->amusemaintenanceworkmem = false;
	amroutine->amsummarizing = false;
	amroutine->amparallelvacuumoptions =
		VACUUM_OPTION_PARALLEL_BULKDEL;
	amroutine->amkeytype = INT4OID;

	amroutine->ambuild = hashbuild;
	amroutine->ambuildempty = hashbuildempty;
	amroutine->aminsert = hashinsert;
	amroutine->aminsertcleanup = NULL;
	amroutine->ambulkdelete = hashbulkdelete;
	amroutine->amvacuumcleanup = hashvacuumcleanup;
	amroutine->amcanreturn = NULL;
	amroutine->amcostestimate = hashcostestimate;
	amroutine->amgettreeheight = NULL;
	amroutine->amoptions = hashoptions;
	amroutine->amproperty = NULL;
	amroutine->ambuildphasename = NULL;
	amroutine->amvalidate = hashvalidate;
	amroutine->amadjustmembers = hashadjustmembers;
	amroutine->ambeginscan = hashbeginscan;
	amroutine->amrescan = hashrescan;
	amroutine->amgettuple = hashgettuple;
	amroutine->amgetbitmap = hashgetbitmap;
	amroutine->amendscan = hashendscan;
	amroutine->ammarkpos = NULL;
	amroutine->amrestrpos = NULL;
	amroutine->amestimateparallelscan = NULL;
	amroutine->aminitparallelscan = NULL;
	amroutine->amparallelrescan = NULL;

	PG_RETURN_POINTER(amroutine);
}

/*
 *	hashbuild() -- build a new hash index.
 */
IndexBuildResult *
hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
	IndexBuildResult *result;
	BlockNumber relpages;
	double		reltuples;
	double		allvisfrac;
	uint32		num_buckets;
	long		sort_threshold;
	HashBuildState buildstate;

	/*
	 * We expect to be called exactly once for any index relation. If that's
	 * not the case, big trouble's what we have.
	 */
	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	/* Estimate the number of rows currently present in the table */
	estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);

	/* Initialize the hash index metadata page and initial buckets */
	num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);

	/*
	 * If we just insert the tuples into the index in scan order, then
	 * (assuming their hash codes are pretty random) there will be no
	 * locality of access to the index, and if the index is bigger than
	 * available RAM then we'll thrash horribly.  To prevent that scenario,
	 * we can sort the tuples by (expected) bucket number.  However, such a
	 * sort is useless overhead when the index does fit in RAM.  We choose to
	 * sort if the initial index size exceeds maintenance_work_mem, or the
	 * number of buffers usable for the index, whichever is less.  (Limiting
	 * by the number of buffers should reduce thrashing between PG buffers
	 * and kernel buffers, which seems useful even if no physical I/O
	 * results.  Limiting by maintenance_work_mem is useful to allow easy
	 * testing of the sort code path, and may be useful to DBAs as an
	 * additional control knob.)
	 *
	 * NOTE: this test will need adjustment if a bucket is ever different
	 * from one page.  Also, "initial index size" accounting does not include
	 * the metapage, nor the first bitmap page.
	 */
	sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
	if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
		sort_threshold = Min(sort_threshold, NBuffers);
	else
		sort_threshold = Min(sort_threshold, NLocBuffer);

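	/*
	 * For example, with the default 8 kB BLCKSZ and maintenance_work_mem set
	 * to 65536 kB (64 MB), the memory-based limit works out to
	 * 65536 * 1024 / 8192 = 8192 bucket pages, which the code above then
	 * clamps to NBuffers (or NLocBuffer for a temporary index).
	 */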
	if (num_buckets >= (uint32) sort_threshold)
		buildstate.spool = _h_spoolinit(heap, index, num_buckets);
	else
		buildstate.spool = NULL;

	/* prepare to build the index */
	buildstate.indtuples = 0;
	buildstate.heapRel = heap;

	/* do the heap scan */
	reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
									   hashbuildCallback,
									   &buildstate, NULL);
	pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
								 buildstate.indtuples);

	if (buildstate.spool)
	{
		/* sort the tuples and insert them into the index */
		_h_indexbuild(buildstate.spool, buildstate.heapRel);
		_h_spooldestroy(buildstate.spool);
	}

	/*
	 * Return statistics
	 */
	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

	result->heap_tuples = reltuples;
	result->index_tuples = buildstate.indtuples;

	return result;
}

/*
 *	hashbuildempty() -- build an empty hash index in the initialization fork
 */
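/*
 * This is called only for unlogged indexes: the init-fork contents written
 * here are copied over the main fork when unlogged relations are reset after
 * a crash, leaving an empty but valid index behind.
 */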
void
hashbuildempty(Relation index)
{
	_hash_init(index, 0, INIT_FORKNUM);
}

/*
 * Per-tuple callback for table_index_build_scan
 */
static void
hashbuildCallback(Relation index,
				  ItemPointer tid,
				  Datum *values,
				  bool *isnull,
				  bool tupleIsAlive,
				  void *state)
{
	HashBuildState *buildstate = (HashBuildState *) state;
	Datum		index_values[1];
	bool		index_isnull[1];
	IndexTuple	itup;

	/* convert data to a hash key; on failure, do not insert anything */
	if (!_hash_convert_tuple(index,
							 values, isnull,
							 index_values, index_isnull))
		return;

	/* Either spool the tuple for sorting, or just put it into the index */
	if (buildstate->spool)
		_h_spool(buildstate->spool, tid, index_values, index_isnull);
	else
	{
		/* form an index tuple and point it at the heap tuple */
		itup = index_form_tuple(RelationGetDescr(index),
								index_values, index_isnull);
		itup->t_tid = *tid;
		_hash_doinsert(index, itup, buildstate->heapRel, false);
		pfree(itup);
	}

	buildstate->indtuples += 1;
}

/*
 *	hashinsert() -- insert an index tuple into a hash table.
 *
 *	Hash on the heap tuple's key, form an index tuple with hash code.
 *	Find the appropriate location for the new tuple, and put it there.
 */
bool
hashinsert(Relation rel, Datum *values, bool *isnull,
		   ItemPointer ht_ctid, Relation heapRel,
		   IndexUniqueCheck checkUnique,
		   bool indexUnchanged,
		   IndexInfo *indexInfo)
{
	Datum		index_values[1];
	bool		index_isnull[1];
	IndexTuple	itup;

	/* convert data to a hash key; on failure, do not insert anything */
	if (!_hash_convert_tuple(rel,
							 values, isnull,
							 index_values, index_isnull))
		return false;

	/* form an index tuple and point it at the heap tuple */
	itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
	itup->t_tid = *ht_ctid;

	_hash_doinsert(rel, itup, heapRel, false);

	pfree(itup);

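	/*
	 * aminsert's result is consulted only for deferred uniqueness checks
	 * (UNIQUE_CHECK_PARTIAL), which hash indexes never use because they do
	 * not support unique constraints, so false is simply the conventional
	 * return value here.
	 */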
	return false;
}


/*
 *	hashgettuple() -- Get the next tuple in the scan.
 */
bool
hashgettuple(IndexScanDesc scan, ScanDirection dir)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		res;

	/* Hash indexes are always lossy since we store only the hash code */
	scan->xs_recheck = true;

	/*
	 * If we've already initialized this scan, we can just advance it in the
	 * appropriate direction.  If we haven't done so yet, we call a routine
	 * to get the first item in the scan.
	 */
	if (!HashScanPosIsValid(so->currPos))
		res = _hash_first(scan, dir);
	else
	{
		/*
		 * Check to see if we should kill the previously-fetched tuple.
		 */
		if (scan->kill_prior_tuple)
		{
			/*
			 * Yes, so remember it for later.  (We'll deal with all such
			 * tuples at once right after leaving the index page or at end of
			 * scan.)  If the caller reverses the indexscan direction, it is
			 * quite possible that the same item might get entered multiple
			 * times.  But, we don't detect that; instead, we just forget any
			 * excess entries.
			 */
			if (so->killedItems == NULL)
				so->killedItems = (int *)
					palloc(MaxIndexTuplesPerPage * sizeof(int));

			if (so->numKilled < MaxIndexTuplesPerPage)
				so->killedItems[so->numKilled++] = so->currPos.itemIndex;
		}

		/*
		 * Now continue the scan.
		 */
		res = _hash_next(scan, dir);
	}

	return res;
}


/*
 *	hashgetbitmap() -- get all tuples at once
 */
int64
hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		res;
	int64		ntids = 0;
	HashScanPosItem *currItem;

	res = _hash_first(scan, ForwardScanDirection);

	while (res)
	{
		currItem = &so->currPos.items[so->currPos.itemIndex];

		/*
		 * _hash_first and _hash_next take care of eliminating dead index
		 * entries whenever scan->ignore_killed_tuples is true.  Therefore,
		 * there's nothing to do here except add the results to the
		 * TIDBitmap.
		 */
		tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
		ntids++;

		res = _hash_next(scan, ForwardScanDirection);
	}

	return ntids;
}


/*
 *	hashbeginscan() -- start a scan on a hash index
 */
IndexScanDesc
hashbeginscan(Relation rel, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	HashScanOpaque so;

	/* no order by operators allowed */
	Assert(norderbys == 0);

	scan = RelationGetIndexScan(rel, nkeys, norderbys);

	so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
	HashScanPosInvalidate(so->currPos);
	so->hashso_bucket_buf = InvalidBuffer;
	so->hashso_split_bucket_buf = InvalidBuffer;

	so->hashso_buc_populated = false;
	so->hashso_buc_split = false;

	so->killedItems = NULL;
	so->numKilled = 0;

	scan->opaque = so;

	return scan;
}

/*
 *	hashrescan() -- rescan an index relation
 */
void
hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		   ScanKey orderbys, int norderbys)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;

	if (HashScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_hash_kill_items(scan);
	}

	_hash_dropscanbuf(rel, so);

	/* set position invalid (this will cause _hash_first call) */
	HashScanPosInvalidate(so->currPos);

	/* Update scan key, if a new one is given */
	if (scankey && scan->numberOfKeys > 0)
		memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));

	so->hashso_buc_populated = false;
	so->hashso_buc_split = false;
}

/*
 *	hashendscan() -- close down a scan
 */
void
hashendscan(IndexScanDesc scan)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;

	if (HashScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_hash_kill_items(scan);
	}

	_hash_dropscanbuf(rel, so);

	if (so->killedItems != NULL)
		pfree(so->killedItems);
	pfree(so);
	scan->opaque = NULL;
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * This function also deletes the tuples that are moved by split to another
 * bucket.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			   IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	double		tuples_removed;
	double		num_index_tuples;
	double		orig_ntuples;
	Bucket		orig_maxbucket;
	Bucket		cur_maxbucket;
	Bucket		cur_bucket;
	Buffer		metabuf = InvalidBuffer;
	HashMetaPage metap;
	HashMetaPage cachedmetap;

	tuples_removed = 0;
	num_index_tuples = 0;

	/*
	 * We need a copy of the metapage so that we can use its hashm_spares[]
	 * values to compute bucket page addresses, but a cached copy should be
	 * good enough.  (If not, we'll detect that further down and refresh the
	 * cache as necessary.)
	 */
	cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
	Assert(cachedmetap != NULL);

	orig_maxbucket = cachedmetap->hashm_maxbucket;
	orig_ntuples = cachedmetap->hashm_ntuples;

	/* Scan the buckets that we know exist */
	cur_bucket = 0;
	cur_maxbucket = orig_maxbucket;

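	/*
	 * If a concurrent split adds new buckets after we capture cur_maxbucket,
	 * we come back to loop_top (see below) with a refreshed metapage cache
	 * so that those buckets are cleaned up as well.
	 */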
loop_top:
	while (cur_bucket <= cur_maxbucket)
	{
		BlockNumber bucket_blkno;
		BlockNumber blkno;
		Buffer		bucket_buf;
		Buffer		buf;
		HashPageOpaque bucket_opaque;
		Page		page;
		bool		split_cleanup = false;

		/* Get address of bucket's start page */
		bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);

		blkno = bucket_blkno;

		/*
		 * We need to acquire a cleanup lock on the primary bucket page to
		 * wait out concurrent scans before deleting the dead tuples.
		 */
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
								 info->strategy);
		LockBufferForCleanup(buf);
		_hash_checkpage(rel, buf, LH_BUCKET_PAGE);

		page = BufferGetPage(buf);
		bucket_opaque = HashPageGetOpaque(page);

		/*
		 * If the bucket contains tuples that are moved by split, then we
		 * need to delete such tuples.  We can't delete such tuples if the
		 * split operation on the bucket is not finished, as those are needed
		 * by scans.
		 */
		if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
			H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
		{
			split_cleanup = true;

			/*
			 * This bucket might have been split since we last held a lock on
			 * the metapage.  If so, hashm_maxbucket, hashm_highmask and
			 * hashm_lowmask might be old enough to cause us to fail to remove
			 * tuples left behind by the most recent split.  To prevent that,
			 * now that the primary page of the target bucket has been locked
			 * (and thus can't be further split), check whether we need to
			 * update our cached metapage data.
			 */
			Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
			if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
			{
				cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
				Assert(cachedmetap != NULL);
			}
		}

		bucket_buf = buf;

		hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
						  cachedmetap->hashm_maxbucket,
						  cachedmetap->hashm_highmask,
						  cachedmetap->hashm_lowmask, &tuples_removed,
						  &num_index_tuples, split_cleanup,
						  callback, callback_state);

		_hash_dropbuf(rel, bucket_buf);

		/* Advance to next bucket */
		cur_bucket++;
	}

	if (BufferIsInvalid(metabuf))
		metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);

	/* Write-lock metapage and check for split since we started */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	if (cur_maxbucket != metap->hashm_maxbucket)
	{
		/* There's been a split, so process the additional bucket(s) */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
		cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
		Assert(cachedmetap != NULL);
		cur_maxbucket = cachedmetap->hashm_maxbucket;
		goto loop_top;
	}

	/* Okay, we're really done.  Update tuple count in metapage. */
	START_CRIT_SECTION();

	if (orig_maxbucket == metap->hashm_maxbucket &&
		orig_ntuples == metap->hashm_ntuples)
	{
		/*
		 * No one has split or inserted anything since start of scan, so
		 * believe our count as gospel.
		 */
		metap->hashm_ntuples = num_index_tuples;
	}
	else
	{
		/*
		 * Otherwise, our count is untrustworthy since we may have
		 * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
		 * (Note: we still return estimated_count = false, because using this
		 * count is better than not updating reltuples at all.)
		 */
		if (metap->hashm_ntuples > tuples_removed)
			metap->hashm_ntuples -= tuples_removed;
		else
			metap->hashm_ntuples = 0;
		num_index_tuples = metap->hashm_ntuples;
	}

	MarkBufferDirty(metabuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_hash_update_meta_page xlrec;
		XLogRecPtr	recptr;

		xlrec.ntuples = metap->hashm_ntuples;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);

		XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
		PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	_hash_relbuf(rel, metabuf);

	/* return statistics */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
	stats->estimated_count = false;
	stats->num_index_tuples = num_index_tuples;
	stats->tuples_removed += tuples_removed;
	/* hashvacuumcleanup will fill in num_pages */

	return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	Relation	rel = info->index;
	BlockNumber num_pages;

	/* If hashbulkdelete wasn't called, return NULL signifying no change */
	/* Note: this covers the analyze_only case too */
	if (stats == NULL)
		return NULL;

	/* update statistics */
	num_pages = RelationGetNumberOfBlocks(rel);
	stats->num_pages = num_pages;

	return stats;
}

/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * There can't be any concurrent scans in progress when we first enter this
 * function because of the cleanup lock we hold on the primary bucket page,
 * but as soon as we release that lock, there might be.  If those scans got
 * ahead of our cleanup scan, they might see a tuple before we kill it and
 * wake up only after VACUUM has completed and the TID has been recycled for
 * an unrelated tuple.  To avoid that calamity, we prevent scans from passing
 * our cleanup scan by locking the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
				  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
				  uint32 maxbucket, uint32 highmask, uint32 lowmask,
				  double *tuples_removed, double *num_index_tuples,
				  bool split_cleanup,
				  IndexBulkDeleteCallback callback, void *callback_state)
{
	BlockNumber blkno;
	Buffer		buf;
	Bucket		new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
	bool		bucket_dirty = false;

	blkno = bucket_blkno;
	buf = bucket_buf;

	if (split_cleanup)
		new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
														lowmask, maxbucket);

	/* Scan each page in bucket */
	for (;;)
	{
		HashPageOpaque opaque;
		OffsetNumber offno;
		OffsetNumber maxoffno;
		Buffer		next_buf;
		Page		page;
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable = 0;
		bool		retain_pin = false;
		bool		clear_dead_marking = false;

		vacuum_delay_point();

		page = BufferGetPage(buf);
		opaque = HashPageGetOpaque(page);

		/* Scan each tuple in page */
		maxoffno = PageGetMaxOffsetNumber(page);
		for (offno = FirstOffsetNumber;
			 offno <= maxoffno;
			 offno = OffsetNumberNext(offno))
		{
			ItemPointer htup;
			IndexTuple	itup;
			Bucket		bucket;
			bool		kill_tuple = false;

			itup = (IndexTuple) PageGetItem(page,
											PageGetItemId(page, offno));
			htup = &(itup->t_tid);

			/*
			 * To remove the dead tuples, we strictly want to rely on the
			 * results of the callback function.  See btvacuumpage for the
			 * detailed reason.
			 */
			if (callback && callback(htup, callback_state))
			{
				kill_tuple = true;
				if (tuples_removed)
					*tuples_removed += 1;
			}
			else if (split_cleanup)
			{
				/* delete the tuples that are moved by split. */
				bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
											  maxbucket,
											  highmask,
											  lowmask);
				/* mark the item for deletion */
				if (bucket != cur_bucket)
				{
					/*
					 * We expect tuples to either belong to the current
					 * bucket or to new_bucket.  This is ensured because we
					 * don't allow further splits from a bucket that contains
					 * garbage.  See comments in _hash_expandtable.
					 */
					Assert(bucket == new_bucket);
					kill_tuple = true;
				}
			}

			if (kill_tuple)
			{
				/* mark the item for deletion */
				deletable[ndeletable++] = offno;
			}
			else
			{
				/* we're keeping it, so count it */
				if (num_index_tuples)
					*num_index_tuples += 1;
			}
		}

		/* retain the pin on primary bucket page till end of bucket scan */
		if (blkno == bucket_blkno)
			retain_pin = true;
		else
			retain_pin = false;

		blkno = opaque->hasho_nextblkno;

		/*
		 * Apply deletions, advance to next page and write page if needed.
		 */
		if (ndeletable > 0)
		{
			/* No ereport(ERROR) until changes are logged */
			START_CRIT_SECTION();

			PageIndexMultiDelete(page, deletable, ndeletable);
			bucket_dirty = true;

			/*
			 * Mark the page as clean if vacuum has removed DEAD tuples from
			 * the index page.  We do this by clearing the
			 * LH_PAGE_HAS_DEAD_TUPLES flag.
			 */
			if (tuples_removed && *tuples_removed > 0 &&
				H_HAS_DEAD_TUPLES(opaque))
			{
				opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
				clear_dead_marking = true;
			}

			MarkBufferDirty(buf);

			/* XLOG stuff */
			if (RelationNeedsWAL(rel))
			{
				xl_hash_delete xlrec;
				XLogRecPtr	recptr;

				xlrec.clear_dead_marking = clear_dead_marking;
				xlrec.is_primary_bucket_page = (buf == bucket_buf);

				XLogBeginInsert();
				XLogRegisterData((char *) &xlrec, SizeOfHashDelete);

				/*
				 * The bucket buffer was not changed, but it still needs to
				 * be registered to ensure that we can acquire a cleanup lock
				 * on it during replay.
				 */
				if (!xlrec.is_primary_bucket_page)
				{
					uint8		flags = REGBUF_STANDARD | REGBUF_NO_CHANGE;

					XLogRegisterBuffer(0, bucket_buf, flags);
				}

				XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
				XLogRegisterBufData(1, (char *) deletable,
									ndeletable * sizeof(OffsetNumber));

				recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
				PageSetLSN(BufferGetPage(buf), recptr);
			}

			END_CRIT_SECTION();
		}

		/* bail out if there are no more pages to scan. */
		if (!BlockNumberIsValid(blkno))
			break;

		next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
											  LH_OVERFLOW_PAGE,
											  bstrategy);

		/*
		 * release the lock on previous page after acquiring the lock on next
		 * page
		 */
		if (retain_pin)
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		else
			_hash_relbuf(rel, buf);

		buf = next_buf;
	}

	/*
	 * Lock the bucket page to clear the garbage flag and squeeze the bucket.
	 * If the current buffer is the same as the bucket buffer, then we
	 * already have a lock on the bucket page.
	 */
	if (buf != bucket_buf)
	{
		_hash_relbuf(rel, buf);
		LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
	}

	/*
	 * Clear the garbage flag from the bucket after deleting the tuples that
	 * are moved by split.  We purposefully clear the flag before squeezing
	 * the bucket, so that after a restart vacuum doesn't again try to delete
	 * the moved-by-split tuples.
	 */
	if (split_cleanup)
	{
		HashPageOpaque bucket_opaque;
		Page		page;

		page = BufferGetPage(bucket_buf);
		bucket_opaque = HashPageGetOpaque(page);

		/* No ereport(ERROR) until changes are logged */
		START_CRIT_SECTION();

		bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
		MarkBufferDirty(bucket_buf);

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			XLogRecPtr	recptr;

			XLogBeginInsert();
			XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
			PageSetLSN(page, recptr);
		}

		END_CRIT_SECTION();
	}

	/*
	 * If we have deleted anything, try to compact free space.  For squeezing
	 * the bucket, we must have a cleanup lock, else it can impact the
	 * ordering of tuples for a scan that has started before it.
	 */
	if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
		_hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
							bstrategy);
	else
		LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}