hash.c
1/*-------------------------------------------------------------------------
2 *
3 * hash.c
4 * Implementation of Margo Seltzer's Hashing package for postgres.
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/access/hash/hash.c
12 *
13 * NOTES
14 * This file contains only the public interface routines.
15 *
16 *-------------------------------------------------------------------------
17 */
18
19#include "postgres.h"
20
21#include "access/hash.h"
22#include "access/hash_xlog.h"
23#include "access/relscan.h"
24#include "access/stratnum.h"
25#include "access/tableam.h"
26#include "access/xloginsert.h"
27#include "commands/progress.h"
28#include "commands/vacuum.h"
29#include "miscadmin.h"
30#include "nodes/execnodes.h"
31#include "optimizer/plancat.h"
32#include "pgstat.h"
33#include "utils/fmgrprotos.h"
34#include "utils/index_selfuncs.h"
35#include "utils/rel.h"
36
37/* Working state for hashbuild and its callback */
38typedef struct
39{
40 HSpool *spool; /* NULL if not using spooling */
41 double indtuples; /* # tuples accepted into index */
42 Relation heapRel; /* heap relation descriptor */
43} HashBuildState;
44
45static void hashbuildCallback(Relation index,
46 ItemPointer tid,
47 Datum *values,
48 bool *isnull,
49 bool tupleIsAlive,
50 void *state);
51
52
53/*
54 * Hash handler function: return IndexAmRoutine with access method parameters
55 * and callbacks.
56 */
57Datum
58hashhandler(PG_FUNCTION_ARGS)
59{
60 IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
61
62 amroutine->amstrategies = HTMaxStrategyNumber;
63 amroutine->amsupport = HASHNProcs;
64 amroutine->amoptsprocnum = HASHOPTIONS_PROC;
65 amroutine->amcanorder = false;
66 amroutine->amcanorderbyop = false;
67 amroutine->amcanhash = true;
68 amroutine->amconsistentequality = true;
69 amroutine->amconsistentordering = false;
70 amroutine->amcanbackward = true;
71 amroutine->amcanunique = false;
72 amroutine->amcanmulticol = false;
73 amroutine->amoptionalkey = false;
74 amroutine->amsearcharray = false;
75 amroutine->amsearchnulls = false;
76 amroutine->amstorage = false;
77 amroutine->amclusterable = false;
78 amroutine->ampredlocks = true;
79 amroutine->amcanparallel = false;
80 amroutine->amcanbuildparallel = false;
81 amroutine->amcaninclude = false;
82 amroutine->amusemaintenanceworkmem = false;
83 amroutine->amsummarizing = false;
84 amroutine->amparallelvacuumoptions =
85 VACUUM_OPTION_PARALLEL_BULKDEL;
86 amroutine->amkeytype = INT4OID;
87
88 amroutine->ambuild = hashbuild;
89 amroutine->ambuildempty = hashbuildempty;
90 amroutine->aminsert = hashinsert;
91 amroutine->aminsertcleanup = NULL;
92 amroutine->ambulkdelete = hashbulkdelete;
93 amroutine->amvacuumcleanup = hashvacuumcleanup;
94 amroutine->amcanreturn = NULL;
95 amroutine->amcostestimate = hashcostestimate;
96 amroutine->amgettreeheight = NULL;
97 amroutine->amoptions = hashoptions;
98 amroutine->amproperty = NULL;
99 amroutine->ambuildphasename = NULL;
100 amroutine->amvalidate = hashvalidate;
101 amroutine->amadjustmembers = hashadjustmembers;
102 amroutine->ambeginscan = hashbeginscan;
103 amroutine->amrescan = hashrescan;
104 amroutine->amgettuple = hashgettuple;
105 amroutine->amgetbitmap = hashgetbitmap;
106 amroutine->amendscan = hashendscan;
107 amroutine->ammarkpos = NULL;
108 amroutine->amrestrpos = NULL;
109 amroutine->amestimateparallelscan = NULL;
110 amroutine->aminitparallelscan = NULL;
111 amroutine->amparallelrescan = NULL;
112 amroutine->amtranslatestrategy = hashtranslatestrategy;
113 amroutine->amtranslatecmptype = hashtranslatecmptype;
114
115 PG_RETURN_POINTER(amroutine);
116}
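/*
 * The hash AM's pg_am entry points at this handler, so a statement such as
 * "CREATE INDEX ... USING hash (col)" resolves here and then drives the work
 * through the ambuild/aminsert/amgettuple callbacks assigned above.
 */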
117
118/*
119 * hashbuild() -- build a new hash index.
120 */
121IndexBuildResult *
122hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
123{
124 IndexBuildResult *result;
125 BlockNumber relpages;
126 double reltuples;
127 double allvisfrac;
128 uint32 num_buckets;
129 Size sort_threshold;
130 HashBuildState buildstate;
131
132 /*
133 * We expect to be called exactly once for any index relation. If that's
134 * not the case, big trouble's what we have.
135 */
136 if (RelationGetNumberOfBlocks(index) != 0)
137 elog(ERROR, "index \"%s\" already contains data",
138 RelationGetRelationName(index));
139
140 /* Estimate the number of rows currently present in the table */
141 estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
142
143 /* Initialize the hash index metadata page and initial buckets */
144 num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
145
146 /*
147 * If we just insert the tuples into the index in scan order, then
148 * (assuming their hash codes are pretty random) there will be no locality
149 * of access to the index, and if the index is bigger than available RAM
150 * then we'll thrash horribly. To prevent that scenario, we can sort the
151 * tuples by (expected) bucket number. However, such a sort is useless
152 * overhead when the index does fit in RAM. We choose to sort if the
153 * initial index size exceeds maintenance_work_mem, or the number of
154 * buffers usable for the index, whichever is less. (Limiting by the
155 * number of buffers should reduce thrashing between PG buffers and kernel
156 * buffers, which seems useful even if no physical I/O results. Limiting
157 * by maintenance_work_mem is useful to allow easy testing of the sort
158 * code path, and may be useful to DBAs as an additional control knob.)
159 *
160 * NOTE: this test will need adjustment if a bucket is ever different from
161 * one page. Also, "initial index size" accounting does not include the
162 * metapage, nor the first bitmap page.
163 */
164 sort_threshold = (maintenance_work_mem * (Size) 1024) / BLCKSZ;
165 if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
166 sort_threshold = Min(sort_threshold, NBuffers);
167 else
168 sort_threshold = Min(sort_threshold, NLocBuffer);
169
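 /*
 * Illustrative arithmetic (not part of hash.c): with the stock BLCKSZ of
 * 8192 bytes and the default maintenance_work_mem of 64MB (65536 kB), the
 * expression above yields 65536 * 1024 / 8192 = 8192 bucket pages, which
 * is then capped by NBuffers (or NLocBuffer for a temporary relation).
 * Only when the estimated initial bucket count reaches that threshold is
 * the spool/sort path used.
 */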
170 if (num_buckets >= sort_threshold)
171 buildstate.spool = _h_spoolinit(heap, index, num_buckets);
172 else
173 buildstate.spool = NULL;
174
175 /* prepare to build the index */
176 buildstate.indtuples = 0;
177 buildstate.heapRel = heap;
178
179 /* do the heap scan */
180 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
181 hashbuildCallback,
182 &buildstate, NULL);
183 pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
184 buildstate.indtuples);
185
186 if (buildstate.spool)
187 {
188 /* sort the tuples and insert them into the index */
189 _h_indexbuild(buildstate.spool, buildstate.heapRel);
190 _h_spooldestroy(buildstate.spool);
191 }
192
193 /*
194 * Return statistics
195 */
196 result = palloc_object(IndexBuildResult);
197
198 result->heap_tuples = reltuples;
199 result->index_tuples = buildstate.indtuples;
200
201 return result;
202}
203
204/*
205 * hashbuildempty() -- build an empty hash index in the initialization fork
206 */
207void
208hashbuildempty(Relation index)
209{
210 _hash_init(index, 0, INIT_FORKNUM);
211}
212
213/*
214 * Per-tuple callback for table_index_build_scan
215 */
216static void
217hashbuildCallback(Relation index,
218 ItemPointer tid,
219 Datum *values,
220 bool *isnull,
221 bool tupleIsAlive,
222 void *state)
223{
224 HashBuildState *buildstate = (HashBuildState *) state;
225 Datum index_values[1];
226 bool index_isnull[1];
227 IndexTuple itup;
228
229 /* convert data to a hash key; on failure, do not insert anything */
230 if (!_hash_convert_tuple(index,
231 values, isnull,
232 index_values, index_isnull))
233 return;
234
235 /* Either spool the tuple for sorting, or just put it into the index */
236 if (buildstate->spool)
237 _h_spool(buildstate->spool, tid, index_values, index_isnull);
238 else
239 {
240 /* form an index tuple and point it at the heap tuple */
241 itup = index_form_tuple(RelationGetDescr(index),
242 index_values, index_isnull);
243 itup->t_tid = *tid;
244 _hash_doinsert(index, itup, buildstate->heapRel, false);
245 pfree(itup);
246 }
247
248 buildstate->indtuples += 1;
249}
250
251/*
252 * hashinsert() -- insert an index tuple into a hash table.
253 *
254 * Hash on the heap tuple's key, form an index tuple with hash code.
255 * Find the appropriate location for the new tuple, and put it there.
256 */
257bool
258hashinsert(Relation rel, Datum *values, bool *isnull,
259 ItemPointer ht_ctid, Relation heapRel,
260 IndexUniqueCheck checkUnique,
261 bool indexUnchanged,
262 IndexInfo *indexInfo)
263{
264 Datum index_values[1];
265 bool index_isnull[1];
266 IndexTuple itup;
267
268 /* convert data to a hash key; on failure, do not insert anything */
269 if (!_hash_convert_tuple(rel,
270 values, isnull,
271 index_values, index_isnull))
272 return false;
273
274 /* form an index tuple and point it at the heap tuple */
275 itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
276 itup->t_tid = *ht_ctid;
277
278 _hash_doinsert(rel, itup, heapRel, false);
279
280 pfree(itup);
281
282 return false;
283}
284
285
286/*
287 * hashgettuple() -- Get the next tuple in the scan.
288 */
289bool
290hashgettuple(IndexScanDesc scan, ScanDirection dir)
291{
292 HashScanOpaque so = (HashScanOpaque) scan->opaque;
293 bool res;
294
295 /* Hash indexes are always lossy since we store only the hash code */
296 scan->xs_recheck = true;
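 /*
 * Because only the 4-byte hash code is stored in the index, every match is
 * just a candidate: xs_recheck instructs the executor to re-evaluate the
 * original scan qualifier against the heap tuple before returning it.
 */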
297
298 /*
299 * If we've already initialized this scan, we can just advance it in the
300 * appropriate direction. If we haven't done so yet, we call a routine to
301 * get the first item in the scan.
302 */
303 if (!HashScanPosIsValid(so->currPos))
304 res = _hash_first(scan, dir);
305 else
306 {
307 /*
308 * Check to see if we should kill the previously-fetched tuple.
309 */
310 if (scan->kill_prior_tuple)
311 {
312 /*
313 * Yes, so remember it for later. (We'll deal with all such tuples
314 * at once right after leaving the index page or at end of scan.)
315 * If the caller reverses the indexscan direction, it is quite
316 * possible that the same item might get entered multiple times.
317 * But, we don't detect that; instead, we just forget any excess
318 * entries.
319 */
320 if (so->killedItems == NULL)
321 so->killedItems = palloc_array(int, MaxIndexTuplesPerPage);
322
323 if (so->numKilled < MaxIndexTuplesPerPage)
324 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
325 }
326
327 /*
328 * Now continue the scan.
329 */
330 res = _hash_next(scan, dir);
331 }
332
333 return res;
334}
335
336
337/*
338 * hashgetbitmap() -- get all tuples at once
339 */
340int64
341hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
342{
343 HashScanOpaque so = (HashScanOpaque) scan->opaque;
344 bool res;
345 int64 ntids = 0;
346 HashScanPosItem *currItem;
347
348 res = _hash_first(scan, ForwardScanDirection);
349
350 while (res)
351 {
352 currItem = &so->currPos.items[so->currPos.itemIndex];
353
354 /*
355 * _hash_first and _hash_next eliminate dead index entries
356 * whenever scan->ignore_killed_tuples is true. Therefore, there's
357 * nothing to do here except add the results to the TIDBitmap.
358 */
359 tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
360 ntids++;
361
362 res = _hash_next(scan, ForwardScanDirection);
363 }
364
365 return ntids;
366}
367
368
369/*
370 * hashbeginscan() -- start a scan on a hash index
371 */
372IndexScanDesc
373hashbeginscan(Relation rel, int nkeys, int norderbys)
374{
375 IndexScanDesc scan;
376 HashScanOpaque so;
377
378 /* no order by operators allowed */
379 Assert(norderbys == 0);
380
381 scan = RelationGetIndexScan(rel, nkeys, norderbys);
382
383 so = palloc_object(HashScanOpaqueData);
384 HashScanPosInvalidate(so->currPos);
385 so->hashso_bucket_buf = InvalidBuffer;
386 so->hashso_split_bucket_buf = InvalidBuffer;
387
388 so->hashso_buc_populated = false;
389 so->hashso_buc_split = false;
390
391 so->killedItems = NULL;
392 so->numKilled = 0;
393
394 scan->opaque = so;
395
396 return scan;
397}
398
399/*
400 * hashrescan() -- rescan an index relation
401 */
402void
403hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
404 ScanKey orderbys, int norderbys)
405{
406 HashScanOpaque so = (HashScanOpaque) scan->opaque;
407 Relation rel = scan->indexRelation;
408
409 if (HashScanPosIsValid(so->currPos))
410 {
411 /* Before leaving current page, deal with any killed items */
412 if (so->numKilled > 0)
413 _hash_kill_items(scan);
414 }
415
416 _hash_dropscanbuf(rel, so);
417
418 /* set position invalid (this will cause _hash_first call) */
419 HashScanPosInvalidate(so->currPos);
420
421 /* Update scan key, if a new one is given */
422 if (scankey && scan->numberOfKeys > 0)
423 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
424
425 so->hashso_buc_populated = false;
426 so->hashso_buc_split = false;
427}
428
429/*
430 * hashendscan() -- close down a scan
431 */
432void
433hashendscan(IndexScanDesc scan)
434{
435 HashScanOpaque so = (HashScanOpaque) scan->opaque;
436 Relation rel = scan->indexRelation;
437
438 if (HashScanPosIsValid(so->currPos))
439 {
440 /* Before leaving current page, deal with any killed items */
441 if (so->numKilled > 0)
442 _hash_kill_items(scan);
443 }
444
445 _hash_dropscanbuf(rel, so);
446
447 if (so->killedItems != NULL)
448 pfree(so->killedItems);
449 pfree(so);
450 scan->opaque = NULL;
451}
452
453/*
454 * Bulk deletion of all index entries pointing to a set of heap tuples.
455 * The set of target tuples is specified via a callback routine that tells
456 * whether any given heap tuple (identified by ItemPointer) is being deleted.
457 *
458 * This function also deletes the tuples that are moved by a split to
459 * another bucket.
460 *
461 * Result: a palloc'd struct containing statistical info for VACUUM displays.
462 */
463IndexBulkDeleteResult *
464hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
465 IndexBulkDeleteCallback callback, void *callback_state)
466{
467 Relation rel = info->index;
468 double tuples_removed;
469 double num_index_tuples;
470 double orig_ntuples;
471 Bucket orig_maxbucket;
472 Bucket cur_maxbucket;
473 Bucket cur_bucket;
474 Buffer metabuf = InvalidBuffer;
475 HashMetaPage metap;
476 HashMetaPage cachedmetap;
477
478 tuples_removed = 0;
479 num_index_tuples = 0;
480
481 /*
482 * We need a copy of the metapage so that we can use its hashm_spares[]
483 * values to compute bucket page addresses, but a cached copy should be
484 * good enough. (If not, we'll detect that further down and refresh the
485 * cache as necessary.)
486 */
487 cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
488 Assert(cachedmetap != NULL);
489
490 orig_maxbucket = cachedmetap->hashm_maxbucket;
491 orig_ntuples = cachedmetap->hashm_ntuples;
492
493 /* Scan the buckets that we know exist */
494 cur_bucket = 0;
495 cur_maxbucket = orig_maxbucket;
496
497loop_top:
498 while (cur_bucket <= cur_maxbucket)
499 {
500 BlockNumber bucket_blkno;
501 BlockNumber blkno;
502 Buffer bucket_buf;
503 Buffer buf;
504 HashPageOpaque bucket_opaque;
505 Page page;
506 bool split_cleanup = false;
507
508 /* Get address of bucket's start page */
509 bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
510
511 blkno = bucket_blkno;
512
513 /*
514 * We need to acquire a cleanup lock on the primary bucket page to wait
515 * out concurrent scans before deleting the dead tuples.
516 */
517 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
518 info->strategy);
519 LockBufferForCleanup(buf);
520 _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
521 page = BufferGetPage(buf);
522 bucket_opaque = HashPageGetOpaque(page);
523
524 /*
525 * If the bucket contains tuples that were moved by a split, then we
526 * need to delete such tuples. We can't delete them if the split
527 * operation on the bucket is not finished, as they are needed by scans.
528 */
529 if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
530 H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
531 {
532 split_cleanup = true;
533
534 /*
535 * This bucket might have been split since we last held a lock on
536 * the metapage. If so, hashm_maxbucket, hashm_highmask and
537 * hashm_lowmask might be old enough to cause us to fail to remove
538 * tuples left behind by the most recent split. To prevent that,
539 * now that the primary page of the target bucket has been locked
540 * (and thus can't be further split), check whether we need to
541 * update our cached metapage data.
542 */
543 Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
544 if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
545 {
546 cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
547 Assert(cachedmetap != NULL);
548 }
549 }
550
551 bucket_buf = buf;
552
553 hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
554 cachedmetap->hashm_maxbucket,
555 cachedmetap->hashm_highmask,
556 cachedmetap->hashm_lowmask, &tuples_removed,
557 &num_index_tuples, split_cleanup,
558 callback, callback_state);
559
560 _hash_dropbuf(rel, bucket_buf);
561
562 /* Advance to next bucket */
563 cur_bucket++;
564 }
565
566 if (BufferIsInvalid(metabuf))
567 metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
568
569 /* Write-lock metapage and check for split since we started */
570 LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
571 metap = HashPageGetMeta(BufferGetPage(metabuf));
572
573 if (cur_maxbucket != metap->hashm_maxbucket)
574 {
575 /* There's been a split, so process the additional bucket(s) */
576 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
577 cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
578 Assert(cachedmetap != NULL);
579 cur_maxbucket = cachedmetap->hashm_maxbucket;
580 goto loop_top;
581 }
582
583 /* Okay, we're really done. Update tuple count in metapage. */
584 START_CRIT_SECTION();
585
586 if (orig_maxbucket == metap->hashm_maxbucket &&
587 orig_ntuples == metap->hashm_ntuples)
588 {
589 /*
590 * No one has split or inserted anything since start of scan, so
591 * believe our count as gospel.
592 */
593 metap->hashm_ntuples = num_index_tuples;
594 }
595 else
596 {
597 /*
598 * Otherwise, our count is untrustworthy since we may have
599 * double-scanned tuples in split buckets. Proceed by dead-reckoning.
600 * (Note: we still return estimated_count = false, because using this
601 * count is better than not updating reltuples at all.)
602 */
603 if (metap->hashm_ntuples > tuples_removed)
604 metap->hashm_ntuples -= tuples_removed;
605 else
606 metap->hashm_ntuples = 0;
607 num_index_tuples = metap->hashm_ntuples;
608 }
609
610 MarkBufferDirty(metabuf);
611
612 /* XLOG stuff */
613 if (RelationNeedsWAL(rel))
614 {
615 xl_hash_update_meta_page xlrec;
616 XLogRecPtr recptr;
617
618 xlrec.ntuples = metap->hashm_ntuples;
619
620 XLogBeginInsert();
621 XLogRegisterData(&xlrec, SizeOfHashUpdateMetaPage);
622
623 XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
624
625 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
626 PageSetLSN(BufferGetPage(metabuf), recptr);
627 }
628
629 END_CRIT_SECTION();
630
631 _hash_relbuf(rel, metabuf);
632
633 /* return statistics */
634 if (stats == NULL)
635 stats = palloc0_object(IndexBulkDeleteResult);
636 stats->estimated_count = false;
637 stats->num_index_tuples = num_index_tuples;
638 stats->tuples_removed += tuples_removed;
639 /* hashvacuumcleanup will fill in num_pages */
640
641 return stats;
642}
643
644/*
645 * Post-VACUUM cleanup.
646 *
647 * Result: a palloc'd struct containing statistical info for VACUUM displays.
648 */
649IndexBulkDeleteResult *
650hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
651{
652 Relation rel = info->index;
653 BlockNumber num_pages;
654
655 /* If hashbulkdelete wasn't called, return NULL signifying no change */
656 /* Note: this covers the analyze_only case too */
657 if (stats == NULL)
658 return NULL;
659
660 /* update statistics */
661 num_pages = RelationGetNumberOfBlocks(rel);
662 stats->num_pages = num_pages;
663
664 return stats;
665}
666
667/*
668 * Helper function to perform deletion of index entries from a bucket.
669 *
670 * This function expects that the caller has acquired a cleanup lock on the
671 * primary bucket page, and will return with a write lock again held on the
672 * primary bucket page. The lock won't necessarily be held continuously,
673 * though, because we'll release it when visiting overflow pages.
674 *
675 * There can't be any concurrent scans in progress when we first enter this
676 * function because of the cleanup lock we hold on the primary bucket page,
677 * but as soon as we release that lock, there might be. If those scans got
678 * ahead of our cleanup scan, they might see a tuple before we kill it and
679 * wake up only after VACUUM has completed and the TID has been recycled for
680 * an unrelated tuple. To avoid that calamity, we prevent scans from passing
681 * our cleanup scan by locking the next page in the bucket chain before
682 * releasing the lock on the previous page. (This type of lock chaining is not
683 * ideal, so we might want to look for a better solution at some point.)
684 *
685 * We need to retain a pin on the primary bucket to ensure that no concurrent
686 * split can start.
687 */
688void
689hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
690 BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
691 uint32 maxbucket, uint32 highmask, uint32 lowmask,
692 double *tuples_removed, double *num_index_tuples,
693 bool split_cleanup,
694 IndexBulkDeleteCallback callback, void *callback_state)
695{
696 BlockNumber blkno;
697 Buffer buf;
698 Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
699 bool bucket_dirty = false;
700
701 blkno = bucket_blkno;
702 buf = bucket_buf;
703
704 if (split_cleanup)
705 new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
706 lowmask, maxbucket);
707
708 /* Scan each page in bucket */
709 for (;;)
710 {
711 HashPageOpaque opaque;
712 OffsetNumber offno;
713 OffsetNumber maxoffno;
714 Buffer next_buf;
715 Page page;
716 OffsetNumber deletable[MaxOffsetNumber];
717 int ndeletable = 0;
718 bool retain_pin = false;
719 bool clear_dead_marking = false;
720
721 vacuum_delay_point(false);
722
723 page = BufferGetPage(buf);
724 opaque = HashPageGetOpaque(page);
725
726 /* Scan each tuple in page */
727 maxoffno = PageGetMaxOffsetNumber(page);
728 for (offno = FirstOffsetNumber;
729 offno <= maxoffno;
730 offno = OffsetNumberNext(offno))
731 {
732 ItemPointer htup;
733 IndexTuple itup;
734 Bucket bucket;
735 bool kill_tuple = false;
736
737 itup = (IndexTuple) PageGetItem(page,
738 PageGetItemId(page, offno));
739 htup = &(itup->t_tid);
740
741 /*
742 * To remove the dead tuples, we strictly want to rely on the results
743 * of the callback function. Refer to btvacuumpage for the detailed reason.
744 */
745 if (callback && callback(htup, callback_state))
746 {
747 kill_tuple = true;
748 if (tuples_removed)
749 *tuples_removed += 1;
750 }
751 else if (split_cleanup)
752 {
753 /* delete the tuples that are moved by split. */
754 bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
755 maxbucket,
756 highmask,
757 lowmask);
758 /* mark the item for deletion */
759 if (bucket != cur_bucket)
760 {
761 /*
762 * We expect tuples to either belong to current bucket or
763 * new_bucket. This is ensured because we don't allow
764 * further splits from bucket that contains garbage. See
765 * comments in _hash_expandtable.
766 */
767 Assert(bucket == new_bucket);
768 kill_tuple = true;
769 }
770 }
771
772 if (kill_tuple)
773 {
774 /* mark the item for deletion */
775 deletable[ndeletable++] = offno;
776 }
777 else
778 {
779 /* we're keeping it, so count it */
780 if (num_index_tuples)
781 *num_index_tuples += 1;
782 }
783 }
784
785 /* retain the pin on primary bucket page till end of bucket scan */
786 if (blkno == bucket_blkno)
787 retain_pin = true;
788 else
789 retain_pin = false;
790
791 blkno = opaque->hasho_nextblkno;
792
793 /*
794 * Apply deletions, advance to next page and write page if needed.
795 */
796 if (ndeletable > 0)
797 {
798 /* No ereport(ERROR) until changes are logged */
799 START_CRIT_SECTION();
800
801 PageIndexMultiDelete(page, deletable, ndeletable);
802 bucket_dirty = true;
803
804 /*
805 * Let us mark the page as clean if vacuum removes the DEAD tuples
806 * from an index page. We do this by clearing
807 * LH_PAGE_HAS_DEAD_TUPLES flag.
808 */
809 if (tuples_removed && *tuples_removed > 0 &&
810 H_HAS_DEAD_TUPLES(opaque))
811 {
812 opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
813 clear_dead_marking = true;
814 }
815
816 MarkBufferDirty(buf);
817
818 /* XLOG stuff */
819 if (RelationNeedsWAL(rel))
820 {
821 xl_hash_delete xlrec;
822 XLogRecPtr recptr;
823
824 xlrec.clear_dead_marking = clear_dead_marking;
825 xlrec.is_primary_bucket_page = (buf == bucket_buf);
826
827 XLogBeginInsert();
828 XLogRegisterData(&xlrec, SizeOfHashDelete);
829
830 /*
831 * bucket buffer was not changed, but still needs to be
832 * registered to ensure that we can acquire a cleanup lock on
833 * it during replay.
834 */
835 if (!xlrec.is_primary_bucket_page)
836 {
837 uint8 flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;
838
839 XLogRegisterBuffer(0, bucket_buf, flags);
840 }
841
842 XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
843 XLogRegisterBufData(1, deletable,
844 ndeletable * sizeof(OffsetNumber));
845
846 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
847 PageSetLSN(BufferGetPage(buf), recptr);
848 }
849
850 END_CRIT_SECTION();
851 }
852
853 /* bail out if there are no more pages to scan. */
854 if (!BlockNumberIsValid(blkno))
855 break;
856
857 next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
858 LH_OVERFLOW_PAGE,
859 bstrategy);
860
861 /*
862 * release the lock on previous page after acquiring the lock on next
863 * page
864 */
865 if (retain_pin)
866 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
867 else
868 _hash_relbuf(rel, buf);
869
870 buf = next_buf;
871 }
872
873 /*
874 * Lock the bucket page to clear the garbage flag and squeeze the bucket.
875 * If the current buffer is the same as the bucket buffer, then we already
876 * have a lock on the bucket page.
877 */
878 if (buf != bucket_buf)
879 {
880 _hash_relbuf(rel, buf);
881 LockBufferForCleanup(bucket_buf);
882 }
883
884 /*
885 * Clear the garbage flag from bucket after deleting the tuples that are
886 * moved by split. We purposely clear the flag before squeezing the
887 * bucket, so that after a restart, vacuum won't again try to delete the
888 * moved-by-split tuples.
889 */
890 if (split_cleanup)
891 {
892 HashPageOpaque bucket_opaque;
893 Page page;
894
895 page = BufferGetPage(bucket_buf);
896 bucket_opaque = HashPageGetOpaque(page);
897
898 /* No ereport(ERROR) until changes are logged */
899 START_CRIT_SECTION();
900
901 bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
902 MarkBufferDirty(bucket_buf);
903
904 /* XLOG stuff */
905 if (RelationNeedsWAL(rel))
906 {
907 XLogRecPtr recptr;
908
909 XLogBeginInsert();
910 XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
911
912 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
913 PageSetLSN(page, recptr);
914 }
915
916 END_CRIT_SECTION();
917 }
918
919 /*
920 * If we have deleted anything, try to compact free space. For squeezing
921 * the bucket, we must have a cleanup lock, else it can impact the
922 * ordering of tuples for a scan that has started before it.
923 */
924 if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
925 _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
926 bstrategy);
927 else
928 LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
929}
930
931CompareType
932hashtranslatestrategy(StrategyNumber strategy, Oid opfamily)
933{
934 if (strategy == HTEqualStrategyNumber)
935 return COMPARE_EQ;
936 return COMPARE_INVALID;
937}
938
939StrategyNumber
940hashtranslatecmptype(CompareType cmptype, Oid opfamily)
941{
942 if (cmptype == COMPARE_EQ)
943 return HTEqualStrategyNumber;
944 return InvalidStrategy;
945}