hash.c
1 /*-------------------------------------------------------------------------
2  *
3  * hash.c
4  * Implementation of Margo Seltzer's Hashing package for postgres.
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/hash/hash.c
12  *
13  * NOTES
14  * This file contains only the public interface routines.
15  *
16  *-------------------------------------------------------------------------
17  */
18 
19 #include "postgres.h"
20 
21 #include "access/hash.h"
22 #include "access/hash_xlog.h"
23 #include "access/relscan.h"
24 #include "access/tableam.h"
25 #include "access/xloginsert.h"
26 #include "commands/progress.h"
27 #include "commands/vacuum.h"
28 #include "miscadmin.h"
29 #include "nodes/execnodes.h"
30 #include "optimizer/plancat.h"
31 #include "pgstat.h"
32 #include "utils/fmgrprotos.h"
33 #include "utils/index_selfuncs.h"
34 #include "utils/rel.h"
35 
36 /* Working state for hashbuild and its callback */
37 typedef struct
38 {
39  HSpool *spool; /* NULL if not using spooling */
40  double indtuples; /* # tuples accepted into index */
41  Relation heapRel; /* heap relation descriptor */
42 } HashBuildState;
43 
44 static void hashbuildCallback(Relation index,
45  ItemPointer tid,
46  Datum *values,
47  bool *isnull,
48  bool tupleIsAlive,
49  void *state);
50 
51 
52 /*
53  * Hash handler function: return IndexAmRoutine with access method parameters
54  * and callbacks.
55  */
56 Datum
57 hashhandler(PG_FUNCTION_ARGS)
58 {
59  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
60 
61  amroutine->amstrategies = HTMaxStrategyNumber;
62  amroutine->amsupport = HASHNProcs;
63  amroutine->amoptsprocnum = HASHOPTIONS_PROC;
64  amroutine->amcanorder = false;
65  amroutine->amcanorderbyop = false;
66  amroutine->amcanbackward = true;
67  amroutine->amcanunique = false;
68  amroutine->amcanmulticol = false;
69  amroutine->amoptionalkey = false;
70  amroutine->amsearcharray = false;
71  amroutine->amsearchnulls = false;
72  amroutine->amstorage = false;
73  amroutine->amclusterable = false;
74  amroutine->ampredlocks = true;
75  amroutine->amcanparallel = false;
76  amroutine->amcanbuildparallel = false;
77  amroutine->amcaninclude = false;
78  amroutine->amusemaintenanceworkmem = false;
79  amroutine->amsummarizing = false;
80  amroutine->amparallelvacuumoptions =
81  VACUUM_OPTION_PARALLEL_BULKDEL;
82  amroutine->amkeytype = INT4OID;
83 
84  amroutine->ambuild = hashbuild;
85  amroutine->ambuildempty = hashbuildempty;
86  amroutine->aminsert = hashinsert;
87  amroutine->aminsertcleanup = NULL;
88  amroutine->ambulkdelete = hashbulkdelete;
89  amroutine->amvacuumcleanup = hashvacuumcleanup;
90  amroutine->amcanreturn = NULL;
91  amroutine->amcostestimate = hashcostestimate;
92  amroutine->amoptions = hashoptions;
93  amroutine->amproperty = NULL;
94  amroutine->ambuildphasename = NULL;
95  amroutine->amvalidate = hashvalidate;
96  amroutine->amadjustmembers = hashadjustmembers;
97  amroutine->ambeginscan = hashbeginscan;
98  amroutine->amrescan = hashrescan;
99  amroutine->amgettuple = hashgettuple;
100  amroutine->amgetbitmap = hashgetbitmap;
101  amroutine->amendscan = hashendscan;
102  amroutine->ammarkpos = NULL;
103  amroutine->amrestrpos = NULL;
104  amroutine->amestimateparallelscan = NULL;
105  amroutine->aminitparallelscan = NULL;
106  amroutine->amparallelrescan = NULL;
107 
108  PG_RETURN_POINTER(amroutine);
109 }
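/*
 * Usage note: the built-in "hash" access method's pg_am entry points its
 * amhandler at this function, so the flags and callbacks filled in above are
 * what the planner and executor consult for every hash index.
 */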
110 
111 /*
112  * hashbuild() -- build a new hash index.
113  */
114 IndexBuildResult *
115 hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
116 {
117  IndexBuildResult *result;
118  BlockNumber relpages;
119  double reltuples;
120  double allvisfrac;
121  uint32 num_buckets;
122  long sort_threshold;
123  HashBuildState buildstate;
124 
125  /*
126  * We expect to be called exactly once for any index relation. If that's
127  * not the case, big trouble's what we have.
128  */
129  if (RelationGetNumberOfBlocks(index) != 0)
130  elog(ERROR, "index \"%s\" already contains data",
131  RelationGetRelationName(index));
132 
133  /* Estimate the number of rows currently present in the table */
134  estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
135 
136  /* Initialize the hash index metadata page and initial buckets */
137  num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
138 
139  /*
140  * If we just insert the tuples into the index in scan order, then
141  * (assuming their hash codes are pretty random) there will be no locality
142  * of access to the index, and if the index is bigger than available RAM
143  * then we'll thrash horribly. To prevent that scenario, we can sort the
144  * tuples by (expected) bucket number. However, such a sort is useless
145  * overhead when the index does fit in RAM. We choose to sort if the
146  * initial index size exceeds maintenance_work_mem, or the number of
147  * buffers usable for the index, whichever is less. (Limiting by the
148  * number of buffers should reduce thrashing between PG buffers and kernel
149  * buffers, which seems useful even if no physical I/O results. Limiting
150  * by maintenance_work_mem is useful to allow easy testing of the sort
151  * code path, and may be useful to DBAs as an additional control knob.)
152  *
153  * NOTE: this test will need adjustment if a bucket is ever different from
154  * one page. Also, "initial index size" accounting does not include the
155  * metapage, nor the first bitmap page.
156  */
157  sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
158  if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
159  sort_threshold = Min(sort_threshold, NBuffers);
160  else
161  sort_threshold = Min(sort_threshold, NLocBuffer);
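 /*
  * A worked example of the threshold arithmetic above: with the default
  * BLCKSZ of 8192 bytes and maintenance_work_mem = 64MB, the first limit
  * comes to 8192 bucket pages, and with the default 128MB of shared_buffers
  * NBuffers is 16384, so spooling kicks in once the initial index would
  * exceed 8192 pages (about 64MB).
  */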
162 
163  if (num_buckets >= (uint32) sort_threshold)
164  buildstate.spool = _h_spoolinit(heap, index, num_buckets);
165  else
166  buildstate.spool = NULL;
167 
168  /* prepare to build the index */
169  buildstate.indtuples = 0;
170  buildstate.heapRel = heap;
171 
172  /* do the heap scan */
173  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
174  hashbuildCallback,
175  (void *) &buildstate, NULL);
176  pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
177  buildstate.indtuples);
178 
179  if (buildstate.spool)
180  {
181  /* sort the tuples and insert them into the index */
182  _h_indexbuild(buildstate.spool, buildstate.heapRel);
183  _h_spooldestroy(buildstate.spool);
184  }
185 
186  /*
187  * Return statistics
188  */
189  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
190 
191  result->heap_tuples = reltuples;
192  result->index_tuples = buildstate.indtuples;
193 
194  return result;
195 }
196 
197 /*
198  * hashbuildempty() -- build an empty hash index in the initialization fork
199  */
200 void
201 hashbuildempty(Relation index)
202 {
203  _hash_init(index, 0, INIT_FORKNUM);
204 }
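/*
 * Note: building the empty index in the init fork is what lets unlogged hash
 * indexes survive a crash; during recovery the init fork is copied over the
 * main fork, leaving a valid empty index.
 */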
205 
206 /*
207  * Per-tuple callback for table_index_build_scan
208  */
209 static void
210 hashbuildCallback(Relation index,
211  ItemPointer tid,
212  Datum *values,
213  bool *isnull,
214  bool tupleIsAlive,
215  void *state)
216 {
217  HashBuildState *buildstate = (HashBuildState *) state;
218  Datum index_values[1];
219  bool index_isnull[1];
220  IndexTuple itup;
221 
222  /* convert data to a hash key; on failure, do not insert anything */
223  if (!_hash_convert_tuple(index,
224  values, isnull,
225  index_values, index_isnull))
226  return;
227 
228  /* Either spool the tuple for sorting, or just put it into the index */
229  if (buildstate->spool)
230  _h_spool(buildstate->spool, tid, index_values, index_isnull);
231  else
232  {
233  /* form an index tuple and point it at the heap tuple */
234  itup = index_form_tuple(RelationGetDescr(index),
235  index_values, index_isnull);
236  itup->t_tid = *tid;
237  _hash_doinsert(index, itup, buildstate->heapRel, false);
238  pfree(itup);
239  }
240 
241  buildstate->indtuples += 1;
242 }
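/*
 * Note: indtuples counts only tuples for which _hash_convert_tuple produced
 * a hash key; rows whose key column is NULL are skipped entirely (hash
 * indexes do not store NULLs), so this count can be smaller than the heap
 * tuple total reported by table_index_build_scan.
 */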
243 
244 /*
245  * hashinsert() -- insert an index tuple into a hash table.
246  *
247  * Hash on the heap tuple's key, form an index tuple with hash code.
248  * Find the appropriate location for the new tuple, and put it there.
249  */
250 bool
251 hashinsert(Relation rel, Datum *values, bool *isnull,
252  ItemPointer ht_ctid, Relation heapRel,
253  IndexUniqueCheck checkUnique,
254  bool indexUnchanged,
255  IndexInfo *indexInfo)
256 {
257  Datum index_values[1];
258  bool index_isnull[1];
259  IndexTuple itup;
260 
261  /* convert data to a hash key; on failure, do not insert anything */
262  if (!_hash_convert_tuple(rel,
263  values, isnull,
264  index_values, index_isnull))
265  return false;
266 
267  /* form an index tuple and point it at the heap tuple */
268  itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
269  itup->t_tid = *ht_ctid;
270 
271  _hash_doinsert(rel, itup, heapRel, false);
272 
273  pfree(itup);
274 
275  return false;
276 }
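/*
 * Note: aminsert's return value is only significant for UNIQUE_CHECK_PARTIAL
 * callers, and hash indexes never support uniqueness (amcanunique is set to
 * false above), so hashinsert simply returns false.
 */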
277 
278 
279 /*
280  * hashgettuple() -- Get the next tuple in the scan.
281  */
282 bool
283 hashgettuple(IndexScanDesc scan, ScanDirection dir)
284 {
285  HashScanOpaque so = (HashScanOpaque) scan->opaque;
286  bool res;
287 
288  /* Hash indexes are always lossy since we store only the hash code */
289  scan->xs_recheck = true;
290 
291  /*
292  * If we've already initialized this scan, we can just advance it in the
293  * appropriate direction. If we haven't done so yet, we call a routine to
294  * get the first item in the scan.
295  */
296  if (!HashScanPosIsValid(so->currPos))
297  res = _hash_first(scan, dir);
298  else
299  {
300  /*
301  * Check to see if we should kill the previously-fetched tuple.
302  */
303  if (scan->kill_prior_tuple)
304  {
305  /*
306  * Yes, so remember it for later. (We'll deal with all such tuples
307  * at once right after leaving the index page or at end of scan.)
308  * If the caller reverses the indexscan direction, it is quite
309  * possible that the same item might get entered multiple times.
310  * But we don't detect that; instead, we just forget any excess
311  * entries.
312  */
313  if (so->killedItems == NULL)
314  so->killedItems = (int *)
315  palloc(MaxIndexTuplesPerPage * sizeof(int));
316 
317  if (so->numKilled < MaxIndexTuplesPerPage)
318  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
319  }
320 
321  /*
322  * Now continue the scan.
323  */
324  res = _hash_next(scan, dir);
325  }
326 
327  return res;
328 }
329 
330 
331 /*
332  * hashgetbitmap() -- get all tuples at once
333  */
334 int64
335 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
336 {
337  HashScanOpaque so = (HashScanOpaque) scan->opaque;
338  bool res;
339  int64 ntids = 0;
340  HashScanPosItem *currItem;
341 
342  res = _hash_first(scan, ForwardScanDirection);
343 
344  while (res)
345  {
346  currItem = &so->currPos.items[so->currPos.itemIndex];
347 
348  /*
349  * _hash_first and _hash_next eliminate dead index entries
350  * whenever scan->ignore_killed_tuples is true. Therefore, there's
351  * nothing to do here except add the results to the TIDBitmap.
352  */
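 /*
  * The "true" passed to tbm_add_tuples below is its recheck flag: as with
  * xs_recheck in hashgettuple, the executor must recheck the quals against
  * the heap tuple, because only the hash code is stored in the index.
  */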
353  tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
354  ntids++;
355 
356  res = _hash_next(scan, ForwardScanDirection);
357  }
358 
359  return ntids;
360 }
361 
362 
363 /*
364  * hashbeginscan() -- start a scan on a hash index
365  */
366 IndexScanDesc
367 hashbeginscan(Relation rel, int nkeys, int norderbys)
368 {
369  IndexScanDesc scan;
370  HashScanOpaque so;
371 
372  /* no order by operators allowed */
373  Assert(norderbys == 0);
374 
375  scan = RelationGetIndexScan(rel, nkeys, norderbys);
376 
377  so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
378  HashScanPosInvalidate(so->currPos);
379  so->hashso_bucket_buf = InvalidBuffer;
380  so->hashso_split_bucket_buf = InvalidBuffer;
381 
382  so->hashso_buc_populated = false;
383  so->hashso_buc_split = false;
384 
385  so->killedItems = NULL;
386  so->numKilled = 0;
387 
388  scan->opaque = so;
389 
390  return scan;
391 }
392 
393 /*
394  * hashrescan() -- rescan an index relation
395  */
396 void
397 hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
398  ScanKey orderbys, int norderbys)
399 {
400  HashScanOpaque so = (HashScanOpaque) scan->opaque;
401  Relation rel = scan->indexRelation;
402 
403  if (HashScanPosIsValid(so->currPos))
404  {
405  /* Before leaving current page, deal with any killed items */
406  if (so->numKilled > 0)
407  _hash_kill_items(scan);
408  }
409 
410  _hash_dropscanbuf(rel, so);
411 
412  /* set position invalid (this will cause _hash_first call) */
413  HashScanPosInvalidate(so->currPos);
414 
415  /* Update scan key, if a new one is given */
416  if (scankey && scan->numberOfKeys > 0)
417  {
418  memmove(scan->keyData,
419  scankey,
420  scan->numberOfKeys * sizeof(ScanKeyData));
421  }
422 
423  so->hashso_buc_populated = false;
424  so->hashso_buc_split = false;
425 }
426 
427 /*
428  * hashendscan() -- close down a scan
429  */
430 void
431 hashendscan(IndexScanDesc scan)
432 {
433  HashScanOpaque so = (HashScanOpaque) scan->opaque;
434  Relation rel = scan->indexRelation;
435 
436  if (HashScanPosIsValid(so->currPos))
437  {
438  /* Before leaving current page, deal with any killed items */
439  if (so->numKilled > 0)
440  _hash_kill_items(scan);
441  }
442 
443  _hash_dropscanbuf(rel, so);
444 
445  if (so->killedItems != NULL)
446  pfree(so->killedItems);
447  pfree(so);
448  scan->opaque = NULL;
449 }
450 
451 /*
452  * Bulk deletion of all index entries pointing to a set of heap tuples.
453  * The set of target tuples is specified via a callback routine that tells
454  * whether any given heap tuple (identified by ItemPointer) is being deleted.
455  *
456  * This function also deletes the tuples that are moved by split to other
457  * bucket.
458  *
459  * Result: a palloc'd struct containing statistical info for VACUUM displays.
460  */
461 IndexBulkDeleteResult *
462 hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
463  IndexBulkDeleteCallback callback, void *callback_state)
464 {
465  Relation rel = info->index;
466  double tuples_removed;
467  double num_index_tuples;
468  double orig_ntuples;
469  Bucket orig_maxbucket;
470  Bucket cur_maxbucket;
471  Bucket cur_bucket;
472  Buffer metabuf = InvalidBuffer;
473  HashMetaPage metap;
474  HashMetaPage cachedmetap;
475 
476  tuples_removed = 0;
477  num_index_tuples = 0;
478 
479  /*
480  * We need a copy of the metapage so that we can use its hashm_spares[]
481  * values to compute bucket page addresses, but a cached copy should be
482  * good enough. (If not, we'll detect that further down and refresh the
483  * cache as necessary.)
484  */
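 /*
  * (BUCKET_TO_BLKNO is why those hashm_spares values matter: roughly
  * speaking, a bucket's primary page lives at bucket number + the
  * overflow/bitmap pages allocated before its split point + 1 for the
  * metapage.)
  */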
485  cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
486  Assert(cachedmetap != NULL);
487 
488  orig_maxbucket = cachedmetap->hashm_maxbucket;
489  orig_ntuples = cachedmetap->hashm_ntuples;
490 
491  /* Scan the buckets that we know exist */
492  cur_bucket = 0;
493  cur_maxbucket = orig_maxbucket;
494 
495 loop_top:
496  while (cur_bucket <= cur_maxbucket)
497  {
498  BlockNumber bucket_blkno;
499  BlockNumber blkno;
500  Buffer bucket_buf;
501  Buffer buf;
502  HashPageOpaque bucket_opaque;
503  Page page;
504  bool split_cleanup = false;
505 
506  /* Get address of bucket's start page */
507  bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
508 
509  blkno = bucket_blkno;
510 
511  /*
512  * We need to acquire a cleanup lock on the primary bucket page to wait
513  * out concurrent scans before deleting the dead tuples.
514  */
515  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
516  LockBufferForCleanup(buf);
517  _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
518 
519  page = BufferGetPage(buf);
520  bucket_opaque = HashPageGetOpaque(page);
521 
522  /*
523  * If the bucket contains tuples that were moved by split, then we need
524  * to delete such tuples. We can't delete them while the split
525  * operation on the bucket is unfinished, as they are still needed by scans.
526  */
527  if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
528  H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
529  {
530  split_cleanup = true;
531 
532  /*
533  * This bucket might have been split since we last held a lock on
534  * the metapage. If so, hashm_maxbucket, hashm_highmask and
535  * hashm_lowmask might be old enough to cause us to fail to remove
536  * tuples left behind by the most recent split. To prevent that,
537  * now that the primary page of the target bucket has been locked
538  * (and thus can't be further split), check whether we need to
539  * update our cached metapage data.
540  */
541  Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
542  if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
543  {
544  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
545  Assert(cachedmetap != NULL);
546  }
547  }
548 
549  bucket_buf = buf;
550 
551  hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
552  cachedmetap->hashm_maxbucket,
553  cachedmetap->hashm_highmask,
554  cachedmetap->hashm_lowmask, &tuples_removed,
555  &num_index_tuples, split_cleanup,
556  callback, callback_state);
557 
558  _hash_dropbuf(rel, bucket_buf);
559 
560  /* Advance to next bucket */
561  cur_bucket++;
562  }
563 
564  if (BufferIsInvalid(metabuf))
565  metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
566 
567  /* Write-lock metapage and check for split since we started */
568  LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
569  metap = HashPageGetMeta(BufferGetPage(metabuf));
570 
571  if (cur_maxbucket != metap->hashm_maxbucket)
572  {
573  /* There's been a split, so process the additional bucket(s) */
574  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
575  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
576  Assert(cachedmetap != NULL);
577  cur_maxbucket = cachedmetap->hashm_maxbucket;
578  goto loop_top;
579  }
580 
581  /* Okay, we're really done. Update tuple count in metapage. */
582  START_CRIT_SECTION();
583 
584  if (orig_maxbucket == metap->hashm_maxbucket &&
585  orig_ntuples == metap->hashm_ntuples)
586  {
587  /*
588  * No one has split or inserted anything since start of scan, so
589  * believe our count as gospel.
590  */
591  metap->hashm_ntuples = num_index_tuples;
592  }
593  else
594  {
595  /*
596  * Otherwise, our count is untrustworthy since we may have
597  * double-scanned tuples in split buckets. Proceed by dead-reckoning.
598  * (Note: we still return estimated_count = false, because using this
599  * count is better than not updating reltuples at all.)
600  */
601  if (metap->hashm_ntuples > tuples_removed)
602  metap->hashm_ntuples -= tuples_removed;
603  else
604  metap->hashm_ntuples = 0;
605  num_index_tuples = metap->hashm_ntuples;
606  }
607 
608  MarkBufferDirty(metabuf);
609 
610  /* XLOG stuff */
611  if (RelationNeedsWAL(rel))
612  {
613  xl_hash_update_meta_page xlrec;
614  XLogRecPtr recptr;
615 
616  xlrec.ntuples = metap->hashm_ntuples;
617 
618  XLogBeginInsert();
619  XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
620 
621  XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
622 
623  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
624  PageSetLSN(BufferGetPage(metabuf), recptr);
625  }
626 
627  END_CRIT_SECTION();
628 
629  _hash_relbuf(rel, metabuf);
630 
631  /* return statistics */
632  if (stats == NULL)
633  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
634  stats->estimated_count = false;
635  stats->num_index_tuples = num_index_tuples;
636  stats->tuples_removed += tuples_removed;
637  /* hashvacuumcleanup will fill in num_pages */
638 
639  return stats;
640 }
641 
642 /*
643  * Post-VACUUM cleanup.
644  *
645  * Result: a palloc'd struct containing statistical info for VACUUM displays.
646  */
647 IndexBulkDeleteResult *
648 hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
649 {
650  Relation rel = info->index;
651  BlockNumber num_pages;
652 
653  /* If hashbulkdelete wasn't called, return NULL signifying no change */
654  /* Note: this covers the analyze_only case too */
655  if (stats == NULL)
656  return NULL;
657 
658  /* update statistics */
659  num_pages = RelationGetNumberOfBlocks(rel);
660  stats->num_pages = num_pages;
661 
662  return stats;
663 }
664 
665 /*
666  * Helper function to perform deletion of index entries from a bucket.
667  *
668  * This function expects that the caller has acquired a cleanup lock on the
669  * primary bucket page, and will return with a write lock again held on the
670  * primary bucket page. The lock won't necessarily be held continuously,
671  * though, because we'll release it when visiting overflow pages.
672  *
673  * There can't be any concurrent scans in progress when we first enter this
674  * function because of the cleanup lock we hold on the primary bucket page,
675  * but as soon as we release that lock, there might be. If those scans got
676  * ahead of our cleanup scan, they might see a tuple before we kill it and
677  * wake up only after VACUUM has completed and the TID has been recycled for
678  * an unrelated tuple. To avoid that calamity, we prevent scans from passing
679  * our cleanup scan by locking the next page in the bucket chain before
680  * releasing the lock on the previous page. (This type of lock chaining is not
681  * ideal, so we might want to look for a better solution at some point.)
682  *
683  * We need to retain a pin on the primary bucket to ensure that no concurrent
684  * split can start.
685  */
686 void
687 hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
688  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
689  uint32 maxbucket, uint32 highmask, uint32 lowmask,
690  double *tuples_removed, double *num_index_tuples,
691  bool split_cleanup,
692  IndexBulkDeleteCallback callback, void *callback_state)
693 {
694  BlockNumber blkno;
695  Buffer buf;
696  Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
697  bool bucket_dirty = false;
698 
699  blkno = bucket_blkno;
700  buf = bucket_buf;
701 
702  if (split_cleanup)
703  new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
704  lowmask, maxbucket);
705 
706  /* Scan each page in bucket */
707  for (;;)
708  {
709  HashPageOpaque opaque;
710  OffsetNumber offno;
711  OffsetNumber maxoffno;
712  Buffer next_buf;
713  Page page;
714  OffsetNumber deletable[MaxOffsetNumber];
715  int ndeletable = 0;
716  bool retain_pin = false;
717  bool clear_dead_marking = false;
718 
719  vacuum_delay_point();
720 
721  page = BufferGetPage(buf);
722  opaque = HashPageGetOpaque(page);
723 
724  /* Scan each tuple in page */
725  maxoffno = PageGetMaxOffsetNumber(page);
726  for (offno = FirstOffsetNumber;
727  offno <= maxoffno;
728  offno = OffsetNumberNext(offno))
729  {
730  ItemPointer htup;
731  IndexTuple itup;
732  Bucket bucket;
733  bool kill_tuple = false;
734 
735  itup = (IndexTuple) PageGetItem(page,
736  PageGetItemId(page, offno));
737  htup = &(itup->t_tid);
738 
739  /*
740  * To remove the dead tuples, we strictly want to rely on the results
741  * of the callback function; see btvacuumpage for the detailed reason.
742  */
743  if (callback && callback(htup, callback_state))
744  {
745  kill_tuple = true;
746  if (tuples_removed)
747  *tuples_removed += 1;
748  }
749  else if (split_cleanup)
750  {
751  /* delete the tuples that are moved by split. */
752  bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
753  maxbucket,
754  highmask,
755  lowmask);
756  /* mark the item for deletion */
757  if (bucket != cur_bucket)
758  {
759  /*
760  * We expect tuples to belong either to the current bucket or to
761  * new_bucket. This is ensured because we don't allow
762  * further splits from a bucket that contains garbage. See
763  * comments in _hash_expandtable.
764  */
765  Assert(bucket == new_bucket);
766  kill_tuple = true;
767  }
768  }
769 
770  if (kill_tuple)
771  {
772  /* mark the item for deletion */
773  deletable[ndeletable++] = offno;
774  }
775  else
776  {
777  /* we're keeping it, so count it */
778  if (num_index_tuples)
779  *num_index_tuples += 1;
780  }
781  }
782 
783  /* retain the pin on primary bucket page till end of bucket scan */
784  if (blkno == bucket_blkno)
785  retain_pin = true;
786  else
787  retain_pin = false;
788 
789  blkno = opaque->hasho_nextblkno;
790 
791  /*
792  * Apply deletions, advance to next page and write page if needed.
793  */
794  if (ndeletable > 0)
795  {
796  /* No ereport(ERROR) until changes are logged */
797  START_CRIT_SECTION();
798 
799  PageIndexMultiDelete(page, deletable, ndeletable);
800  bucket_dirty = true;
801 
802  /*
803  * Let us mark the page as clean if vacuum removes the DEAD tuples
804  * from an index page. We do this by clearing
805  * LH_PAGE_HAS_DEAD_TUPLES flag.
806  */
807  if (tuples_removed && *tuples_removed > 0 &&
808  H_HAS_DEAD_TUPLES(opaque))
809  {
810  opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
811  clear_dead_marking = true;
812  }
813 
814  MarkBufferDirty(buf);
815 
816  /* XLOG stuff */
817  if (RelationNeedsWAL(rel))
818  {
819  xl_hash_delete xlrec;
820  XLogRecPtr recptr;
821 
822  xlrec.clear_dead_marking = clear_dead_marking;
823  xlrec.is_primary_bucket_page = (buf == bucket_buf);
824 
825  XLogBeginInsert();
826  XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
827 
828  /*
829  * bucket buffer was not changed, but still needs to be
830  * registered to ensure that we can acquire a cleanup lock on
831  * it during replay.
832  */
833  if (!xlrec.is_primary_bucket_page)
834  {
835  uint8 flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;
836 
837  XLogRegisterBuffer(0, bucket_buf, flags);
838  }
839 
840  XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
841  XLogRegisterBufData(1, (char *) deletable,
842  ndeletable * sizeof(OffsetNumber));
843 
844  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
845  PageSetLSN(BufferGetPage(buf), recptr);
846  }
847 
848  END_CRIT_SECTION();
849  }
850 
851  /* bail out if there are no more pages to scan. */
852  if (!BlockNumberIsValid(blkno))
853  break;
854 
855  next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
856  LH_OVERFLOW_PAGE,
857  bstrategy);
858 
859  /*
860  * release the lock on previous page after acquiring the lock on next
861  * page
862  */
863  if (retain_pin)
864  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
865  else
866  _hash_relbuf(rel, buf);
867 
868  buf = next_buf;
869  }
870 
871  /*
872  * Lock the bucket page to clear the garbage flag and squeeze the bucket.
873  * If the current buffer is the same as the bucket buffer, then we already
874  * have a lock on the bucket page.
875  */
876  if (buf != bucket_buf)
877  {
878  _hash_relbuf(rel, buf);
879  LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
880  }
881 
882  /*
883  * Clear the garbage flag from the bucket after deleting the tuples that
884  * were moved by split. We purposefully clear the flag before squeezing the
885  * bucket, so that after a restart vacuum doesn't again try to delete the
886  * moved-by-split tuples.
887  */
888  if (split_cleanup)
889  {
890  HashPageOpaque bucket_opaque;
891  Page page;
892 
893  page = BufferGetPage(bucket_buf);
894  bucket_opaque = HashPageGetOpaque(page);
895 
896  /* No ereport(ERROR) until changes are logged */
897  START_CRIT_SECTION();
898 
899  bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
900  MarkBufferDirty(bucket_buf);
901 
902  /* XLOG stuff */
903  if (RelationNeedsWAL(rel))
904  {
905  XLogRecPtr recptr;
906 
907  XLogBeginInsert();
908  XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
909 
910  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
911  PageSetLSN(page, recptr);
912  }
913 
914  END_CRIT_SECTION();
915  }
916 
917  /*
918  * If we have deleted anything, try to compact free space. For squeezing
919  * the bucket, we must have a cleanup lock, else it can impact the
920  * ordering of tuples for a scan that has started before it.
921  */
922  if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
923  _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
924  bstrategy);
925  else
926  LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
927 }