PostgreSQL Source Code  git master
hash.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * hash.c
4  * Implementation of Margo Seltzer's Hashing package for postgres.
5  *
6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/hash/hash.c
12  *
13  * NOTES
14  * This file contains only the public interface routines.
15  *
16  *-------------------------------------------------------------------------
17  */
18 
19 #include "postgres.h"
20 
21 #include "access/hash.h"
22 #include "access/hash_xlog.h"
23 #include "access/relscan.h"
24 #include "access/tableam.h"
25 #include "access/xloginsert.h"
26 #include "catalog/index.h"
27 #include "commands/progress.h"
28 #include "commands/vacuum.h"
29 #include "miscadmin.h"
30 #include "optimizer/plancat.h"
31 #include "pgstat.h"
32 #include "utils/builtins.h"
33 #include "utils/index_selfuncs.h"
34 #include "utils/rel.h"
35 
36 /*
 * Working state for hashbuild and its callback.
 *
 * hashbuild() fills the fields in before the heap scan and passes the
 * struct's address to table_index_build_scan(); hashbuildCallback() gets it
 * back as its "state" argument and casts it to HashBuildState.
 *
 * NOTE(review): listing line 42, which closes this typedef, is absent from
 * this copy.
 */
37 typedef struct
38 {
39  HSpool *spool; /* NULL if not using spooling */
40  double indtuples; /* # tuples accepted into index */
41  Relation heapRel; /* heap relation descriptor */
43 
/*
 * Forward declaration of the per-tuple build callback that is defined
 * further down in this file.
 */
44 static void hashbuildCallback(Relation index,
45  ItemPointer tid,
46  Datum *values,
47  bool *isnull,
48  bool tupleIsAlive,
49  void *state);
50 
51 
52 /*
53  * Hash handler function: return IndexAmRoutine with access method parameters
54  * and callbacks.
 *
 * Three groups of fields are filled in on amroutine before it is returned
 * with PG_RETURN_POINTER: the strategy/support-procedure counts, the boolean
 * capability flags, and the function pointers for the hash entry points.
 * Entry points assigned NULL below are ones this access method does not
 * supply.
 *
 * NOTE(review): listing lines 57, 59, 80, 87 and 94 are absent from this
 * copy, so the declaration line, the definition of amroutine and the value
 * assigned to amparallelvacuumoptions are not visible here.
55  */
56 Datum
58 {
60 
 /* strategy and support-procedure numbering */
61  amroutine->amstrategies = HTMaxStrategyNumber;
62  amroutine->amsupport = HASHNProcs;
63  amroutine->amoptsprocnum = HASHOPTIONS_PROC;
 /* capability flags */
64  amroutine->amcanorder = false;
65  amroutine->amcanorderbyop = false;
66  amroutine->amcanbackward = true;
67  amroutine->amcanunique = false;
68  amroutine->amcanmulticol = false;
69  amroutine->amoptionalkey = false;
70  amroutine->amsearcharray = false;
71  amroutine->amsearchnulls = false;
72  amroutine->amstorage = false;
73  amroutine->amclusterable = false;
74  amroutine->ampredlocks = true;
75  amroutine->amcanparallel = false;
76  amroutine->amcaninclude = false;
77  amroutine->amusemaintenanceworkmem = false;
78  amroutine->amsummarizing = false;
79  amroutine->amparallelvacuumoptions =
81  amroutine->amkeytype = INT4OID;
82 
 /* build, insert, vacuum and planner-support entry points */
83  amroutine->ambuild = hashbuild;
84  amroutine->ambuildempty = hashbuildempty;
85  amroutine->aminsert = hashinsert;
86  amroutine->ambulkdelete = hashbulkdelete;
88  amroutine->amcanreturn = NULL;
89  amroutine->amcostestimate = hashcostestimate;
90  amroutine->amoptions = hashoptions;
91  amroutine->amproperty = NULL;
92  amroutine->ambuildphasename = NULL;
93  amroutine->amvalidate = hashvalidate;
 /* scan entry points; mark/restore and parallel scan are not provided */
95  amroutine->ambeginscan = hashbeginscan;
96  amroutine->amrescan = hashrescan;
97  amroutine->amgettuple = hashgettuple;
98  amroutine->amgetbitmap = hashgetbitmap;
99  amroutine->amendscan = hashendscan;
100  amroutine->ammarkpos = NULL;
101  amroutine->amrestrpos = NULL;
102  amroutine->amestimateparallelscan = NULL;
103  amroutine->aminitparallelscan = NULL;
104  amroutine->amparallelrescan = NULL;
105 
106  PG_RETURN_POINTER(amroutine);
107 }
108 
109 /*
110  * hashbuild() -- build a new hash index.
 *
 * heap is the table being indexed, index is the new hash index relation and
 * indexInfo is passed through to table_index_build_scan().
 *
 * The index is first initialized by _hash_init() from an estimate of the
 * table's row count, then the heap is scanned. Tuples are either inserted
 * as they arrive or spooled and sorted first; the long comment ahead of the
 * sort_threshold computation explains how that choice is made.
 *
 * Returns a palloc'd IndexBuildResult holding the tuple count returned by
 * the heap scan and the number of tuples accepted into the index.
 *
 * NOTE(review): listing lines 112, 113, 127, 129, 172 and 174 are absent
 * from this copy, so the declaration, the condition guarding the elog()
 * call and parts of the statements around the heap scan are not visible
 * here.
111  */
114 {
115  IndexBuildResult *result;
116  BlockNumber relpages;
117  double reltuples;
118  double allvisfrac;
119  uint32 num_buckets;
120  long sort_threshold;
121  HashBuildState buildstate;
122 
123  /*
124  * We expect to be called exactly once for any index relation. If that's
125  * not the case, big trouble's what we have.
126  */
128  elog(ERROR, "index \"%s\" already contains data",
130 
131  /* Estimate the number of rows currently present in the table */
132  estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
133 
134  /* Initialize the hash index metadata page and initial buckets */
135  num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
136 
137  /*
138  * If we just insert the tuples into the index in scan order, then
139  * (assuming their hash codes are pretty random) there will be no locality
140  * of access to the index, and if the index is bigger than available RAM
141  * then we'll thrash horribly. To prevent that scenario, we can sort the
142  * tuples by (expected) bucket number. However, such a sort is useless
143  * overhead when the index does fit in RAM. We choose to sort if the
144  * initial index size exceeds maintenance_work_mem, or the number of
145  * buffers usable for the index, whichever is less. (Limiting by the
146  * number of buffers should reduce thrashing between PG buffers and kernel
147  * buffers, which seems useful even if no physical I/O results. Limiting
148  * by maintenance_work_mem is useful to allow easy testing of the sort
149  * code path, and may be useful to DBAs as an additional control knob.)
150  *
151  * NOTE: this test will need adjustment if a bucket is ever different from
152  * one page. Also, "initial index size" accounting does not include the
153  * metapage, nor the first bitmap page.
154  */
155  sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
156  if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
157  sort_threshold = Min(sort_threshold, NBuffers);
158  else
159  sort_threshold = Min(sort_threshold, NLocBuffer);
160 
 /* spool (and later sort) only when the bucket count reaches the threshold */
161  if (num_buckets >= (uint32) sort_threshold)
162  buildstate.spool = _h_spoolinit(heap, index, num_buckets);
163  else
164  buildstate.spool = NULL;
165 
166  /* prepare to build the index */
167  buildstate.indtuples = 0;
168  buildstate.heapRel = heap;
169 
170  /* do the heap scan */
171  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
173  (void *) &buildstate, NULL);
175  buildstate.indtuples);
176 
177  if (buildstate.spool)
178  {
179  /* sort the tuples and insert them into the index */
180  _h_indexbuild(buildstate.spool, buildstate.heapRel);
181  _h_spooldestroy(buildstate.spool);
182  }
183 
184  /*
185  * Return statistics
186  */
187  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
188 
189  result->heap_tuples = reltuples;
190  result->index_tuples = buildstate.indtuples;
191 
192  return result;
193 }
194 
195 /*
196  * hashbuildempty() -- build an empty hash index in the initialization fork
 *
 * NOTE(review): listing lines 199 and 201 (the declaration line and the
 * only body line) are absent from this copy. The cross-reference index at
 * the end of this listing gives the signature as
 * "void hashbuildempty(Relation index)".
197  */
198 void
200 {
202 }
203 
204 /*
205  * Per-tuple callback for table_index_build_scan
 *
 * state points at the HashBuildState prepared by hashbuild(). The tuple's
 * values are first converted to a hash key; if that conversion fails the
 * tuple is skipped and not counted. Otherwise the entry is handed to the
 * spool when one exists, or formed into an index tuple pointing at tid and
 * inserted right away. indtuples is incremented once per accepted tuple.
 *
 * tupleIsAlive is not referenced in the visible code.
 *
 * NOTE(review): listing lines 208, 221 and 232 are absent from this copy,
 * so the function-name line and the starts of the conversion call and of
 * the index-tuple construction are not visible here.
206  */
207 static void
209  ItemPointer tid,
210  Datum *values,
211  bool *isnull,
212  bool tupleIsAlive,
213  void *state)
214 {
215  HashBuildState *buildstate = (HashBuildState *) state;
216  Datum index_values[1];
217  bool index_isnull[1];
218  IndexTuple itup;
219 
220  /* convert data to a hash key; on failure, do not insert anything */
222  values, isnull,
223  index_values, index_isnull))
224  return;
225 
226  /* Either spool the tuple for sorting, or just put it into the index */
227  if (buildstate->spool)
228  _h_spool(buildstate->spool, tid, index_values, index_isnull);
229  else
230  {
231  /* form an index tuple and point it at the heap tuple */
233  index_values, index_isnull);
234  itup->t_tid = *tid;
235  _hash_doinsert(index, itup, buildstate->heapRel, false);
 /* the tuple is freed here right after the insert call */
236  pfree(itup);
237  }
238 
239  buildstate->indtuples += 1;
240 }
241 
242 /*
243  * hashinsert() -- insert an index tuple into a hash table.
244  *
245  * Hash on the heap tuple's key, form an index tuple with hash code.
246  * Find the appropriate location for the new tuple, and put it there.
247  */
248 bool
249 hashinsert(Relation rel, Datum *values, bool *isnull,
250  ItemPointer ht_ctid, Relation heapRel,
251  IndexUniqueCheck checkUnique,
252  bool indexUnchanged,
253  IndexInfo *indexInfo)
254 {
255  Datum index_values[1];
256  bool index_isnull[1];
257  IndexTuple itup;
258 
259  /* convert data to a hash key; on failure, do not insert anything */
260  if (!_hash_convert_tuple(rel,
261  values, isnull,
262  index_values, index_isnull))
263  return false;
264 
265  /* form an index tuple and point it at the heap tuple */
266  itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
267  itup->t_tid = *ht_ctid;
268 
269  _hash_doinsert(rel, itup, heapRel, false);
270 
271  pfree(itup);
272 
273  return false;
274 }
275 
276 
277 /*
278  * hashgettuple() -- Get the next tuple in the scan.
 *
 * Always sets scan->xs_recheck, then either starts the scan with
 * _hash_first() or advances it with _hash_next() in direction dir. Before
 * advancing, a previous tuple flagged through scan->kill_prior_tuple is
 * recorded in so->killedItems for later processing.
 *
 * Returns whatever _hash_first() or _hash_next() returned.
 *
 * NOTE(review): listing lines 281, 294 and 315 are absent from this copy,
 * so the declaration line, the test that selects between _hash_first() and
 * _hash_next(), and the line directly ahead of the killedItems store are
 * not visible here.
279  */
280 bool
282 {
283  HashScanOpaque so = (HashScanOpaque) scan->opaque;
284  bool res;
285 
286  /* Hash indexes are always lossy since we store only the hash code */
287  scan->xs_recheck = true;
288 
289  /*
290  * If we've already initialized this scan, we can just advance it in the
291  * appropriate direction. If we haven't done so yet, we call a routine to
292  * get the first item in the scan.
293  */
295  res = _hash_first(scan, dir);
296  else
297  {
298  /*
299  * Check to see if we should kill the previously-fetched tuple.
300  */
301  if (scan->kill_prior_tuple)
302  {
303  /*
304  * Yes, so remember it for later. (We'll deal with all such tuples
305  * at once right after leaving the index page or at end of scan.)
306  * If the caller reverses the indexscan direction, it is quite
307  * possible that the same item might get entered multiple times.
308  * But, we don't detect that; instead, we just forget any excess
309  * entries.
310  */
 /* the array is allocated lazily, on the first kill request */
311  if (so->killedItems == NULL)
312  so->killedItems = (int *)
313  palloc(MaxIndexTuplesPerPage * sizeof(int));
314 
316  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
317  }
318 
319  /*
320  * Now continue the scan.
321  */
322  res = _hash_next(scan, dir);
323  }
324 
325  return res;
326 }
327 
328 
329 /*
330  * hashgetbitmap() -- get all tuples at once
 *
 * For as long as res stays true, the heap TID of the scan's current item is
 * added to tbm and counted.
 *
 * Returns the number of TIDs added to the bitmap.
 *
 * NOTE(review): listing lines 333, 340 and 354 are absent from this copy,
 * so the declaration line and both assignments to res (before the loop and
 * at the end of each iteration) are not visible here. The cross-reference
 * index at the end of this listing gives the signature as
 * "int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)".
331  */
332 int64
334 {
335  HashScanOpaque so = (HashScanOpaque) scan->opaque;
336  bool res;
337  int64 ntids = 0;
338  HashScanPosItem *currItem;
339 
341 
342  while (res)
343  {
344  currItem = &so->currPos.items[so->currPos.itemIndex];
345 
346  /*
347  * _hash_first and _hash_next eliminate dead index entries
348  * whenever scan->ignore_killed_tuples is true. Therefore, there's
349  * nothing to do here except add the results to the TIDBitmap.
350  */
351  tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
352  ntids++;
353 
355  }
356 
357  return ntids;
358 }
359 
360 
361 /*
362  * hashbeginscan() -- start a scan on a hash index
 *
 * Obtains a scan descriptor from RelationGetIndexScan(), allocates the
 * hash-specific HashScanOpaqueData, resets its bucket flags and killed-item
 * bookkeeping, and attaches it as scan->opaque. norderbys must be 0.
 *
 * Returns the new scan descriptor.
 *
 * NOTE(review): listing line 364 (the return-type line) and lines 376-378
 * (directly after the palloc of so) are absent from this copy.
363  */
365 hashbeginscan(Relation rel, int nkeys, int norderbys)
366 {
367  IndexScanDesc scan;
368  HashScanOpaque so;
369 
370  /* no order by operators allowed */
371  Assert(norderbys == 0);
372 
373  scan = RelationGetIndexScan(rel, nkeys, norderbys);
374 
375  so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
379 
380  so->hashso_buc_populated = false;
381  so->hashso_buc_split = false;
382 
 /* killedItems is allocated on demand by hashgettuple() */
383  so->killedItems = NULL;
384  so->numKilled = 0;
385 
386  scan->opaque = so;
387 
388  return scan;
389 }
390 
391 /*
392  * hashrescan() -- rescan an index relation
 *
 * Processes any pending killed items, drops the scan's buffers, and resets
 * the bucket flags. When scankey is non-NULL and the scan has keys, the new
 * key is copied into scan->keyData; the copy length comes from the scan's
 * own numberOfKeys.
 *
 * nscankeys, orderbys and norderbys are not referenced in the visible code.
 *
 * NOTE(review): listing lines 401 and 411 are absent from this copy, so the
 * condition guarding the first block and the statement that follows the
 * "set position invalid" comment are not visible here.
393  */
394 void
395 hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
396  ScanKey orderbys, int norderbys)
397 {
398  HashScanOpaque so = (HashScanOpaque) scan->opaque;
399  Relation rel = scan->indexRelation;
400 
402  {
403  /* Before leaving current page, deal with any killed items */
404  if (so->numKilled > 0)
405  _hash_kill_items(scan);
406  }
407 
408  _hash_dropscanbuf(rel, so);
409 
410  /* set position invalid (this will cause _hash_first call) */
412 
413  /* Update scan key, if a new one is given */
414  if (scankey && scan->numberOfKeys > 0)
415  {
416  memmove(scan->keyData,
417  scankey,
418  scan->numberOfKeys * sizeof(ScanKeyData));
419  }
420 
421  so->hashso_buc_populated = false;
422  so->hashso_buc_split = false;
423 }
424 
425 /*
426  * hashendscan() -- close down a scan
 *
 * Processes any pending killed items, drops the scan's buffers, frees the
 * killedItems array (if one was allocated) and the opaque scan state, and
 * clears scan->opaque.
 *
 * NOTE(review): listing lines 429 and 434 are absent from this copy, so the
 * function-name line and the condition guarding the first block are not
 * visible here. The cross-reference index at the end of this listing gives
 * the signature as "void hashendscan(IndexScanDesc scan)".
427  */
428 void
430 {
431  HashScanOpaque so = (HashScanOpaque) scan->opaque;
432  Relation rel = scan->indexRelation;
433 
435  {
436  /* Before leaving current page, deal with any killed items */
437  if (so->numKilled > 0)
438  _hash_kill_items(scan);
439  }
440 
441  _hash_dropscanbuf(rel, so);
442 
443  if (so->killedItems != NULL)
444  pfree(so->killedItems);
445  pfree(so);
446  scan->opaque = NULL;
447 }
448 
449 /*
450  * Bulk deletion of all index entries pointing to a set of heap tuples.
451  * The set of target tuples is specified via a callback routine that tells
452  * whether any given heap tuple (identified by ItemPointer) is being deleted.
453  *
454  * This function also deletes the tuples that are moved by split to other
455  * bucket.
456  *
 * info supplies the index relation (info->index) and the buffer access
 * strategy (info->strategy). callback and callback_state are passed on
 * unchanged to hashbucketcleanup() for every bucket. The counts gathered
 * here are stored into stats before it is returned; tuples_removed is
 * added to the value already present there.
 *
 * Every bucket from 0 up to the known maximum is cleaned in turn. If the
 * metapage then shows a different maximum bucket, the loop is re-entered
 * to process the additional buckets before the tuple count in the metapage
 * is updated.
 *
457  * Result: a palloc'd struct containing statistical info for VACUUM displays.
 *
 * NOTE(review): listing lines 459, 460, 514, 515, 563, 566, 580, 611, 625
 * and 631 are absent from this copy. Among the things not visible here are
 * the start of the declaration, the statements directly after the
 * ReadBufferExtended() call, the bodies following "if
 * (BufferIsInvalid(metabuf))" and "if (stats == NULL)", and the
 * declaration of xlrec.
458  */
461  IndexBulkDeleteCallback callback, void *callback_state)
462 {
463  Relation rel = info->index;
464  double tuples_removed;
465  double num_index_tuples;
466  double orig_ntuples;
467  Bucket orig_maxbucket;
468  Bucket cur_maxbucket;
469  Bucket cur_bucket;
470  Buffer metabuf = InvalidBuffer;
471  HashMetaPage metap;
472  HashMetaPage cachedmetap;
473 
474  tuples_removed = 0;
475  num_index_tuples = 0;
476 
477  /*
478  * We need a copy of the metapage so that we can use its hashm_spares[]
479  * values to compute bucket page addresses, but a cached copy should be
480  * good enough. (If not, we'll detect that further down and refresh the
481  * cache as necessary.)
482  */
483  cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
484  Assert(cachedmetap != NULL);
485 
 /* remembered so we can later tell whether anything changed meanwhile */
486  orig_maxbucket = cachedmetap->hashm_maxbucket;
487  orig_ntuples = cachedmetap->hashm_ntuples;
488 
489  /* Scan the buckets that we know exist */
490  cur_bucket = 0;
491  cur_maxbucket = orig_maxbucket;
492 
493 loop_top:
494  while (cur_bucket <= cur_maxbucket)
495  {
496  BlockNumber bucket_blkno;
497  BlockNumber blkno;
498  Buffer bucket_buf;
499  Buffer buf;
500  HashPageOpaque bucket_opaque;
501  Page page;
502  bool split_cleanup = false;
503 
504  /* Get address of bucket's start page */
505  bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
506 
507  blkno = bucket_blkno;
508 
509  /*
510  * We need to acquire a cleanup lock on the primary bucket page to wait
511  * out concurrent scans before deleting the dead tuples.
512  */
513  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
516 
517  page = BufferGetPage(buf);
518  bucket_opaque = HashPageGetOpaque(page);
519 
520  /*
521  * If the bucket contains tuples that are moved by split, then we need
522  * to delete such tuples. We can't delete such tuples if the split
523  * operation on bucket is not finished as those are needed by scans.
524  */
525  if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
526  H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
527  {
528  split_cleanup = true;
529 
530  /*
531  * This bucket might have been split since we last held a lock on
532  * the metapage. If so, hashm_maxbucket, hashm_highmask and
533  * hashm_lowmask might be old enough to cause us to fail to remove
534  * tuples left behind by the most recent split. To prevent that,
535  * now that the primary page of the target bucket has been locked
536  * (and thus can't be further split), check whether we need to
537  * update our cached metapage data.
538  */
539  Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
540  if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
541  {
542  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
543  Assert(cachedmetap != NULL);
544  }
545  }
546 
547  bucket_buf = buf;
548 
549  hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
550  cachedmetap->hashm_maxbucket,
551  cachedmetap->hashm_highmask,
552  cachedmetap->hashm_lowmask, &tuples_removed,
553  &num_index_tuples, split_cleanup,
554  callback, callback_state);
555 
556  _hash_dropbuf(rel, bucket_buf);
557 
558  /* Advance to next bucket */
559  cur_bucket++;
560  }
561 
562  if (BufferIsInvalid(metabuf))
564 
565  /* Write-lock metapage and check for split since we started */
567  metap = HashPageGetMeta(BufferGetPage(metabuf));
568 
569  if (cur_maxbucket != metap->hashm_maxbucket)
570  {
571  /* There's been a split, so process the additional bucket(s) */
572  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
573  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
574  Assert(cachedmetap != NULL);
575  cur_maxbucket = cachedmetap->hashm_maxbucket;
 /* cur_bucket is not reset, so only the new buckets get scanned */
576  goto loop_top;
577  }
578 
579  /* Okay, we're really done. Update tuple count in metapage. */
581 
582  if (orig_maxbucket == metap->hashm_maxbucket &&
583  orig_ntuples == metap->hashm_ntuples)
584  {
585  /*
586  * No one has split or inserted anything since start of scan, so
587  * believe our count as gospel.
588  */
589  metap->hashm_ntuples = num_index_tuples;
590  }
591  else
592  {
593  /*
594  * Otherwise, our count is untrustworthy since we may have
595  * double-scanned tuples in split buckets. Proceed by dead-reckoning.
596  * (Note: we still return estimated_count = false, because using this
597  * count is better than not updating reltuples at all.)
598  */
 /* clamp at zero rather than letting the count go negative */
599  if (metap->hashm_ntuples > tuples_removed)
600  metap->hashm_ntuples -= tuples_removed;
601  else
602  metap->hashm_ntuples = 0;
603  num_index_tuples = metap->hashm_ntuples;
604  }
605 
606  MarkBufferDirty(metabuf);
607 
608  /* XLOG stuff */
609  if (RelationNeedsWAL(rel))
610  {
612  XLogRecPtr recptr;
613 
614  xlrec.ntuples = metap->hashm_ntuples;
615 
616  XLogBeginInsert();
617  XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
618 
619  XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
620 
621  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
622  PageSetLSN(BufferGetPage(metabuf), recptr);
623  }
624 
626 
627  _hash_relbuf(rel, metabuf);
628 
629  /* return statistics */
630  if (stats == NULL)
632  stats->estimated_count = false;
633  stats->num_index_tuples = num_index_tuples;
634  stats->tuples_removed += tuples_removed;
635  /* hashvacuumcleanup will fill in num_pages */
636 
637  return stats;
638 }
639 
640 /*
641  * Post-VACUUM cleanup.
642  *
 * When stats is NULL, NULL is returned unchanged. Otherwise the current
 * number of blocks in the index relation (info->index) is stored in
 * stats->num_pages and stats is returned.
 *
643  * Result: a palloc'd struct containing statistical info for VACUUM displays.
 *
 * NOTE(review): listing lines 645 and 646 (the declaration) are absent from
 * this copy. The cross-reference index at the end of this listing gives the
 * signature as "IndexBulkDeleteResult * hashvacuumcleanup(IndexVacuumInfo
 * *info, IndexBulkDeleteResult *stats)".
644  */
647 {
648  Relation rel = info->index;
649  BlockNumber num_pages;
650 
651  /* If hashbulkdelete wasn't called, return NULL signifying no change */
652  /* Note: this covers the analyze_only case too */
653  if (stats == NULL)
654  return NULL;
655 
656  /* update statistics */
657  num_pages = RelationGetNumberOfBlocks(rel);
658  stats->num_pages = num_pages;
659 
660  return stats;
661 }
662 
663 /*
664  * Helper function to perform deletion of index entries from a bucket.
665  *
666  * This function expects that the caller has acquired a cleanup lock on the
667  * primary bucket page, and will return with a write lock again held on the
668  * primary bucket page. The lock won't necessarily be held continuously,
669  * though, because we'll release it when visiting overflow pages.
670  *
 * NOTE(review): the last statement of this function releases the lock on
 * bucket_buf (BUFFER_LOCK_UNLOCK) when no squeeze is performed, which does
 * not match "return with a write lock again held" above -- confirm the
 * intended contract and reword.
 *
671  * There can't be any concurrent scans in progress when we first enter this
672  * function because of the cleanup lock we hold on the primary bucket page,
673  * but as soon as we release that lock, there might be. If those scans got
674  * ahead of our cleanup scan, they might see a tuple before we kill it and
675  * wake up only after VACUUM has completed and the TID has been recycled for
676  * an unrelated tuple. To avoid that calamity, we prevent scans from passing
677  * our cleanup scan by locking the next page in the bucket chain before
678  * releasing the lock on the previous page. (This type of lock chaining is not
679  * ideal, so we might want to look for a better solution at some point.)
680  *
681  * We need to retain a pin on the primary bucket to ensure that no concurrent
682  * split can start.
 *
 * Arguments, as used below:
 *   cur_bucket, bucket_buf, bucket_blkno -- the bucket being cleaned, the
 *     buffer of its primary page, and that page's block number.
 *   bstrategy -- buffer access strategy used when reading further pages of
 *     the bucket and when squeezing it.
 *   maxbucket, highmask, lowmask -- metapage values used to work out which
 *     bucket a tuple belongs to during split cleanup.
 *   tuples_removed, num_index_tuples -- optional counters; each is updated
 *     only when the pointer is non-NULL.
 *   split_cleanup -- when true, tuples that no longer map to cur_bucket are
 *     deleted as well and the bucket's needs-split-cleanup flag is cleared.
 *   callback, callback_state -- optional; when callback is non-NULL, a
 *     tuple is killed if callback(heap TID, callback_state) returns true.
 *
 * NOTE(review): listing lines 694, 717, 750, 795, 808, 812, 831, 833, 841,
 * 849, 857, 890 and 907 are absent from this copy. Among the things not
 * visible here are the declaration of new_bucket, the start of the
 * statement that assigns bucket, and the statement executed when
 * retain_pin is true.
683  */
684 void
685 hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
686  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
687  uint32 maxbucket, uint32 highmask, uint32 lowmask,
688  double *tuples_removed, double *num_index_tuples,
689  bool split_cleanup,
690  IndexBulkDeleteCallback callback, void *callback_state)
691 {
692  BlockNumber blkno;
693  Buffer buf;
695  bool bucket_dirty = false;
696 
697  blkno = bucket_blkno;
698  buf = bucket_buf;
699 
 /* new_bucket is the bucket that moved-by-split tuples should map to */
700  if (split_cleanup)
701  new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
702  lowmask, maxbucket);
703 
704  /* Scan each page in bucket */
705  for (;;)
706  {
707  HashPageOpaque opaque;
708  OffsetNumber offno;
709  OffsetNumber maxoffno;
710  Buffer next_buf;
711  Page page;
712  OffsetNumber deletable[MaxOffsetNumber];
713  int ndeletable = 0;
714  bool retain_pin = false;
715  bool clear_dead_marking = false;
716 
718 
719  page = BufferGetPage(buf);
720  opaque = HashPageGetOpaque(page);
721 
722  /* Scan each tuple in page */
723  maxoffno = PageGetMaxOffsetNumber(page);
724  for (offno = FirstOffsetNumber;
725  offno <= maxoffno;
726  offno = OffsetNumberNext(offno))
727  {
728  ItemPointer htup;
729  IndexTuple itup;
730  Bucket bucket;
731  bool kill_tuple = false;
732 
733  itup = (IndexTuple) PageGetItem(page,
734  PageGetItemId(page, offno));
735  htup = &(itup->t_tid);
736 
737  /*
738  * To remove the dead tuples, we strictly want to rely on results
739  * of callback function. See btvacuumpage for the detailed reason.
740  */
741  if (callback && callback(htup, callback_state))
742  {
743  kill_tuple = true;
744  if (tuples_removed)
745  *tuples_removed += 1;
746  }
747  else if (split_cleanup)
748  {
749  /* delete the tuples that are moved by split. */
751  maxbucket,
752  highmask,
753  lowmask);
754  /* a tuple that maps to another bucket was moved by the split */
755  if (bucket != cur_bucket)
756  {
757  /*
758  * We expect tuples to either belong to current bucket or
759  * new_bucket. This is ensured because we don't allow
760  * further splits from bucket that contains garbage. See
761  * comments in _hash_expandtable.
762  */
763  Assert(bucket == new_bucket);
764  kill_tuple = true;
765  }
766  }
767 
768  if (kill_tuple)
769  {
770  /* mark the item for deletion */
771  deletable[ndeletable++] = offno;
772  }
773  else
774  {
775  /* we're keeping it, so count it */
776  if (num_index_tuples)
777  *num_index_tuples += 1;
778  }
779  }
780 
781  /* retain the pin on primary bucket page till end of bucket scan */
782  if (blkno == bucket_blkno)
783  retain_pin = true;
784  else
785  retain_pin = false;
786 
 /* from here on blkno names the next page in the chain, if any */
787  blkno = opaque->hasho_nextblkno;
788 
789  /*
790  * Apply deletions, advance to next page and write page if needed.
791  */
792  if (ndeletable > 0)
793  {
794  /* No ereport(ERROR) until changes are logged */
796 
797  PageIndexMultiDelete(page, deletable, ndeletable);
798  bucket_dirty = true;
799 
800  /*
801  * Let us mark the page as clean if vacuum removes the DEAD tuples
802  * from an index page. We do this by clearing
803  * LH_PAGE_HAS_DEAD_TUPLES flag.
804  */
805  if (tuples_removed && *tuples_removed > 0 &&
806  H_HAS_DEAD_TUPLES(opaque))
807  {
809  clear_dead_marking = true;
810  }
811 
813 
814  /* XLOG stuff */
815  if (RelationNeedsWAL(rel))
816  {
817  xl_hash_delete xlrec;
818  XLogRecPtr recptr;
819 
820  xlrec.clear_dead_marking = clear_dead_marking;
821  xlrec.is_primary_bucket_page = (buf == bucket_buf);
822 
823  XLogBeginInsert();
824  XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
825 
826  /*
827  * bucket buffer needs to be registered to ensure that we can
828  * acquire a cleanup lock on it during replay.
829  */
830  if (!xlrec.is_primary_bucket_page)
832 
834  XLogRegisterBufData(1, (char *) deletable,
835  ndeletable * sizeof(OffsetNumber));
836 
837  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
838  PageSetLSN(BufferGetPage(buf), recptr);
839  }
840 
842  }
843 
844  /* bail out if there are no more pages to scan. */
845  if (!BlockNumberIsValid(blkno))
846  break;
847 
848  next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
850  bstrategy);
851 
852  /*
853  * release the lock on previous page after acquiring the lock on next
854  * page
855  */
856  if (retain_pin)
858  else
859  _hash_relbuf(rel, buf);
860 
861  buf = next_buf;
862  }
863 
864  /*
865  * lock the bucket page to clear the garbage flag and squeeze the bucket.
866  * if the current buffer is same as bucket buffer, then we already have
867  * lock on bucket page.
868  */
869  if (buf != bucket_buf)
870  {
871  _hash_relbuf(rel, buf);
872  LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
873  }
874 
875  /*
876  * Clear the garbage flag from bucket after deleting the tuples that are
877  * moved by split. We purposefully clear the flag before squeeze bucket,
878  * so that after restart, vacuum shouldn't again try to delete the moved
879  * by split tuples.
880  */
881  if (split_cleanup)
882  {
883  HashPageOpaque bucket_opaque;
884  Page page;
885 
886  page = BufferGetPage(bucket_buf);
887  bucket_opaque = HashPageGetOpaque(page);
888 
889  /* No ereport(ERROR) until changes are logged */
891 
892  bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
893  MarkBufferDirty(bucket_buf);
894 
895  /* XLOG stuff */
896  if (RelationNeedsWAL(rel))
897  {
898  XLogRecPtr recptr;
899 
900  XLogBeginInsert();
901  XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
902 
903  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
904  PageSetLSN(page, recptr);
905  }
906 
908  }
909 
910  /*
911  * If we have deleted anything, try to compact free space. For squeezing
912  * the bucket, we must have a cleanup lock, else it can impact the
913  * ordering of tuples for a scan that has started before it.
914  */
915  if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
916  _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
917  bstrategy);
918  else
919  LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
920 }
void pgstat_progress_update_param(int index, int64 val)
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static Datum values[MAXATTR]
Definition: bootstrap.c:156
int Buffer
Definition: buf.h:23
#define BufferIsInvalid(buffer)
Definition: buf.h:31
#define InvalidBuffer
Definition: buf.h:25
bool IsBufferCleanupOK(Buffer buffer)
Definition: bufmgr.c:5012
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2111
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:4795
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4715
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:755
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:157
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:227
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:350
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:159
@ RBM_NORMAL
Definition: bufmgr.h:44
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1161
Pointer Page
Definition: bufpage.h:78
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:351
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:240
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:388
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:369
unsigned int uint32
Definition: c.h:495
#define Min(x, y)
Definition: c.h:993
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:171
#define ERROR
Definition: elog.h:39
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:81
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition: genam.h:87
IndexUniqueCheck
Definition: genam.h:116
int NBuffers
Definition: globals.c:136
int maintenance_work_mem
Definition: globals.c:127
bool hashinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition: hash.c:249
IndexBulkDeleteResult * hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition: hash.c:646
Datum hashhandler(PG_FUNCTION_ARGS)
Definition: hash.c:57
IndexBulkDeleteResult * hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition: hash.c:460
bool hashgettuple(IndexScanDesc scan, ScanDirection dir)
Definition: hash.c:281
static void hashbuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition: hash.c:208
void hashbuildempty(Relation index)
Definition: hash.c:199
IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys)
Definition: hash.c:365
void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition: hash.c:395
void hashendscan(IndexScanDesc scan)
Definition: hash.c:429
IndexBuildResult * hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: hash.c:113
int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition: hash.c:333
void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, uint32 maxbucket, uint32 highmask, uint32 lowmask, double *tuples_removed, double *num_index_tuples, bool split_cleanup, IndexBulkDeleteCallback callback, void *callback_state)
Definition: hash.c:685
#define HASH_NOLOCK
Definition: hash.h:341
#define HashPageGetOpaque(page)
Definition: hash.h:88
#define LH_BUCKET_PAGE
Definition: hash.h:55
#define HASH_WRITE
Definition: hash.h:340
#define H_BUCKET_BEING_SPLIT(opaque)
Definition: hash.h:91
#define LH_META_PAGE
Definition: hash.h:57
#define HashScanPosInvalidate(scanpos)
Definition: hash.h:144
#define HashPageGetMeta(page)
Definition: hash.h:323
#define BUCKET_TO_BLKNO(metap, B)
Definition: hash.h:39
#define HASH_METAPAGE
Definition: hash.h:198
#define H_HAS_DEAD_TUPLES(opaque)
Definition: hash.h:93
#define H_NEEDS_SPLIT_CLEANUP(opaque)
Definition: hash.h:90
uint32 Bucket
Definition: hash.h:35
#define LH_BUCKET_NEEDS_SPLIT_CLEANUP
Definition: hash.h:60
#define HASHNProcs
Definition: hash.h:358
HashScanOpaqueData * HashScanOpaque
Definition: hash.h:192
#define HASHOPTIONS_PROC
Definition: hash.h:357
#define HashScanPosIsValid(scanpos)
Definition: hash.h:137
#define LH_PAGE_HAS_DEAD_TUPLES
Definition: hash.h:61
#define LH_OVERFLOW_PAGE
Definition: hash.h:54
#define InvalidBucket
Definition: hash.h:37
#define XLOG_HASH_SPLIT_CLEANUP
Definition: hash_xlog.h:37
#define XLOG_HASH_UPDATE_META_PAGE
Definition: hash_xlog.h:38
#define SizeOfHashDelete
Definition: hash_xlog.h:186
#define XLOG_HASH_DELETE
Definition: hash_xlog.h:36
#define SizeOfHashUpdateMetaPage
Definition: hash_xlog.h:200
void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel, bool sorted)
Definition: hashinsert.c:40
void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Buffer bucket_buf, BufferAccessStrategy bstrategy)
Definition: hashovfl.c:806
HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
Definition: hashpage.c:1501
void _hash_relbuf(Relation rel, Buffer buf)
Definition: hashpage.c:266
uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
Definition: hashpage.c:327
void _hash_dropbuf(Relation rel, Buffer buf)
Definition: hashpage.c:277
void _hash_dropscanbuf(Relation rel, HashScanOpaque so)
Definition: hashpage.c:289
Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
Definition: hashpage.c:70
Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, int access, int flags, BufferAccessStrategy bstrategy)
Definition: hashpage.c:239
bool _hash_first(IndexScanDesc scan, ScanDirection dir)
Definition: hashsearch.c:288
bool _hash_next(IndexScanDesc scan, ScanDirection dir)
Definition: hashsearch.c:48
void _h_indexbuild(HSpool *hspool, Relation heapRel)
Definition: hashsort.c:120
HSpool * _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
Definition: hashsort.c:60
void _h_spool(HSpool *hspool, ItemPointer self, Datum *values, bool *isnull)
Definition: hashsort.c:109
void _h_spooldestroy(HSpool *hspool)
Definition: hashsort.c:99
uint32 _hash_get_indextuple_hashkey(IndexTuple itup)
Definition: hashutil.c:292
Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask)
Definition: hashutil.c:126
void _hash_checkpage(Relation rel, Buffer buf, int flags)
Definition: hashutil.c:211
Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, uint32 lowmask, uint32 maxbucket)
Definition: hashutil.c:495
void _hash_kill_items(IndexScanDesc scan)
Definition: hashutil.c:537
bytea * hashoptions(Datum reloptions, bool validate)
Definition: hashutil.c:276
bool _hash_convert_tuple(Relation index, Datum *user_values, bool *user_isnull, Datum *index_values, bool *index_isnull)
Definition: hashutil.c:319
void hashadjustmembers(Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
Definition: hashvalidate.c:352
bool hashvalidate(Oid opclassoid)
Definition: hashvalidate.c:47
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:44
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
IndexTupleData * IndexTuple
Definition: itup.h:53
#define MaxIndexTuplesPerPage
Definition: itup.h:165
Assert(fmt[strlen(fmt) - 1] !='\n')
int NLocBuffer
Definition: localbuf.c:42
void pfree(void *pointer)
Definition: mcxt.c:1456
void * palloc0(Size size)
Definition: mcxt.c:1257
void * palloc(Size size)
Definition: mcxt.c:1226
#define START_CRIT_SECTION()
Definition: miscadmin.h:148
#define END_CRIT_SECTION()
Definition: miscadmin.h:150
#define makeNode(_type_)
Definition: nodes.h:176
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define FirstOffsetNumber
Definition: off.h:27
#define MaxOffsetNumber
Definition: off.h:28
static char * buf
Definition: pg_test_fsync.c:67
void estimate_rel_size(Relation rel, int32 *attr_widths, BlockNumber *pages, double *tuples, double *allvisfrac)
Definition: plancat.c:1013
uintptr_t Datum
Definition: postgres.h:64
#define PROGRESS_CREATEIDX_TUPLES_TOTAL
Definition: progress.h:86
#define RelationGetDescr(relation)
Definition: rel.h:530
#define RelationGetRelationName(relation)
Definition: rel.h:538
#define RelationNeedsWAL(relation)
Definition: rel.h:629
@ MAIN_FORKNUM
Definition: relpath.h:50
@ INIT_FORKNUM
Definition: relpath.h:53
ScanDirection
Definition: sdir.h:25
@ ForwardScanDirection
Definition: sdir.h:28
void hashcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition: selfuncs.c:6973
#define HTMaxStrategyNumber
Definition: stratnum.h:43
HSpool * spool
Definition: hash.c:39
Relation heapRel
Definition: hash.c:41
double indtuples
Definition: hash.c:40
uint32 hashm_lowmask
Definition: hash.h:256
uint32 hashm_maxbucket
Definition: hash.h:254
double hashm_ntuples
Definition: hash.h:248
uint32 hashm_highmask
Definition: hash.h:255
BlockNumber hasho_nextblkno
Definition: hash.h:80
uint16 hasho_flag
Definition: hash.h:82
BlockNumber hasho_prevblkno
Definition: hash.h:79
bool hashso_buc_split
Definition: hash.h:180
HashScanPosData currPos
Definition: hash.h:189
bool hashso_buc_populated
Definition: hash.h:174
Buffer hashso_split_bucket_buf
Definition: hash.h:171
Buffer hashso_bucket_buf
Definition: hash.h:164
int * killedItems
Definition: hash.h:182
HashScanPosItem items[MaxIndexTuplesPerPage]
Definition: hash.h:127
int itemIndex
Definition: hash.h:125
ambuildphasename_function ambuildphasename
Definition: amapi.h:270
ambuildempty_function ambuildempty
Definition: amapi.h:262
amvacuumcleanup_function amvacuumcleanup
Definition: amapi.h:265
bool amclusterable
Definition: amapi.h:238
amoptions_function amoptions
Definition: amapi.h:268
amestimateparallelscan_function amestimateparallelscan
Definition: amapi.h:282
amrestrpos_function amrestrpos
Definition: amapi.h:279
aminsert_function aminsert
Definition: amapi.h:263
amendscan_function amendscan
Definition: amapi.h:277
uint16 amoptsprocnum
Definition: amapi.h:218
amparallelrescan_function amparallelrescan
Definition: amapi.h:284
Oid amkeytype
Definition: amapi.h:252
bool ampredlocks
Definition: amapi.h:240
uint16 amsupport
Definition: amapi.h:216
amcostestimate_function amcostestimate
Definition: amapi.h:267
bool amcanorderbyop
Definition: amapi.h:222
amadjustmembers_function amadjustmembers
Definition: amapi.h:272
ambuild_function ambuild
Definition: amapi.h:261
bool amstorage
Definition: amapi.h:236
uint16 amstrategies
Definition: amapi.h:214
bool amoptionalkey
Definition: amapi.h:230
amgettuple_function amgettuple
Definition: amapi.h:275
amcanreturn_function amcanreturn
Definition: amapi.h:266
bool amcanunique
Definition: amapi.h:226
amgetbitmap_function amgetbitmap
Definition: amapi.h:276
amproperty_function amproperty
Definition: amapi.h:269
ambulkdelete_function ambulkdelete
Definition: amapi.h:264
bool amsearcharray
Definition: amapi.h:232
bool amsummarizing
Definition: amapi.h:248
amvalidate_function amvalidate
Definition: amapi.h:271
ammarkpos_function ammarkpos
Definition: amapi.h:278
bool amcanmulticol
Definition: amapi.h:228
bool amusemaintenanceworkmem
Definition: amapi.h:246
ambeginscan_function ambeginscan
Definition: amapi.h:273
bool amcanparallel
Definition: amapi.h:242
amrescan_function amrescan
Definition: amapi.h:274
bool amcanorder
Definition: amapi.h:220
aminitparallelscan_function aminitparallelscan
Definition: amapi.h:283
uint8 amparallelvacuumoptions
Definition: amapi.h:250
bool amcanbackward
Definition: amapi.h:224
bool amcaninclude
Definition: amapi.h:244
bool amsearchnulls
Definition: amapi.h:234
double heap_tuples
Definition: genam.h:32
double index_tuples
Definition: genam.h:33
bool estimated_count
Definition: genam.h:78
BlockNumber num_pages
Definition: genam.h:77
double tuples_removed
Definition: genam.h:80
double num_index_tuples
Definition: genam.h:79
struct ScanKeyData * keyData
Definition: relscan.h:122
bool kill_prior_tuple
Definition: relscan.h:128
Relation indexRelation
Definition: relscan.h:118
ItemPointerData t_tid
Definition: itup.h:37
Relation index
Definition: genam.h:46
BufferAccessStrategy strategy
Definition: genam.h:53
Definition: type.h:95
Definition: regguts.h:323
bool clear_dead_marking
Definition: hash_xlog.h:180
bool is_primary_bucket_page
Definition: hash_xlog.h:182
static double table_index_build_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1772
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
Definition: test_ifaddrs.c:46
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:376
void vacuum_delay_point(void)
Definition: vacuum.c:2322
#define VACUUM_OPTION_PARALLEL_BULKDEL
Definition: vacuum.h:47
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void XLogRegisterData(char *data, uint32 len)
Definition: xloginsert.c:351
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:461
void XLogRegisterBufData(uint8 block_id, char *data, uint32 len)
Definition: xloginsert.c:392
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:243
void XLogBeginInsert(void)
Definition: xloginsert.c:150
#define REGBUF_STANDARD
Definition: xloginsert.h:34
#define REGBUF_NO_IMAGE
Definition: xloginsert.h:32