hash.c
/*-------------------------------------------------------------------------
 *
 * hash.c
 *    Implementation of Margo Seltzer's Hashing package for postgres.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hash.c
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "optimizer/plancat.h"
#include "pgstat.h"
#include "utils/fmgrprotos.h"
#include "utils/index_selfuncs.h"
#include "utils/rel.h"

/* Working state for hashbuild and its callback */
typedef struct
{
    HSpool     *spool;          /* NULL if not using spooling */
    double      indtuples;      /* # tuples accepted into index */
    Relation    heapRel;        /* heap relation descriptor */
} HashBuildState;

static void hashbuildCallback(Relation index,
                              ItemPointer tid,
                              Datum *values,
                              bool *isnull,
                              bool tupleIsAlive,
                              void *state);


/*
 * Hash handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
hashhandler(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    amroutine->amstrategies = HTMaxStrategyNumber;
    amroutine->amsupport = HASHNProcs;
    amroutine->amoptsprocnum = HASHOPTIONS_PROC;
    amroutine->amcanorder = false;
    amroutine->amcanorderbyop = false;
    amroutine->amcanbackward = true;
    amroutine->amcanunique = false;
    amroutine->amcanmulticol = false;
    amroutine->amoptionalkey = false;
    amroutine->amsearcharray = false;
    amroutine->amsearchnulls = false;
    amroutine->amstorage = false;
    amroutine->amclusterable = false;
    amroutine->ampredlocks = true;
    amroutine->amcanparallel = false;
    amroutine->amcanbuildparallel = false;
    amroutine->amcaninclude = false;
    amroutine->amusemaintenanceworkmem = false;
    amroutine->amsummarizing = false;
    amroutine->amparallelvacuumoptions =
        VACUUM_OPTION_PARALLEL_BULKDEL;
    amroutine->amkeytype = INT4OID;
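    /*
     * (Editorial note: amkeytype is INT4OID because hash index tuples store
     * only the 32-bit hash code of the indexed key, never the original key
     * value; this is also why amcanreturn is NULL below, as the index cannot
     * reconstruct the key for index-only scans.)
     */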

    amroutine->ambuild = hashbuild;
    amroutine->ambuildempty = hashbuildempty;
    amroutine->aminsert = hashinsert;
    amroutine->aminsertcleanup = NULL;
    amroutine->ambulkdelete = hashbulkdelete;
    amroutine->amvacuumcleanup = hashvacuumcleanup;
    amroutine->amcanreturn = NULL;
    amroutine->amcostestimate = hashcostestimate;
    amroutine->amgettreeheight = NULL;
    amroutine->amoptions = hashoptions;
    amroutine->amproperty = NULL;
    amroutine->ambuildphasename = NULL;
    amroutine->amvalidate = hashvalidate;
    amroutine->amadjustmembers = hashadjustmembers;
    amroutine->ambeginscan = hashbeginscan;
    amroutine->amrescan = hashrescan;
    amroutine->amgettuple = hashgettuple;
    amroutine->amgetbitmap = hashgetbitmap;
    amroutine->amendscan = hashendscan;
    amroutine->ammarkpos = NULL;
    amroutine->amrestrpos = NULL;
    amroutine->amestimateparallelscan = NULL;
    amroutine->aminitparallelscan = NULL;
    amroutine->amparallelrescan = NULL;

    PG_RETURN_POINTER(amroutine);
}

/*
 * hashbuild() -- build a new hash index.
 */
IndexBuildResult *
hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
    IndexBuildResult *result;
    BlockNumber relpages;
    double      reltuples;
    double      allvisfrac;
    uint32      num_buckets;
    long        sort_threshold;
    HashBuildState buildstate;

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /* Estimate the number of rows currently present in the table */
    estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);

    /* Initialize the hash index metadata page and initial buckets */
    num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);

    /*
     * If we just insert the tuples into the index in scan order, then
     * (assuming their hash codes are pretty random) there will be no locality
     * of access to the index, and if the index is bigger than available RAM
     * then we'll thrash horribly. To prevent that scenario, we can sort the
     * tuples by (expected) bucket number. However, such a sort is useless
     * overhead when the index does fit in RAM. We choose to sort if the
     * initial index size exceeds maintenance_work_mem, or the number of
     * buffers usable for the index, whichever is less. (Limiting by the
     * number of buffers should reduce thrashing between PG buffers and kernel
     * buffers, which seems useful even if no physical I/O results. Limiting
     * by maintenance_work_mem is useful to allow easy testing of the sort
     * code path, and may be useful to DBAs as an additional control knob.)
     *
     * NOTE: this test will need adjustment if a bucket is ever different from
     * one page. Also, "initial index size" accounting does not include the
     * metapage, nor the first bitmap page.
     */
    sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
    if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
        sort_threshold = Min(sort_threshold, NBuffers);
    else
        sort_threshold = Min(sort_threshold, NLocBuffer);

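    /*
     * (Worked example, assuming default settings for illustration: with
     * maintenance_work_mem = 64MB = 65536kB and BLCKSZ = 8192, the
     * memory-based threshold is 65536 * 1024 / 8192 = 8192 bucket pages,
     * further capped by NBuffers, or NLocBuffer for temp relations. A build
     * expecting 8192 or more initial buckets would then take the sorted
     * path below.)
     */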
    if (num_buckets >= (uint32) sort_threshold)
        buildstate.spool = _h_spoolinit(heap, index, num_buckets);
    else
        buildstate.spool = NULL;

    /* prepare to build the index */
    buildstate.indtuples = 0;
    buildstate.heapRel = heap;

    /* do the heap scan */
    reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                                       hashbuildCallback,
                                       (void *) &buildstate, NULL);
    pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
                                 buildstate.indtuples);

    if (buildstate.spool)
    {
        /* sort the tuples and insert them into the index */
        _h_indexbuild(buildstate.spool, buildstate.heapRel);
        _h_spooldestroy(buildstate.spool);
    }

    /* Return statistics */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    return result;
}

/*
 * hashbuildempty() -- build an empty hash index in the initialization fork
 */
void
hashbuildempty(Relation index)
{
    _hash_init(index, 0, INIT_FORKNUM);
}

/*
 * Per-tuple callback for table_index_build_scan
 */
static void
hashbuildCallback(Relation index,
                  ItemPointer tid,
                  Datum *values,
                  bool *isnull,
                  bool tupleIsAlive,
                  void *state)
{
    HashBuildState *buildstate = (HashBuildState *) state;
    Datum       index_values[1];
    bool        index_isnull[1];
    IndexTuple  itup;

    /* convert data to a hash key; on failure, do not insert anything */
    if (!_hash_convert_tuple(index,
                             values, isnull,
                             index_values, index_isnull))
        return;

    /* Either spool the tuple for sorting, or just put it into the index */
    if (buildstate->spool)
        _h_spool(buildstate->spool, tid, index_values, index_isnull);
    else
    {
        /* form an index tuple and point it at the heap tuple */
        itup = index_form_tuple(RelationGetDescr(index),
                                index_values, index_isnull);
        itup->t_tid = *tid;
        _hash_doinsert(index, itup, buildstate->heapRel, false);
        pfree(itup);
    }

    buildstate->indtuples += 1;
}

/*
 * hashinsert() -- insert an index tuple into a hash table.
 *
 * Hash on the heap tuple's key, form an index tuple with the hash code.
 * Find the appropriate location for the new tuple, and put it there.
 */
bool
hashinsert(Relation rel, Datum *values, bool *isnull,
           ItemPointer ht_ctid, Relation heapRel,
           IndexUniqueCheck checkUnique,
           bool indexUnchanged,
           IndexInfo *indexInfo)
{
    Datum       index_values[1];
    bool        index_isnull[1];
    IndexTuple  itup;

    /* convert data to a hash key; on failure, do not insert anything */
    if (!_hash_convert_tuple(rel,
                             values, isnull,
                             index_values, index_isnull))
        return false;

    /* form an index tuple and point it at the heap tuple */
    itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
    itup->t_tid = *ht_ctid;

    _hash_doinsert(rel, itup, heapRel, false);

    pfree(itup);

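    /*
     * (Editorial note: the aminsert result is only significant for deferred
     * uniqueness checks, which hash indexes never perform since amcanunique
     * is false, so the value returned here is not meaningful to callers.)
     */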
    return false;
}


/*
 * hashgettuple() -- Get the next tuple in the scan.
 */
bool
hashgettuple(IndexScanDesc scan, ScanDirection dir)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    bool        res;

    /* Hash indexes are always lossy since we store only the hash code */
    scan->xs_recheck = true;

    /*
     * If we've already initialized this scan, we can just advance it in the
     * appropriate direction. If we haven't done so yet, we call a routine to
     * get the first item in the scan.
     */
    if (!HashScanPosIsValid(so->currPos))
        res = _hash_first(scan, dir);
    else
    {
        /*
         * Check to see if we should kill the previously-fetched tuple.
         */
        if (scan->kill_prior_tuple)
        {
            /*
             * Yes, so remember it for later. (We'll deal with all such
             * tuples at once right after leaving the index page or at end of
             * scan.) If the caller reverses the indexscan direction, it is
             * quite possible that the same item might get entered multiple
             * times. But, we don't detect that; instead, we just forget any
             * excess entries.
             */
            if (so->killedItems == NULL)
                so->killedItems = (int *)
                    palloc(MaxIndexTuplesPerPage * sizeof(int));

            if (so->numKilled < MaxIndexTuplesPerPage)
                so->killedItems[so->numKilled++] = so->currPos.itemIndex;
        }

        /*
         * Now continue the scan.
         */
        res = _hash_next(scan, dir);
    }

    return res;
}


/*
 * hashgetbitmap() -- get all tuples at once
 */
int64
hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    bool        res;
    int64       ntids = 0;
    HashScanPosItem *currItem;

    res = _hash_first(scan, ForwardScanDirection);

    while (res)
    {
        currItem = &so->currPos.items[so->currPos.itemIndex];

        /*
         * _hash_first and _hash_next handle eliminating dead index entries
         * whenever scan->ignore_killed_tuples is true. Therefore, there's
         * nothing to do here except add the results to the TIDBitmap.
         */
        tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
        ntids++;

        res = _hash_next(scan, ForwardScanDirection);
    }

    return ntids;
}


/*
 * hashbeginscan() -- start a scan on a hash index
 */
IndexScanDesc
hashbeginscan(Relation rel, int nkeys, int norderbys)
{
    IndexScanDesc scan;
    HashScanOpaque so;

    /* no order by operators allowed */
    Assert(norderbys == 0);

    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
    HashScanPosInvalidate(so->currPos);
    so->hashso_bucket_buf = InvalidBuffer;
    so->hashso_split_bucket_buf = InvalidBuffer;

    so->hashso_buc_populated = false;
    so->hashso_buc_split = false;

    so->killedItems = NULL;
    so->numKilled = 0;

    scan->opaque = so;

    return scan;
}

/*
 * hashrescan() -- rescan an index relation
 */
void
hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
           ScanKey orderbys, int norderbys)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    if (HashScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _hash_kill_items(scan);
    }

    _hash_dropscanbuf(rel, so);

    /* set position invalid (this will cause _hash_first call) */
    HashScanPosInvalidate(so->currPos);

    /* Update scan key, if a new one is given */
    if (scankey && scan->numberOfKeys > 0)
        memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));

    so->hashso_buc_populated = false;
    so->hashso_buc_split = false;
}

/*
 * hashendscan() -- close down a scan
 */
void
hashendscan(IndexScanDesc scan)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    if (HashScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _hash_kill_items(scan);
    }

    _hash_dropscanbuf(rel, so);

    if (so->killedItems != NULL)
        pfree(so->killedItems);
    pfree(so);
    scan->opaque = NULL;
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * This function also deletes the tuples that were moved by a split to
 * another bucket.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
               IndexBulkDeleteCallback callback, void *callback_state)
{
    Relation    rel = info->index;
    double      tuples_removed;
    double      num_index_tuples;
    double      orig_ntuples;
    Bucket      orig_maxbucket;
    Bucket      cur_maxbucket;
    Bucket      cur_bucket;
    Buffer      metabuf = InvalidBuffer;
    HashMetaPage metap;
    HashMetaPage cachedmetap;

    tuples_removed = 0;
    num_index_tuples = 0;

    /*
     * We need a copy of the metapage so that we can use its hashm_spares[]
     * values to compute bucket page addresses, but a cached copy should be
     * good enough. (If not, we'll detect that further down and refresh the
     * cache as necessary.)
     */
    cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
    Assert(cachedmetap != NULL);

    orig_maxbucket = cachedmetap->hashm_maxbucket;
    orig_ntuples = cachedmetap->hashm_ntuples;

    /* Scan the buckets that we know exist */
    cur_bucket = 0;
    cur_maxbucket = orig_maxbucket;

loop_top:
    while (cur_bucket <= cur_maxbucket)
    {
        BlockNumber bucket_blkno;
        BlockNumber blkno;
        Buffer      bucket_buf;
        Buffer      buf;
        HashPageOpaque bucket_opaque;
        Page        page;
        bool        split_cleanup = false;

        /* Get address of bucket's start page */
        bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);

        blkno = bucket_blkno;
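        /*
         * (Editorial note: BUCKET_TO_BLKNO maps a bucket number to the block
         * number of the bucket's primary page: block 0 is the metapage, and
         * the hashm_spares[] counts of overflow/bitmap pages allocated
         * before the bucket's splitpoint shift its position accordingly.)
         */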

        /*
         * We need to acquire a cleanup lock on the primary bucket page to
         * out-wait concurrent scans before deleting the dead tuples.
         */
        buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
        LockBufferForCleanup(buf);
        _hash_checkpage(rel, buf, LH_BUCKET_PAGE);

        page = BufferGetPage(buf);
        bucket_opaque = HashPageGetOpaque(page);

        /*
         * If the bucket contains tuples that were moved by a split, then we
         * need to delete such tuples. We can't delete such tuples if the
         * split operation on the bucket is not finished, as those are needed
         * by scans.
         */
        if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
            H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
        {
            split_cleanup = true;

            /*
             * This bucket might have been split since we last held a lock on
             * the metapage. If so, hashm_maxbucket, hashm_highmask and
             * hashm_lowmask might be old enough to cause us to fail to remove
             * tuples left behind by the most recent split. To prevent that,
             * now that the primary page of the target bucket has been locked
             * (and thus can't be further split), check whether we need to
             * update our cached metapage data.
             */
            Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
            if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
            {
                cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
                Assert(cachedmetap != NULL);
            }
        }

        bucket_buf = buf;

        hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
                          cachedmetap->hashm_maxbucket,
                          cachedmetap->hashm_highmask,
                          cachedmetap->hashm_lowmask, &tuples_removed,
                          &num_index_tuples, split_cleanup,
                          callback, callback_state);

        _hash_dropbuf(rel, bucket_buf);

        /* Advance to next bucket */
        cur_bucket++;
    }

    if (BufferIsInvalid(metabuf))
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);

    /* Write-lock metapage and check for split since we started */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    if (cur_maxbucket != metap->hashm_maxbucket)
    {
        /* There's been a split, so process the additional bucket(s) */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
        Assert(cachedmetap != NULL);
        cur_maxbucket = cachedmetap->hashm_maxbucket;
        goto loop_top;
    }

    /* Okay, we're really done. Update tuple count in metapage. */
    START_CRIT_SECTION();

    if (orig_maxbucket == metap->hashm_maxbucket &&
        orig_ntuples == metap->hashm_ntuples)
    {
        /*
         * No one has split or inserted anything since start of scan, so
         * believe our count as gospel.
         */
        metap->hashm_ntuples = num_index_tuples;
    }
    else
    {
        /*
         * Otherwise, our count is untrustworthy since we may have
         * double-scanned tuples in split buckets. Proceed by dead-reckoning.
         * (Note: we still return estimated_count = false, because using this
         * count is better than not updating reltuples at all.)
         */
        if (metap->hashm_ntuples > tuples_removed)
            metap->hashm_ntuples -= tuples_removed;
        else
            metap->hashm_ntuples = 0;
        num_index_tuples = metap->hashm_ntuples;
    }

    MarkBufferDirty(metabuf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_update_meta_page xlrec;
        XLogRecPtr  recptr;

        xlrec.ntuples = metap->hashm_ntuples;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);

        XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    _hash_relbuf(rel, metabuf);

    /* return statistics */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    stats->estimated_count = false;
    stats->num_index_tuples = num_index_tuples;
    stats->tuples_removed += tuples_removed;
    /* hashvacuumcleanup will fill in num_pages */

    return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
    Relation    rel = info->index;
    BlockNumber num_pages;

    /* If hashbulkdelete wasn't called, return NULL signifying no change */
    /* Note: this covers the analyze_only case too */
    if (stats == NULL)
        return NULL;

    /* update statistics */
    num_pages = RelationGetNumberOfBlocks(rel);
    stats->num_pages = num_pages;

    return stats;
}

/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page. The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * There can't be any concurrent scans in progress when we first enter this
 * function because of the cleanup lock we hold on the primary bucket page,
 * but as soon as we release that lock, there might be. If those scans got
 * ahead of our cleanup scan, they might see a tuple before we kill it and
 * wake up only after VACUUM has completed and the TID has been recycled for
 * an unrelated tuple. To avoid that calamity, we prevent scans from passing
 * our cleanup scan by locking the next page in the bucket chain before
 * releasing the lock on the previous page. (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
                  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
                  uint32 maxbucket, uint32 highmask, uint32 lowmask,
                  double *tuples_removed, double *num_index_tuples,
                  bool split_cleanup,
                  IndexBulkDeleteCallback callback, void *callback_state)
{
    BlockNumber blkno;
    Buffer      buf;
    Bucket      new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
    bool        bucket_dirty = false;

    blkno = bucket_blkno;
    buf = bucket_buf;

    if (split_cleanup)
        new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
                                                        lowmask, maxbucket);

    /* Scan each page in bucket */
    for (;;)
    {
        HashPageOpaque opaque;
        OffsetNumber offno;
        OffsetNumber maxoffno;
        Buffer      next_buf;
        Page        page;
        OffsetNumber deletable[MaxOffsetNumber];
        int         ndeletable = 0;
        bool        retain_pin = false;
        bool        clear_dead_marking = false;

        vacuum_delay_point();

        page = BufferGetPage(buf);
        opaque = HashPageGetOpaque(page);

        /* Scan each tuple in page */
        maxoffno = PageGetMaxOffsetNumber(page);
        for (offno = FirstOffsetNumber;
             offno <= maxoffno;
             offno = OffsetNumberNext(offno))
        {
            ItemPointer htup;
            IndexTuple  itup;
            Bucket      bucket;
            bool        kill_tuple = false;

            itup = (IndexTuple) PageGetItem(page,
                                            PageGetItemId(page, offno));
            htup = &(itup->t_tid);

            /*
             * To remove the dead tuples, we strictly want to rely on the
             * results of the callback function; see btvacuumpage for the
             * detailed reason.
             */
            if (callback && callback(htup, callback_state))
            {
                kill_tuple = true;
                if (tuples_removed)
                    *tuples_removed += 1;
            }
            else if (split_cleanup)
            {
                /* delete the tuples that were moved by a split. */
                bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                              maxbucket,
                                              highmask,
                                              lowmask);
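                /*
                 * (Illustrative example: _hash_hashkey2bucket masks the hash
                 * code with highmask and, if the result exceeds maxbucket,
                 * falls back to lowmask. With maxbucket = 5, highmask = 7,
                 * and lowmask = 3, a hash code ending in binary 110 maps
                 * first to 6; since 6 > 5, it is remapped to 6 & 3 = 2, the
                 * not-yet-split lower-numbered bucket.)
                 */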
                /* mark the item for deletion if it no longer belongs here */
                if (bucket != cur_bucket)
                {
                    /*
                     * We expect tuples to either belong to the current
                     * bucket or new_bucket. This is ensured because we don't
                     * allow further splits from a bucket that contains
                     * garbage. See comments in _hash_expandtable.
                     */
                    Assert(bucket == new_bucket);
                    kill_tuple = true;
                }
            }

            if (kill_tuple)
            {
                /* mark the item for deletion */
                deletable[ndeletable++] = offno;
            }
            else
            {
                /* we're keeping it, so count it */
                if (num_index_tuples)
                    *num_index_tuples += 1;
            }
        }

        /* retain the pin on primary bucket page till end of bucket scan */
        if (blkno == bucket_blkno)
            retain_pin = true;
        else
            retain_pin = false;

        blkno = opaque->hasho_nextblkno;

        /*
         * Apply deletions, advance to next page and write page if needed.
         */
        if (ndeletable > 0)
        {
            /* No ereport(ERROR) until changes are logged */
            START_CRIT_SECTION();

            PageIndexMultiDelete(page, deletable, ndeletable);
            bucket_dirty = true;

            /*
             * Mark the page as clean if vacuum removes the DEAD tuples from
             * an index page. We do this by clearing the
             * LH_PAGE_HAS_DEAD_TUPLES flag.
             */
            if (tuples_removed && *tuples_removed > 0 &&
                H_HAS_DEAD_TUPLES(opaque))
            {
                opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
                clear_dead_marking = true;
            }

            MarkBufferDirty(buf);

            /* XLOG stuff */
            if (RelationNeedsWAL(rel))
            {
                xl_hash_delete xlrec;
                XLogRecPtr  recptr;

                xlrec.clear_dead_marking = clear_dead_marking;
                xlrec.is_primary_bucket_page = (buf == bucket_buf);

                XLogBeginInsert();
                XLogRegisterData((char *) &xlrec, SizeOfHashDelete);

                /*
                 * bucket buffer was not changed, but still needs to be
                 * registered to ensure that we can acquire a cleanup lock on
                 * it during replay.
                 */
                if (!xlrec.is_primary_bucket_page)
                {
                    uint8       flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;

                    XLogRegisterBuffer(0, bucket_buf, flags);
                }

                XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
                XLogRegisterBufData(1, (char *) deletable,
                                    ndeletable * sizeof(OffsetNumber));

                recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
                PageSetLSN(BufferGetPage(buf), recptr);
            }

            END_CRIT_SECTION();
        }

        /* bail out if there are no more pages to scan. */
        if (!BlockNumberIsValid(blkno))
            break;

        next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                              LH_OVERFLOW_PAGE,
                                              bstrategy);

        /*
         * release the lock on the previous page only after acquiring the
         * lock on the next page
         */
        if (retain_pin)
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        else
            _hash_relbuf(rel, buf);

        buf = next_buf;
    }

    /*
     * Lock the bucket page to clear the garbage flag and squeeze the bucket.
     * If the current buffer is the same as the bucket buffer, then we
     * already have a lock on the bucket page.
     */
    if (buf != bucket_buf)
    {
        _hash_relbuf(rel, buf);
        LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
    }

    /*
     * Clear the garbage flag from the bucket after deleting the tuples that
     * were moved by a split. We purposefully clear the flag before squeezing
     * the bucket, so that after a restart, vacuum won't again try to delete
     * the moved-by-split tuples.
     */
    if (split_cleanup)
    {
        HashPageOpaque bucket_opaque;
        Page        page;

        page = BufferGetPage(bucket_buf);
        bucket_opaque = HashPageGetOpaque(page);

        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
        MarkBufferDirty(bucket_buf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            XLogRecPtr  recptr;

            XLogBeginInsert();
            XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
            PageSetLSN(page, recptr);
        }

        END_CRIT_SECTION();
    }

    /*
     * If we have deleted anything, try to compact free space. For squeezing
     * the bucket, we must have a cleanup lock, else it can impact the
     * ordering of tuples for a scan that has started before it.
     */
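    /*
     * (Editorial note: _hash_squeezebucket moves tuples from later overflow
     * pages into free space on earlier pages, which reorders tuples within
     * the bucket. A hash scan keeps a pin on the primary bucket page for
     * its whole visit, so holding the sole pin, which IsBufferCleanupOK
     * verifies, shows that no scan is partway through this bucket.)
     */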
    if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
        _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
                            bstrategy);
    else
        LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}