PostgreSQL Source Code  git master
hash.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * hash.c
4  * Implementation of Margo Seltzer's Hashing package for postgres.
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/hash/hash.c
12  *
13  * NOTES
14  * This file contains only the public interface routines.
15  *
16  *-------------------------------------------------------------------------
17  */
18 
19 #include "postgres.h"
20 
21 #include "access/hash.h"
22 #include "access/hash_xlog.h"
23 #include "access/relscan.h"
24 #include "access/tableam.h"
25 #include "catalog/index.h"
26 #include "commands/progress.h"
27 #include "commands/vacuum.h"
28 #include "miscadmin.h"
29 #include "optimizer/plancat.h"
30 #include "pgstat.h"
31 #include "utils/builtins.h"
32 #include "utils/index_selfuncs.h"
33 #include "utils/rel.h"
34 
35 /* Working state for hashbuild and its callback */
36 typedef struct
37 {
38  HSpool *spool; /* NULL if not using spooling */
39  double indtuples; /* # tuples accepted into index */
40  Relation heapRel; /* heap relation descriptor */
42 
/* Forward declaration of the per-tuple callback for table_index_build_scan */
43 static void hashbuildCallback(Relation index,
44  ItemPointer tid,
45  Datum *values,
46  bool *isnull,
47  bool tupleIsAlive,
48  void *state);
49 
50 
51 /*
52  * Hash handler function: return IndexAmRoutine with access method parameters
53  * and callbacks.
54  */
55 Datum
57 {
59 
60  amroutine->amstrategies = HTMaxStrategyNumber;
61  amroutine->amsupport = HASHNProcs;
62  amroutine->amoptsprocnum = HASHOPTIONS_PROC;
63  amroutine->amcanorder = false;
64  amroutine->amcanorderbyop = false;
65  amroutine->amcanbackward = true;
66  amroutine->amcanunique = false;
67  amroutine->amcanmulticol = false;
68  amroutine->amoptionalkey = false;
69  amroutine->amsearcharray = false;
70  amroutine->amsearchnulls = false;
71  amroutine->amstorage = false;
72  amroutine->amclusterable = false;
73  amroutine->ampredlocks = true;
74  amroutine->amcanparallel = false;
75  amroutine->amcaninclude = false;
76  amroutine->amusemaintenanceworkmem = false;
77  amroutine->amparallelvacuumoptions =
79  amroutine->amkeytype = INT4OID;
80 
81  amroutine->ambuild = hashbuild;
82  amroutine->ambuildempty = hashbuildempty;
83  amroutine->aminsert = hashinsert;
84  amroutine->ambulkdelete = hashbulkdelete;
86  amroutine->amcanreturn = NULL;
87  amroutine->amcostestimate = hashcostestimate;
88  amroutine->amoptions = hashoptions;
89  amroutine->amproperty = NULL;
90  amroutine->ambuildphasename = NULL;
91  amroutine->amvalidate = hashvalidate;
93  amroutine->ambeginscan = hashbeginscan;
94  amroutine->amrescan = hashrescan;
95  amroutine->amgettuple = hashgettuple;
96  amroutine->amgetbitmap = hashgetbitmap;
97  amroutine->amendscan = hashendscan;
98  amroutine->ammarkpos = NULL;
99  amroutine->amrestrpos = NULL;
100  amroutine->amestimateparallelscan = NULL;
101  amroutine->aminitparallelscan = NULL;
102  amroutine->amparallelrescan = NULL;
103 
104  PG_RETURN_POINTER(amroutine);
105 }
106 
107 /*
108  * hashbuild() -- build a new hash index.
109  */
112 {
113  IndexBuildResult *result;
114  BlockNumber relpages;
115  double reltuples;
116  double allvisfrac;
117  uint32 num_buckets;
118  long sort_threshold;
119  HashBuildState buildstate;
120 
121  /*
122  * We expect to be called exactly once for any index relation. If that's
123  * not the case, big trouble's what we have.
124  */
125  if (RelationGetNumberOfBlocks(index) != 0)
126  elog(ERROR, "index \"%s\" already contains data",
127  RelationGetRelationName(index));
128 
129  /* Estimate the number of rows currently present in the table */
130  estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
131 
132  /* Initialize the hash index metadata page and initial buckets */
133  num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
134 
135  /*
136  * If we just insert the tuples into the index in scan order, then
137  * (assuming their hash codes are pretty random) there will be no locality
138  * of access to the index, and if the index is bigger than available RAM
139  * then we'll thrash horribly. To prevent that scenario, we can sort the
140  * tuples by (expected) bucket number. However, such a sort is useless
141  * overhead when the index does fit in RAM. We choose to sort if the
142  * initial index size exceeds maintenance_work_mem, or the number of
143  * buffers usable for the index, whichever is less. (Limiting by the
144  * number of buffers should reduce thrashing between PG buffers and kernel
145  * buffers, which seems useful even if no physical I/O results. Limiting
146  * by maintenance_work_mem is useful to allow easy testing of the sort
147  * code path, and may be useful to DBAs as an additional control knob.)
148  *
149  * NOTE: this test will need adjustment if a bucket is ever different from
150  * one page. Also, "initial index size" accounting does not include the
151  * metapage, nor the first bitmap page.
152  */
153  sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
154  if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
155  sort_threshold = Min(sort_threshold, NBuffers);
156  else
157  sort_threshold = Min(sort_threshold, NLocBuffer);
158 
159  if (num_buckets >= (uint32) sort_threshold)
160  buildstate.spool = _h_spoolinit(heap, index, num_buckets);
161  else
162  buildstate.spool = NULL;
163 
164  /* prepare to build the index */
165  buildstate.indtuples = 0;
166  buildstate.heapRel = heap;
167 
168  /* do the heap scan */
169  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
171  (void *) &buildstate, NULL);
173  buildstate.indtuples);
174 
175  if (buildstate.spool)
176  {
177  /* sort the tuples and insert them into the index */
178  _h_indexbuild(buildstate.spool, buildstate.heapRel);
179  _h_spooldestroy(buildstate.spool);
180  }
181 
182  /*
183  * Return statistics
184  */
185  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
186 
187  result->heap_tuples = reltuples;
188  result->index_tuples = buildstate.indtuples;
189 
190  return result;
191 }
192 
193 /*
194  * hashbuildempty() -- build an empty hash index in the initialization fork
195  */
196 void
198 {
199  _hash_init(index, 0, INIT_FORKNUM);
200 }
201 
202 /*
203  * Per-tuple callback for table_index_build_scan
204  */
205 static void
207  ItemPointer tid,
208  Datum *values,
209  bool *isnull,
210  bool tupleIsAlive,
211  void *state)
212 {
213  HashBuildState *buildstate = (HashBuildState *) state;
214  Datum index_values[1];
215  bool index_isnull[1];
216  IndexTuple itup;
217 
218  /* convert data to a hash key; on failure, do not insert anything */
219  if (!_hash_convert_tuple(index,
220  values, isnull,
221  index_values, index_isnull))
222  return;
223 
224  /* Either spool the tuple for sorting, or just put it into the index */
225  if (buildstate->spool)
226  _h_spool(buildstate->spool, tid, index_values, index_isnull);
227  else
228  {
229  /* form an index tuple and point it at the heap tuple */
230  itup = index_form_tuple(RelationGetDescr(index),
231  index_values, index_isnull);
232  itup->t_tid = *tid;
233  _hash_doinsert(index, itup, buildstate->heapRel);
234  pfree(itup);
235  }
236 
237  buildstate->indtuples += 1;
238 }
239 
240 /*
241  * hashinsert() -- insert an index tuple into a hash table.
242  *
243  * Hash on the heap tuple's key, form an index tuple with hash code.
244  * Find the appropriate location for the new tuple, and put it there.
 *
 * rel is the index relation being inserted into.  values/isnull are the
 * caller-supplied column data handed to _hash_convert_tuple.  ht_ctid is
 * copied into the new index tuple's t_tid.  heapRel is passed through to
 * _hash_doinsert unchanged.  checkUnique and indexInfo are accepted but
 * not referenced in this function.
 *
 * Returns false on every path: both when _hash_convert_tuple reports
 * failure (nothing is inserted) and after a completed insertion.
245  */
246 bool
247 hashinsert(Relation rel, Datum *values, bool *isnull,
248  ItemPointer ht_ctid, Relation heapRel,
249  IndexUniqueCheck checkUnique,
250  IndexInfo *indexInfo)
251 {
 /* one-element output arrays filled in by _hash_convert_tuple */
252  Datum index_values[1];
253  bool index_isnull[1];
254  IndexTuple itup;
255 
256  /* convert data to a hash key; on failure, do not insert anything */
257  if (!_hash_convert_tuple(rel,
258  values, isnull,
259  index_values, index_isnull))
260  return false;
261 
262  /* form an index tuple and point it at the heap tuple */
263  itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
264  itup->t_tid = *ht_ctid;
265 
 /* hand the tuple to the hash insertion routine */
266  _hash_doinsert(rel, itup, heapRel);
267 
 /* free the index tuple formed above; it is not used after the insert */
268  pfree(itup);
269 
 /* same result as the conversion-failure path above */
270  return false;
271 }
272 
273 
274 /*
275  * hashgettuple() -- Get the next tuple in the scan.
276  */
277 bool
279 {
280  HashScanOpaque so = (HashScanOpaque) scan->opaque;
281  bool res;
282 
283  /* Hash indexes are always lossy since we store only the hash code */
284  scan->xs_recheck = true;
285 
286  /*
287  * If we've already initialized this scan, we can just advance it in the
288  * appropriate direction. If we haven't done so yet, we call a routine to
289  * get the first item in the scan.
290  */
291  if (!HashScanPosIsValid(so->currPos))
292  res = _hash_first(scan, dir);
293  else
294  {
295  /*
296  * Check to see if we should kill the previously-fetched tuple.
297  */
298  if (scan->kill_prior_tuple)
299  {
300  /*
301  * Yes, so remember it for later. (We'll deal with all such tuples
302  * at once right after leaving the index page or at end of scan.)
303  * If the caller reverses the indexscan direction, it is quite
304  * possible that the same item might get entered multiple times.
305  * We don't detect that; instead, we just forget any excess
306  * entries.
307  */
308  if (so->killedItems == NULL)
309  so->killedItems = (int *)
310  palloc(MaxIndexTuplesPerPage * sizeof(int));
311 
313  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
314  }
315 
316  /*
317  * Now continue the scan.
318  */
319  res = _hash_next(scan, dir);
320  }
321 
322  return res;
323 }
324 
325 
326 /*
327  * hashgetbitmap() -- get all tuples at once
328  */
329 int64
331 {
332  HashScanOpaque so = (HashScanOpaque) scan->opaque;
333  bool res;
334  int64 ntids = 0;
335  HashScanPosItem *currItem;
336 
337  res = _hash_first(scan, ForwardScanDirection);
338 
339  while (res)
340  {
341  currItem = &so->currPos.items[so->currPos.itemIndex];
342 
343  /*
344  * _hash_first and _hash_next eliminate dead index entries
345  * whenever scan->ignore_killed_tuples is true. Therefore, there's
346  * nothing to do here except add the results to the TIDBitmap.
347  */
348  tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
349  ntids++;
350 
351  res = _hash_next(scan, ForwardScanDirection);
352  }
353 
354  return ntids;
355 }
356 
357 
358 /*
359  * hashbeginscan() -- start a scan on a hash index
360  */
362 hashbeginscan(Relation rel, int nkeys, int norderbys)
363 {
364  IndexScanDesc scan;
365  HashScanOpaque so;
366 
367  /* no order by operators allowed */
368  Assert(norderbys == 0);
369 
370  scan = RelationGetIndexScan(rel, nkeys, norderbys);
371 
372  so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
376 
377  so->hashso_buc_populated = false;
378  so->hashso_buc_split = false;
379 
380  so->killedItems = NULL;
381  so->numKilled = 0;
382 
383  scan->opaque = so;
384 
385  return scan;
386 }
387 
388 /*
389  * hashrescan() -- rescan an index relation
390  */
391 void
392 hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
393  ScanKey orderbys, int norderbys)
394 {
395  HashScanOpaque so = (HashScanOpaque) scan->opaque;
396  Relation rel = scan->indexRelation;
397 
398  if (HashScanPosIsValid(so->currPos))
399  {
400  /* Before leaving current page, deal with any killed items */
401  if (so->numKilled > 0)
402  _hash_kill_items(scan);
403  }
404 
405  _hash_dropscanbuf(rel, so);
406 
407  /* set position invalid (this will cause _hash_first call) */
409 
410  /* Update scan key, if a new one is given */
411  if (scankey && scan->numberOfKeys > 0)
412  {
413  memmove(scan->keyData,
414  scankey,
415  scan->numberOfKeys * sizeof(ScanKeyData));
416  }
417 
418  so->hashso_buc_populated = false;
419  so->hashso_buc_split = false;
420 }
421 
422 /*
423  * hashendscan() -- close down a scan
424  */
425 void
427 {
428  HashScanOpaque so = (HashScanOpaque) scan->opaque;
429  Relation rel = scan->indexRelation;
430 
431  if (HashScanPosIsValid(so->currPos))
432  {
433  /* Before leaving current page, deal with any killed items */
434  if (so->numKilled > 0)
435  _hash_kill_items(scan);
436  }
437 
438  _hash_dropscanbuf(rel, so);
439 
440  if (so->killedItems != NULL)
441  pfree(so->killedItems);
442  pfree(so);
443  scan->opaque = NULL;
444 }
445 
446 /*
447  * Bulk deletion of all index entries pointing to a set of heap tuples.
448  * The set of target tuples is specified via a callback routine that tells
449  * whether any given heap tuple (identified by ItemPointer) is being deleted.
450  *
451  * This function also deletes the tuples that are moved by split to another
452  * bucket.
453  *
454  * Result: a palloc'd struct containing statistical info for VACUUM displays.
455  */
458  IndexBulkDeleteCallback callback, void *callback_state)
459 {
460  Relation rel = info->index;
461  double tuples_removed;
462  double num_index_tuples;
463  double orig_ntuples;
464  Bucket orig_maxbucket;
465  Bucket cur_maxbucket;
466  Bucket cur_bucket;
467  Buffer metabuf = InvalidBuffer;
468  HashMetaPage metap;
469  HashMetaPage cachedmetap;
470 
471  tuples_removed = 0;
472  num_index_tuples = 0;
473 
474  /*
475  * We need a copy of the metapage so that we can use its hashm_spares[]
476  * values to compute bucket page addresses, but a cached copy should be
477  * good enough. (If not, we'll detect that further down and refresh the
478  * cache as necessary.)
479  */
480  cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
481  Assert(cachedmetap != NULL);
482 
483  orig_maxbucket = cachedmetap->hashm_maxbucket;
484  orig_ntuples = cachedmetap->hashm_ntuples;
485 
486  /* Scan the buckets that we know exist */
487  cur_bucket = 0;
488  cur_maxbucket = orig_maxbucket;
489 
490 loop_top:
491  while (cur_bucket <= cur_maxbucket)
492  {
493  BlockNumber bucket_blkno;
495  Buffer bucket_buf;
496  Buffer buf;
497  HashPageOpaque bucket_opaque;
498  Page page;
499  bool split_cleanup = false;
500 
501  /* Get address of bucket's start page */
502  bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
503 
504  blkno = bucket_blkno;
505 
506  /*
507  * We need to acquire a cleanup lock on the primary bucket page to wait
508  * out concurrent scans before deleting the dead tuples.
509  */
510  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
512  _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
513 
514  page = BufferGetPage(buf);
515  bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
516 
517  /*
518  * If the bucket contains tuples that are moved by split, then we need
519  * to delete such tuples. We can't delete such tuples if the split
520  * operation on bucket is not finished as those are needed by scans.
521  */
522  if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
523  H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
524  {
525  split_cleanup = true;
526 
527  /*
528  * This bucket might have been split since we last held a lock on
529  * the metapage. If so, hashm_maxbucket, hashm_highmask and
530  * hashm_lowmask might be old enough to cause us to fail to remove
531  * tuples left behind by the most recent split. To prevent that,
532  * now that the primary page of the target bucket has been locked
533  * (and thus can't be further split), check whether we need to
534  * update our cached metapage data.
535  */
536  Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
537  if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
538  {
539  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
540  Assert(cachedmetap != NULL);
541  }
542  }
543 
544  bucket_buf = buf;
545 
546  hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
547  cachedmetap->hashm_maxbucket,
548  cachedmetap->hashm_highmask,
549  cachedmetap->hashm_lowmask, &tuples_removed,
550  &num_index_tuples, split_cleanup,
551  callback, callback_state);
552 
553  _hash_dropbuf(rel, bucket_buf);
554 
555  /* Advance to next bucket */
556  cur_bucket++;
557  }
558 
559  if (BufferIsInvalid(metabuf))
561 
562  /* Write-lock metapage and check for split since we started */
564  metap = HashPageGetMeta(BufferGetPage(metabuf));
565 
566  if (cur_maxbucket != metap->hashm_maxbucket)
567  {
568  /* There's been a split, so process the additional bucket(s) */
569  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
570  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
571  Assert(cachedmetap != NULL);
572  cur_maxbucket = cachedmetap->hashm_maxbucket;
573  goto loop_top;
574  }
575 
576  /* Okay, we're really done. Update tuple count in metapage. */
578 
579  if (orig_maxbucket == metap->hashm_maxbucket &&
580  orig_ntuples == metap->hashm_ntuples)
581  {
582  /*
583  * No one has split or inserted anything since start of scan, so
584  * believe our count as gospel.
585  */
586  metap->hashm_ntuples = num_index_tuples;
587  }
588  else
589  {
590  /*
591  * Otherwise, our count is untrustworthy since we may have
592  * double-scanned tuples in split buckets. Proceed by dead-reckoning.
593  * (Note: we still return estimated_count = false, because using this
594  * count is better than not updating reltuples at all.)
595  */
596  if (metap->hashm_ntuples > tuples_removed)
597  metap->hashm_ntuples -= tuples_removed;
598  else
599  metap->hashm_ntuples = 0;
600  num_index_tuples = metap->hashm_ntuples;
601  }
602 
603  MarkBufferDirty(metabuf);
604 
605  /* XLOG stuff */
606  if (RelationNeedsWAL(rel))
607  {
609  XLogRecPtr recptr;
610 
611  xlrec.ntuples = metap->hashm_ntuples;
612 
613  XLogBeginInsert();
614  XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
615 
616  XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
617 
618  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
619  PageSetLSN(BufferGetPage(metabuf), recptr);
620  }
621 
623 
624  _hash_relbuf(rel, metabuf);
625 
626  /* return statistics */
627  if (stats == NULL)
629  stats->estimated_count = false;
630  stats->num_index_tuples = num_index_tuples;
631  stats->tuples_removed += tuples_removed;
632  /* hashvacuumcleanup will fill in num_pages */
633 
634  return stats;
635 }
636 
637 /*
638  * Post-VACUUM cleanup.
639  *
640  * Result: a palloc'd struct containing statistical info for VACUUM displays.
641  */
644 {
645  Relation rel = info->index;
646  BlockNumber num_pages;
647 
648  /* If hashbulkdelete wasn't called, return NULL signifying no change */
649  /* Note: this covers the analyze_only case too */
650  if (stats == NULL)
651  return NULL;
652 
653  /* update statistics */
654  num_pages = RelationGetNumberOfBlocks(rel);
655  stats->num_pages = num_pages;
656 
657  return stats;
658 }
659 
660 /*
661  * Helper function to perform deletion of index entries from a bucket.
662  *
663  * This function expects that the caller has acquired a cleanup lock on the
664  * primary bucket page, and will return with a write lock again held on the
665  * primary bucket page. The lock won't necessarily be held continuously,
666  * though, because we'll release it when visiting overflow pages.
667  *
668  * There can't be any concurrent scans in progress when we first enter this
669  * function because of the cleanup lock we hold on the primary bucket page,
670  * but as soon as we release that lock, there might be. If those scans got
671  * ahead of our cleanup scan, they might see a tuple before we kill it and
672  * wake up only after VACUUM has completed and the TID has been recycled for
673  * an unrelated tuple. To avoid that calamity, we prevent scans from passing
674  * our cleanup scan by locking the next page in the bucket chain before
675  * releasing the lock on the previous page. (This type of lock chaining is not
676  * ideal, so we might want to look for a better solution at some point.)
677  *
678  * We need to retain a pin on the primary bucket to ensure that no concurrent
679  * split can start.
680  */
681 void
682 hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
683  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
684  uint32 maxbucket, uint32 highmask, uint32 lowmask,
685  double *tuples_removed, double *num_index_tuples,
686  bool split_cleanup,
687  IndexBulkDeleteCallback callback, void *callback_state)
688 {
690  Buffer buf;
692  bool bucket_dirty = false;
693 
694  blkno = bucket_blkno;
695  buf = bucket_buf;
696 
697  if (split_cleanup)
698  new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
699  lowmask, maxbucket);
700 
701  /* Scan each page in bucket */
702  for (;;)
703  {
704  HashPageOpaque opaque;
705  OffsetNumber offno;
706  OffsetNumber maxoffno;
707  Buffer next_buf;
708  Page page;
709  OffsetNumber deletable[MaxOffsetNumber];
710  int ndeletable = 0;
711  bool retain_pin = false;
712  bool clear_dead_marking = false;
713 
715 
716  page = BufferGetPage(buf);
717  opaque = (HashPageOpaque) PageGetSpecialPointer(page);
718 
719  /* Scan each tuple in page */
720  maxoffno = PageGetMaxOffsetNumber(page);
721  for (offno = FirstOffsetNumber;
722  offno <= maxoffno;
723  offno = OffsetNumberNext(offno))
724  {
725  ItemPointer htup;
726  IndexTuple itup;
727  Bucket bucket;
728  bool kill_tuple = false;
729 
730  itup = (IndexTuple) PageGetItem(page,
731  PageGetItemId(page, offno));
732  htup = &(itup->t_tid);
733 
734  /*
735  * To remove the dead tuples, we strictly want to rely on the results
736  * of the callback function. Refer to btvacuumpage for the detailed reason.
737  */
738  if (callback && callback(htup, callback_state))
739  {
740  kill_tuple = true;
741  if (tuples_removed)
742  *tuples_removed += 1;
743  }
744  else if (split_cleanup)
745  {
746  /* delete the tuples that are moved by split. */
748  maxbucket,
749  highmask,
750  lowmask);
751  /* mark the item for deletion */
752  if (bucket != cur_bucket)
753  {
754  /*
755  * We expect tuples to either belong to current bucket or
756  * new_bucket. This is ensured because we don't allow
757  * further splits from bucket that contains garbage. See
758  * comments in _hash_expandtable.
759  */
760  Assert(bucket == new_bucket);
761  kill_tuple = true;
762  }
763  }
764 
765  if (kill_tuple)
766  {
767  /* mark the item for deletion */
768  deletable[ndeletable++] = offno;
769  }
770  else
771  {
772  /* we're keeping it, so count it */
773  if (num_index_tuples)
774  *num_index_tuples += 1;
775  }
776  }
777 
778  /* retain the pin on primary bucket page till end of bucket scan */
779  if (blkno == bucket_blkno)
780  retain_pin = true;
781  else
782  retain_pin = false;
783 
784  blkno = opaque->hasho_nextblkno;
785 
786  /*
787  * Apply deletions, advance to next page and write page if needed.
788  */
789  if (ndeletable > 0)
790  {
791  /* No ereport(ERROR) until changes are logged */
793 
794  PageIndexMultiDelete(page, deletable, ndeletable);
795  bucket_dirty = true;
796 
797  /*
798  * Let us mark the page as clean if vacuum removes the DEAD tuples
799  * from an index page. We do this by clearing
800  * LH_PAGE_HAS_DEAD_TUPLES flag.
801  */
802  if (tuples_removed && *tuples_removed > 0 &&
803  H_HAS_DEAD_TUPLES(opaque))
804  {
806  clear_dead_marking = true;
807  }
808 
809  MarkBufferDirty(buf);
810 
811  /* XLOG stuff */
812  if (RelationNeedsWAL(rel))
813  {
814  xl_hash_delete xlrec;
815  XLogRecPtr recptr;
816 
817  xlrec.clear_dead_marking = clear_dead_marking;
818  xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;
819 
820  XLogBeginInsert();
821  XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
822 
823  /*
824  * bucket buffer needs to be registered to ensure that we can
825  * acquire a cleanup lock on it during replay.
826  */
827  if (!xlrec.is_primary_bucket_page)
829 
831  XLogRegisterBufData(1, (char *) deletable,
832  ndeletable * sizeof(OffsetNumber));
833 
834  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
835  PageSetLSN(BufferGetPage(buf), recptr);
836  }
837 
839  }
840 
841  /* bail out if there are no more pages to scan. */
842  if (!BlockNumberIsValid(blkno))
843  break;
844 
845  next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
847  bstrategy);
848 
849  /*
850  * release the lock on previous page after acquiring the lock on next
851  * page
852  */
853  if (retain_pin)
855  else
856  _hash_relbuf(rel, buf);
857 
858  buf = next_buf;
859  }
860 
861  /*
862  * Lock the bucket page to clear the garbage flag and squeeze the bucket.
863  * If the current buffer is the same as the bucket buffer, then we already
864  * hold the lock on the bucket page.
865  */
866  if (buf != bucket_buf)
867  {
868  _hash_relbuf(rel, buf);
869  LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
870  }
871 
872  /*
873  * Clear the garbage flag from bucket after deleting the tuples that are
874  * moved by split. We purposefully clear the flag before squeeze bucket,
875  * so that after restart, vacuum shouldn't again try to delete the moved
876  * by split tuples.
877  */
878  if (split_cleanup)
879  {
880  HashPageOpaque bucket_opaque;
881  Page page;
882 
883  page = BufferGetPage(bucket_buf);
884  bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
885 
886  /* No ereport(ERROR) until changes are logged */
888 
889  bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
890  MarkBufferDirty(bucket_buf);
891 
892  /* XLOG stuff */
893  if (RelationNeedsWAL(rel))
894  {
895  XLogRecPtr recptr;
896 
897  XLogBeginInsert();
898  XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
899 
900  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
901  PageSetLSN(page, recptr);
902  }
903 
905  }
906 
907  /*
908  * If we have deleted anything, try to compact free space. For squeezing
909  * the bucket, we must have a cleanup lock, else it can impact the
910  * ordering of tuples for a scan that has started before it.
911  */
912  if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
913  _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
914  bstrategy);
915  else
916  LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
917 }
ambeginscan_function ambeginscan
Definition: amapi.h:270
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:368
uint8 amparallelvacuumoptions
Definition: amapi.h:247
bytea * hashoptions(Datum reloptions, bool validate)
Definition: hashutil.c:276
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:360
HashScanOpaqueData * HashScanOpaque
Definition: hash.h:190
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:96
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:3814
ambulkdelete_function ambulkdelete
Definition: amapi.h:261
bool hashgettuple(IndexScanDesc scan, ScanDirection dir)
Definition: hash.c:278
bool amcanmulticol
Definition: amapi.h:227
void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
Definition: hashinsert.c:36
uint16 amsupport
Definition: amapi.h:215
double tuples_removed
Definition: genam.h:78
#define HTMaxStrategyNumber
Definition: stratnum.h:43
void _hash_dropscanbuf(Relation rel, HashScanOpaque so)
Definition: hashpage.c:288
#define HASHNProcs
Definition: hash.h:356
amgettuple_function amgettuple
Definition: amapi.h:272
#define RelationGetDescr(relation)
Definition: rel.h:482
Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask)
Definition: hashutil.c:126
bool amcanorderbyop
Definition: amapi.h:221
#define LH_META_PAGE
Definition: hash.h:57
IndexBulkDeleteResult * hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition: hash.c:457
amproperty_function amproperty
Definition: amapi.h:266
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1469
#define SizeOfHashDelete
Definition: hash_xlog.h:192
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:220
#define HashScanPosIsValid(scanpos)
Definition: hash.h:135
#define MaxOffsetNumber
Definition: off.h:28
Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, int access, int flags, BufferAccessStrategy bstrategy)
Definition: hashpage.c:238
void pgstat_progress_update_param(int index, int64 val)
Definition: pgstat.c:3231
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:376
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:653
ItemPointerData t_tid
Definition: itup.h:37
amparallelrescan_function amparallelrescan
Definition: amapi.h:281
#define Min(x, y)
Definition: c.h:927
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
BufferAccessStrategy strategy
Definition: genam.h:52
void hashcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition: selfuncs.c:6543
bool amstorage
Definition: amapi.h:235
double indtuples
Definition: hash.c:39
uint32 hashm_highmask
Definition: hash.h:253
#define InvalidBuffer
Definition: buf.h:25
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
Relation index
Definition: genam.h:46
#define XLOG_HASH_SPLIT_CLEANUP
Definition: hash_xlog.h:40
bool ampredlocks
Definition: amapi.h:239
bool clear_dead_marking
Definition: hash_xlog.h:186
void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, uint32 maxbucket, uint32 highmask, uint32 lowmask, double *tuples_removed, double *num_index_tuples, bool split_cleanup, IndexBulkDeleteCallback callback, void *callback_state)
Definition: hash.c:682
#define PROGRESS_CREATEIDX_TUPLES_TOTAL
Definition: progress.h:84
uint32 BlockNumber
Definition: block.h:31
IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys)
Definition: hash.c:362
aminsert_function aminsert
Definition: amapi.h:260
void _hash_dropbuf(Relation rel, Buffer buf)
Definition: hashpage.c:276
Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
Definition: hashpage.c:69
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:98
bool hashinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, IndexInfo *indexInfo)
Definition: hash.c:247
#define LH_BUCKET_NEEDS_SPLIT_CLEANUP
Definition: hash.h:60
Form_pg_class rd_rel
Definition: rel.h:109
Oid amkeytype
Definition: amapi.h:249
uint32 hashm_lowmask
Definition: hash.h:254
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
bool amoptionalkey
Definition: amapi.h:229
amvalidate_function amvalidate
Definition: amapi.h:268
void hashbuildempty(Relation index)
Definition: hash.c:197
#define BUCKET_TO_BLKNO(metap, B)
Definition: hash.h:39
Relation indexRelation
Definition: relscan.h:115
uint16 OffsetNumber
Definition: off.h:24
Definition: type.h:89
uint32 Bucket
Definition: hash.h:35
IndexUniqueCheck
Definition: genam.h:112
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:47
int * killedItems
Definition: hash.h:180
void pfree(void *pointer)
Definition: mcxt.c:1056
#define InvalidBucket
Definition: hash.h:37
amgetbitmap_function amgetbitmap
Definition: amapi.h:273
#define H_NEEDS_SPLIT_CLEANUP(opaque)
Definition: hash.h:88
BlockNumber hasho_prevblkno
Definition: hash.h:79
#define ERROR
Definition: elog.h:43
ambuild_function ambuild
Definition: amapi.h:258
uint32 _hash_get_indextuple_hashkey(IndexTuple itup)
Definition: hashutil.c:292
bool _hash_first(IndexScanDesc scan, ScanDirection dir)
Definition: hashsearch.c:292
amoptions_function amoptions
Definition: amapi.h:265
static double table_index_build_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1552
IndexBuildResult * hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: hash.c:111
BlockNumber num_pages
Definition: genam.h:74
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
Definition: test_ifaddrs.c:48
bool amcaninclude
Definition: amapi.h:243
int NLocBuffer
Definition: localbuf.c:41
bool _hash_convert_tuple(Relation index, Datum *user_values, bool *user_isnull, Datum *index_values, bool *index_isnull)
Definition: hashutil.c:319
BlockNumber blkno
Definition: ginvacuum.c:119
amcostestimate_function amcostestimate
Definition: amapi.h:264
bool amcanunique
Definition: amapi.h:225
bool is_primary_bucket_page
Definition: hash_xlog.h:188
bool _hash_next(IndexScanDesc scan, ScanDirection dir)
Definition: hashsearch.c:48
#define BufferIsInvalid(buffer)
Definition: buf.h:31
static char * buf
Definition: pg_test_fsync.c:67
amvacuumcleanup_function amvacuumcleanup
Definition: amapi.h:262
#define HASH_WRITE
Definition: hash.h:338
amendscan_function amendscan
Definition: amapi.h:274
#define HASH_NOLOCK
Definition: hash.h:339
bool amcanbackward
Definition: amapi.h:223
#define FirstOffsetNumber
Definition: off.h:27
IndexTupleData * IndexTuple
Definition: itup.h:53
#define REGBUF_STANDARD
Definition: xloginsert.h:35
Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, uint32 lowmask, uint32 maxbucket)
Definition: hashutil.c:495
void _h_spool(HSpool *hspool, ItemPointer self, Datum *values, bool *isnull)
Definition: hashsort.c:108
#define XLOG_HASH_DELETE
Definition: hash_xlog.h:39
ScanDirection
Definition: sdir.h:22
void _h_indexbuild(HSpool *hspool, Relation heapRel)
Definition: hashsort.c:119
#define RelationGetRelationName(relation)
Definition: rel.h:490
unsigned int uint32
Definition: c.h:374
void estimate_rel_size(Relation rel, int32 *attr_widths, BlockNumber *pages, double *tuples, double *allvisfrac)
Definition: plancat.c:949
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
amrescan_function amrescan
Definition: amapi.h:271
bool amcanparallel
Definition: amapi.h:241
Buffer hashso_bucket_buf
Definition: hash.h:162
int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition: hash.c:330
bool IsBufferCleanupOK(Buffer buffer)
Definition: bufmgr.c:4007
#define SizeOfHashUpdateMetaPage
Definition: hash_xlog.h:206
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:330
bool amsearchnulls
Definition: amapi.h:233
void hashendscan(IndexScanDesc scan)
Definition: hash.c:426
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:422
void _hash_checkpage(Relation rel, Buffer buf, int flags)
Definition: hashutil.c:211
HSpool * spool
Definition: hash.c:38
void * palloc0(Size size)
Definition: mcxt.c:980
#define HASH_METAPAGE
Definition: hash.h:196
uintptr_t Datum
Definition: postgres.h:367
#define HashScanPosInvalidate(scanpos)
Definition: hash.h:142
double hashm_ntuples
Definition: hash.h:246
bool hashso_buc_populated
Definition: hash.h:172
#define LH_OVERFLOW_PAGE
Definition: hash.h:54
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3757
bool amclusterable
Definition: amapi.h:237
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:211
#define HASHOPTIONS_PROC
Definition: hash.h:355
bool amsearcharray
Definition: amapi.h:231
HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
Definition: hashpage.c:1497
void _hash_relbuf(Relation rel, Buffer buf)
Definition: hashpage.c:265
int maintenance_work_mem
Definition: globals.c:123
Relation heapRel
Definition: hash.c:40
bool amusemaintenanceworkmem
Definition: amapi.h:245
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define LH_BUCKET_PAGE
Definition: hash.h:55
IndexBulkDeleteResult * hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition: hash.c:643
#define H_BUCKET_BEING_SPLIT(opaque)
Definition: hash.h:89
Datum hashhandler(PG_FUNCTION_ARGS)
Definition: hash.c:56
#define makeNode(_type_)
Definition: nodes.h:577
amadjustmembers_function amadjustmembers
Definition: amapi.h:269
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:745
Definition: regguts.h:298
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:828
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define REGBUF_NO_IMAGE
Definition: xloginsert.h:32
#define InvalidBlockNumber
Definition: block.h:33
HashPageOpaqueData * HashPageOpaque
Definition: hash.h:86
void _hash_kill_items(IndexScanDesc scan)
Definition: hashutil.c:537
void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Buffer bucket_buf, BufferAccessStrategy bstrategy)
Definition: hashovfl.c:805
ammarkpos_function ammarkpos
Definition: amapi.h:275
bool amcanorder
Definition: amapi.h:219
ambuildphasename_function ambuildphasename
Definition: amapi.h:267
#define VACUUM_OPTION_PARALLEL_BULKDEL
Definition: vacuum.h:45
#define RelationNeedsWAL(relation)
Definition: rel.h:562
amestimateparallelscan_function amestimateparallelscan
Definition: amapi.h:279
uint32 hashm_maxbucket
Definition: hash.h:252
struct ScanKeyData * keyData
Definition: relscan.h:119
HashScanPosData currPos
Definition: hash.h:187
uint16 hasho_flag
Definition: hash.h:82
bool hashso_buc_split
Definition: hash.h:178
uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
Definition: hashpage.c:326
bool hashvalidate(Oid opclassoid)
Definition: hashvalidate.c:47
uint16 amstrategies
Definition: amapi.h:213
static Datum values[MAXATTR]
Definition: bootstrap.c:167
void _h_spooldestroy(HSpool *hspool)
Definition: hashsort.c:98
#define MaxIndexTuplesPerPage
Definition: itup.h:145
#define HashPageGetMeta(page)
Definition: hash.h:321
void * palloc(Size size)
Definition: mcxt.c:949
void hashadjustmembers(Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
Definition: hashvalidate.c:352
uint16 amoptsprocnum
Definition: amapi.h:217
#define elog(elevel,...)
Definition: elog.h:214
ambuildempty_function ambuildempty
Definition: amapi.h:259
bool kill_prior_tuple
Definition: relscan.h:125
Buffer hashso_split_bucket_buf
Definition: hash.h:169
int NBuffers
Definition: globals.c:132
BlockNumber hasho_nextblkno
Definition: hash.h:80
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:81
static void hashbuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition: hash.c:206
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
int itemIndex
Definition: hash.h:123
HSpool * _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
Definition: hashsort.c:59
void vacuum_delay_point(void)
Definition: vacuum.c:1995
void XLogBeginInsert(void)
Definition: xloginsert.c:123
#define XLOG_HASH_UPDATE_META_PAGE
Definition: hash_xlog.h:43
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
double num_index_tuples
Definition: genam.h:77
int Buffer
Definition: buf.h:23
amcanreturn_function amcanreturn
Definition: amapi.h:263
#define H_HAS_DEAD_TUPLES(opaque)
Definition: hash.h:91
bool estimated_count
Definition: genam.h:76
void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition: hash.c:392
#define LH_PAGE_HAS_DEAD_TUPLES
Definition: hash.h:61
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
double index_tuples
Definition: genam.h:33
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition: genam.h:84
aminitparallelscan_function aminitparallelscan
Definition: amapi.h:280
HashScanPosItem items[MaxIndexTuplesPerPage]
Definition: hash.h:125
double heap_tuples
Definition: genam.h:32
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:121
amrestrpos_function amrestrpos
Definition: amapi.h:276