hash.c
1 /*-------------------------------------------------------------------------
2  *
3  * hash.c
4  * Implementation of Margo Seltzer's Hashing package for postgres.
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/hash/hash.c
12  *
13  * NOTES
14  * This file contains only the public interface routines.
15  *
16  *-------------------------------------------------------------------------
17  */
18 
19 #include "postgres.h"
20 
21 #include "access/hash.h"
22 #include "access/hash_xlog.h"
23 #include "access/relscan.h"
24 #include "access/tableam.h"
25 #include "catalog/index.h"
26 #include "commands/progress.h"
27 #include "commands/vacuum.h"
28 #include "miscadmin.h"
29 #include "optimizer/plancat.h"
30 #include "pgstat.h"
31 #include "utils/builtins.h"
32 #include "utils/index_selfuncs.h"
33 #include "utils/rel.h"
34 
35 /* Working state for hashbuild and its callback */
36 typedef struct
37 {
38  HSpool *spool; /* NULL if not using spooling */
39  double indtuples; /* # tuples accepted into index */
40  Relation heapRel; /* heap relation descriptor */
41 } HashBuildState;
42 
43 static void hashbuildCallback(Relation index,
44  ItemPointer tid,
45  Datum *values,
46  bool *isnull,
47  bool tupleIsAlive,
48  void *state);
49 
50 
51 /*
52  * Hash handler function: return IndexAmRoutine with access method parameters
53  * and callbacks.
54  */
55 Datum
56 hashhandler(PG_FUNCTION_ARGS)
57 {
58  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
59 
60  amroutine->amstrategies = HTMaxStrategyNumber;
61  amroutine->amsupport = HASHNProcs;
62  amroutine->amoptsprocnum = HASHOPTIONS_PROC;
63  amroutine->amcanorder = false;
64  amroutine->amcanorderbyop = false;
65  amroutine->amcanbackward = true;
66  amroutine->amcanunique = false;
67  amroutine->amcanmulticol = false;
68  amroutine->amoptionalkey = false;
69  amroutine->amsearcharray = false;
70  amroutine->amsearchnulls = false;
71  amroutine->amstorage = false;
72  amroutine->amclusterable = false;
73  amroutine->ampredlocks = true;
74  amroutine->amcanparallel = false;
75  amroutine->amcaninclude = false;
76  amroutine->amusemaintenanceworkmem = false;
77  amroutine->amparallelvacuumoptions =
78  VACUUM_OPTION_PARALLEL_BULKDEL;
79  amroutine->amkeytype = INT4OID;
80 
81  amroutine->ambuild = hashbuild;
82  amroutine->ambuildempty = hashbuildempty;
83  amroutine->aminsert = hashinsert;
84  amroutine->ambulkdelete = hashbulkdelete;
85  amroutine->amvacuumcleanup = hashvacuumcleanup;
86  amroutine->amcanreturn = NULL;
87  amroutine->amcostestimate = hashcostestimate;
88  amroutine->amoptions = hashoptions;
89  amroutine->amproperty = NULL;
90  amroutine->ambuildphasename = NULL;
91  amroutine->amvalidate = hashvalidate;
92  amroutine->amadjustmembers = hashadjustmembers;
93  amroutine->ambeginscan = hashbeginscan;
94  amroutine->amrescan = hashrescan;
95  amroutine->amgettuple = hashgettuple;
96  amroutine->amgetbitmap = hashgetbitmap;
97  amroutine->amendscan = hashendscan;
98  amroutine->ammarkpos = NULL;
99  amroutine->amrestrpos = NULL;
100  amroutine->amestimateparallelscan = NULL;
101  amroutine->aminitparallelscan = NULL;
102  amroutine->amparallelrescan = NULL;
103 
104  PG_RETURN_POINTER(amroutine);
105 }
106 
107 /*
108  * hashbuild() -- build a new hash index.
109  */
110 IndexBuildResult *
111 hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
112 {
113  IndexBuildResult *result;
114  BlockNumber relpages;
115  double reltuples;
116  double allvisfrac;
117  uint32 num_buckets;
118  long sort_threshold;
119  HashBuildState buildstate;
120 
121  /*
122  * We expect to be called exactly once for any index relation. If that's
123  * not the case, big trouble's what we have.
124  */
125  if (RelationGetNumberOfBlocks(index) != 0)
126  elog(ERROR, "index \"%s\" already contains data",
127  RelationGetRelationName(index));
128 
129  /* Estimate the number of rows currently present in the table */
130  estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
131 
132  /* Initialize the hash index metadata page and initial buckets */
133  num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
134 
135  /*
136  * If we just insert the tuples into the index in scan order, then
137  * (assuming their hash codes are pretty random) there will be no locality
138  * of access to the index, and if the index is bigger than available RAM
139  * then we'll thrash horribly. To prevent that scenario, we can sort the
140  * tuples by (expected) bucket number. However, such a sort is useless
141  * overhead when the index does fit in RAM. We choose to sort if the
142  * initial index size exceeds maintenance_work_mem, or the number of
143  * buffers usable for the index, whichever is less. (Limiting by the
144  * number of buffers should reduce thrashing between PG buffers and kernel
145  * buffers, which seems useful even if no physical I/O results. Limiting
146  * by maintenance_work_mem is useful to allow easy testing of the sort
147  * code path, and may be useful to DBAs as an additional control knob.)
148  *
149  * NOTE: this test will need adjustment if a bucket is ever different from
150  * one page. Also, "initial index size" accounting does not include the
151  * metapage, nor the first bitmap page.
152  */
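 /*
  * A worked example of the threshold arithmetic below, assuming the stock
  * defaults of maintenance_work_mem = 64MB (65536 kB), shared_buffers =
  * 128MB, and BLCKSZ = 8192: (65536 kB * 1024) / 8192 bytes per page =
  * 8192 pages, while NBuffers = 16384 for a non-temporary index, so
  * sort_threshold = Min(8192, 16384) = 8192.  A build whose estimated
  * initial bucket count reaches 8192 would therefore take the sorted
  * (spooled) path.
  */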
153  sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
154  if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
155  sort_threshold = Min(sort_threshold, NBuffers);
156  else
157  sort_threshold = Min(sort_threshold, NLocBuffer);
158 
159  if (num_buckets >= (uint32) sort_threshold)
160  buildstate.spool = _h_spoolinit(heap, index, num_buckets);
161  else
162  buildstate.spool = NULL;
163 
164  /* prepare to build the index */
165  buildstate.indtuples = 0;
166  buildstate.heapRel = heap;
167 
168  /* do the heap scan */
169  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
170  hashbuildCallback,
171  (void *) &buildstate, NULL);
172  pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
173  buildstate.indtuples);
174 
175  if (buildstate.spool)
176  {
177  /* sort the tuples and insert them into the index */
178  _h_indexbuild(buildstate.spool, buildstate.heapRel);
179  _h_spooldestroy(buildstate.spool);
180  }
181 
182  /*
183  * Return statistics
184  */
185  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
186 
187  result->heap_tuples = reltuples;
188  result->index_tuples = buildstate.indtuples;
189 
190  return result;
191 }
192 
193 /*
194  * hashbuildempty() -- build an empty hash index in the initialization fork
195  */
196 void
197 hashbuildempty(Relation index)
198 {
199  _hash_init(index, 0, INIT_FORKNUM);
200 }
201 
202 /*
203  * Per-tuple callback for table_index_build_scan
204  */
205 static void
206 hashbuildCallback(Relation index,
207  ItemPointer tid,
208  Datum *values,
209  bool *isnull,
210  bool tupleIsAlive,
211  void *state)
212 {
213  HashBuildState *buildstate = (HashBuildState *) state;
214  Datum index_values[1];
215  bool index_isnull[1];
216  IndexTuple itup;
217 
218  /* convert data to a hash key; on failure, do not insert anything */
219  if (!_hash_convert_tuple(index,
220  values, isnull,
221  index_values, index_isnull))
222  return;
223 
224  /* Either spool the tuple for sorting, or just put it into the index */
225  if (buildstate->spool)
226  _h_spool(buildstate->spool, tid, index_values, index_isnull);
227  else
228  {
229  /* form an index tuple and point it at the heap tuple */
230  itup = index_form_tuple(RelationGetDescr(index),
231  index_values, index_isnull);
232  itup->t_tid = *tid;
233  _hash_doinsert(index, itup, buildstate->heapRel);
234  pfree(itup);
235  }
236 
237  buildstate->indtuples += 1;
238 }
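/*
 * Both hashbuildCallback above and hashinsert below go through
 * _hash_convert_tuple (hashutil.c), which hashes the user-supplied key and
 * produces a single-column entry containing only the uint32 hash code; it
 * reports failure for NULL keys, which hash indexes do not store.  Because
 * the original key is never stored, matches must always be rechecked
 * against the heap, as hashgettuple notes below.
 */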
239 
240 /*
241  * hashinsert() -- insert an index tuple into a hash table.
242  *
243  * Hash on the heap tuple's key, form an index tuple with hash code.
244  * Find the appropriate location for the new tuple, and put it there.
245  */
246 bool
247 hashinsert(Relation rel, Datum *values, bool *isnull,
248  ItemPointer ht_ctid, Relation heapRel,
249  IndexUniqueCheck checkUnique,
250  bool indexUnchanged,
251  IndexInfo *indexInfo)
252 {
253  Datum index_values[1];
254  bool index_isnull[1];
255  IndexTuple itup;
256 
257  /* convert data to a hash key; on failure, do not insert anything */
258  if (!_hash_convert_tuple(rel,
259  values, isnull,
260  index_values, index_isnull))
261  return false;
262 
263  /* form an index tuple and point it at the heap tuple */
264  itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
265  itup->t_tid = *ht_ctid;
266 
267  _hash_doinsert(rel, itup, heapRel);
268 
269  pfree(itup);
270 
271  return false;
272 }
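/*
 * Per the index AM API, aminsert's boolean result matters only when
 * checkUnique is UNIQUE_CHECK_PARTIAL, where "true" would mean the new
 * entry is known unique.  Since hashhandler reports amcanunique = false,
 * uniqueness checking never applies to hash indexes, so hashinsert simply
 * returns false unconditionally.
 */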
273 
274 
275 /*
276  * hashgettuple() -- Get the next tuple in the scan.
277  */
278 bool
279 hashgettuple(IndexScanDesc scan, ScanDirection dir)
280 {
281  HashScanOpaque so = (HashScanOpaque) scan->opaque;
282  bool res;
283 
284  /* Hash indexes are always lossy since we store only the hash code */
285  scan->xs_recheck = true;
286 
287  /*
288  * If we've already initialized this scan, we can just advance it in the
289  * appropriate direction. If we haven't done so yet, we call a routine to
290  * get the first item in the scan.
291  */
292  if (!HashScanPosIsValid(so->currPos))
293  res = _hash_first(scan, dir);
294  else
295  {
296  /*
297  * Check to see if we should kill the previously-fetched tuple.
298  */
299  if (scan->kill_prior_tuple)
300  {
301  /*
302  * Yes, so remember it for later. (We'll deal with all such tuples
303  * at once right after leaving the index page or at end of scan.)
304  * If the caller reverses the indexscan direction, it is quite
305  * possible that the same item might get entered multiple times.
306  * We don't detect that; instead, we just forget any excess
307  * entries.
308  */
309  if (so->killedItems == NULL)
310  so->killedItems = (int *)
311  palloc(MaxIndexTuplesPerPage * sizeof(int));
312 
313  if (so->numKilled < MaxIndexTuplesPerPage)
314  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
315  }
316 
317  /*
318  * Now continue the scan.
319  */
320  res = _hash_next(scan, dir);
321  }
322 
323  return res;
324 }
325 
326 
327 /*
328  * hashgetbitmap() -- get all tuples at once
329  */
330 int64
331 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
332 {
333  HashScanOpaque so = (HashScanOpaque) scan->opaque;
334  bool res;
335  int64 ntids = 0;
336  HashScanPosItem *currItem;
337 
338  res = _hash_first(scan, ForwardScanDirection);
339 
340  while (res)
341  {
342  currItem = &so->currPos.items[so->currPos.itemIndex];
343 
344  /*
345  * _hash_first and _hash_next eliminate dead index entries whenever
346  * scan->ignore_killed_tuples is true. Therefore, there's nothing to do
347  * here except add the results to the TIDBitmap.
348  */
349  tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
350  ntids++;
351 
352  res = _hash_next(scan, ForwardScanDirection);
353  }
354 
355  return ntids;
356 }
357 
358 
359 /*
360  * hashbeginscan() -- start a scan on a hash index
361  */
362 IndexScanDesc
363 hashbeginscan(Relation rel, int nkeys, int norderbys)
364 {
365  IndexScanDesc scan;
366  HashScanOpaque so;
367 
368  /* no order by operators allowed */
369  Assert(norderbys == 0);
370 
371  scan = RelationGetIndexScan(rel, nkeys, norderbys);
372 
373  so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
374  HashScanPosInvalidate(so->currPos);
375  so->hashso_bucket_buf = InvalidBuffer;
376  so->hashso_split_bucket_buf = InvalidBuffer;
377 
378  so->hashso_buc_populated = false;
379  so->hashso_buc_split = false;
380 
381  so->killedItems = NULL;
382  so->numKilled = 0;
383 
384  scan->opaque = so;
385 
386  return scan;
387 }
388 
389 /*
390  * hashrescan() -- rescan an index relation
391  */
392 void
393 hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
394  ScanKey orderbys, int norderbys)
395 {
396  HashScanOpaque so = (HashScanOpaque) scan->opaque;
397  Relation rel = scan->indexRelation;
398 
399  if (HashScanPosIsValid(so->currPos))
400  {
401  /* Before leaving current page, deal with any killed items */
402  if (so->numKilled > 0)
403  _hash_kill_items(scan);
404  }
405 
406  _hash_dropscanbuf(rel, so);
407 
408  /* set position invalid (this will cause _hash_first call) */
409  HashScanPosInvalidate(so->currPos);
410 
411  /* Update scan key, if a new one is given */
412  if (scankey && scan->numberOfKeys > 0)
413  {
414  memmove(scan->keyData,
415  scankey,
416  scan->numberOfKeys * sizeof(ScanKeyData));
417  }
418 
419  so->hashso_buc_populated = false;
420  so->hashso_buc_split = false;
421 }
422 
423 /*
424  * hashendscan() -- close down a scan
425  */
426 void
427 hashendscan(IndexScanDesc scan)
428 {
429  HashScanOpaque so = (HashScanOpaque) scan->opaque;
430  Relation rel = scan->indexRelation;
431 
432  if (HashScanPosIsValid(so->currPos))
433  {
434  /* Before leaving current page, deal with any killed items */
435  if (so->numKilled > 0)
436  _hash_kill_items(scan);
437  }
438 
439  _hash_dropscanbuf(rel, so);
440 
441  if (so->killedItems != NULL)
442  pfree(so->killedItems);
443  pfree(so);
444  scan->opaque = NULL;
445 }
446 
447 /*
448  * Bulk deletion of all index entries pointing to a set of heap tuples.
449  * The set of target tuples is specified via a callback routine that tells
450  * whether any given heap tuple (identified by ItemPointer) is being deleted.
451  *
452  * This function also deletes the tuples that were moved to other
453  * buckets by a split.
454  *
455  * Result: a palloc'd struct containing statistical info for VACUUM displays.
456  */
457 IndexBulkDeleteResult *
458 hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
459  IndexBulkDeleteCallback callback, void *callback_state)
460 {
461  Relation rel = info->index;
462  double tuples_removed;
463  double num_index_tuples;
464  double orig_ntuples;
465  Bucket orig_maxbucket;
466  Bucket cur_maxbucket;
467  Bucket cur_bucket;
468  Buffer metabuf = InvalidBuffer;
469  HashMetaPage metap;
470  HashMetaPage cachedmetap;
471 
472  tuples_removed = 0;
473  num_index_tuples = 0;
474 
475  /*
476  * We need a copy of the metapage so that we can use its hashm_spares[]
477  * values to compute bucket page addresses, but a cached copy should be
478  * good enough. (If not, we'll detect that further down and refresh the
479  * cache as necessary.)
480  */
481  cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
482  Assert(cachedmetap != NULL);
483 
484  orig_maxbucket = cachedmetap->hashm_maxbucket;
485  orig_ntuples = cachedmetap->hashm_ntuples;
486 
487  /* Scan the buckets that we know exist */
488  cur_bucket = 0;
489  cur_maxbucket = orig_maxbucket;
490 
491 loop_top:
492  while (cur_bucket <= cur_maxbucket)
493  {
494  BlockNumber bucket_blkno;
495  BlockNumber blkno;
496  Buffer bucket_buf;
497  Buffer buf;
498  HashPageOpaque bucket_opaque;
499  Page page;
500  bool split_cleanup = false;
501 
502  /* Get address of bucket's start page */
503  bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
504 
505  blkno = bucket_blkno;
506 
507  /*
508  * We need to acquire a cleanup lock on the primary bucket page to wait
509  * out concurrent scans before deleting the dead tuples.
510  */
511  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
512  LockBufferForCleanup(buf);
513  _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
514 
515  page = BufferGetPage(buf);
516  bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
517 
518  /*
519  * If the bucket contains tuples that were moved by a split, then we need
520  * to delete such tuples. We can't delete them while the split operation
521  * on the bucket is unfinished, as scans still need them.
522  */
523  if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
524  H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
525  {
526  split_cleanup = true;
527 
528  /*
529  * This bucket might have been split since we last held a lock on
530  * the metapage. If so, hashm_maxbucket, hashm_highmask and
531  * hashm_lowmask might be old enough to cause us to fail to remove
532  * tuples left behind by the most recent split. To prevent that,
533  * now that the primary page of the target bucket has been locked
534  * (and thus can't be further split), check whether we need to
535  * update our cached metapage data.
536  */
537  Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
538  if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
539  {
540  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
541  Assert(cachedmetap != NULL);
542  }
543  }
544 
545  bucket_buf = buf;
546 
547  hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
548  cachedmetap->hashm_maxbucket,
549  cachedmetap->hashm_highmask,
550  cachedmetap->hashm_lowmask, &tuples_removed,
551  &num_index_tuples, split_cleanup,
552  callback, callback_state);
553 
554  _hash_dropbuf(rel, bucket_buf);
555 
556  /* Advance to next bucket */
557  cur_bucket++;
558  }
559 
560  if (BufferIsInvalid(metabuf))
561  metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
562 
563  /* Write-lock metapage and check for split since we started */
564  LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
565  metap = HashPageGetMeta(BufferGetPage(metabuf));
566 
567  if (cur_maxbucket != metap->hashm_maxbucket)
568  {
569  /* There's been a split, so process the additional bucket(s) */
570  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
571  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
572  Assert(cachedmetap != NULL);
573  cur_maxbucket = cachedmetap->hashm_maxbucket;
574  goto loop_top;
575  }
576 
577  /* Okay, we're really done. Update tuple count in metapage. */
578  START_CRIT_SECTION();
579 
580  if (orig_maxbucket == metap->hashm_maxbucket &&
581  orig_ntuples == metap->hashm_ntuples)
582  {
583  /*
584  * No one has split or inserted anything since start of scan, so
585  * believe our count as gospel.
586  */
587  metap->hashm_ntuples = num_index_tuples;
588  }
589  else
590  {
591  /*
592  * Otherwise, our count is untrustworthy since we may have
593  * double-scanned tuples in split buckets. Proceed by dead-reckoning.
594  * (Note: we still return estimated_count = false, because using this
595  * count is better than not updating reltuples at all.)
596  */
597  if (metap->hashm_ntuples > tuples_removed)
598  metap->hashm_ntuples -= tuples_removed;
599  else
600  metap->hashm_ntuples = 0;
601  num_index_tuples = metap->hashm_ntuples;
602  }
603 
604  MarkBufferDirty(metabuf);
605 
606  /* XLOG stuff */
607  if (RelationNeedsWAL(rel))
608  {
609  xl_hash_update_meta_page xlrec;
610  XLogRecPtr recptr;
611 
612  xlrec.ntuples = metap->hashm_ntuples;
613 
614  XLogBeginInsert();
615  XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
616 
617  XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
618 
619  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
620  PageSetLSN(BufferGetPage(metabuf), recptr);
621  }
622 
623  END_CRIT_SECTION();
624 
625  _hash_relbuf(rel, metabuf);
626 
627  /* return statistics */
628  if (stats == NULL)
629  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
630  stats->estimated_count = false;
631  stats->num_index_tuples = num_index_tuples;
632  stats->tuples_removed += tuples_removed;
633  /* hashvacuumcleanup will fill in num_pages */
634 
635  return stats;
636 }
637 
638 /*
639  * Post-VACUUM cleanup.
640  *
641  * Result: a palloc'd struct containing statistical info for VACUUM displays.
642  */
643 IndexBulkDeleteResult *
644 hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
645 {
646  Relation rel = info->index;
647  BlockNumber num_pages;
648 
649  /* If hashbulkdelete wasn't called, return NULL signifying no change */
650  /* Note: this covers the analyze_only case too */
651  if (stats == NULL)
652  return NULL;
653 
654  /* update statistics */
655  num_pages = RelationGetNumberOfBlocks(rel);
656  stats->num_pages = num_pages;
657 
658  return stats;
659 }
660 
661 /*
662  * Helper function to perform deletion of index entries from a bucket.
663  *
664  * This function expects that the caller has acquired a cleanup lock on the
665  * primary bucket page, and will return with a write lock again held on the
666  * primary bucket page. The lock won't necessarily be held continuously,
667  * though, because we'll release it when visiting overflow pages.
668  *
669  * There can't be any concurrent scans in progress when we first enter this
670  * function because of the cleanup lock we hold on the primary bucket page,
671  * but as soon as we release that lock, there might be. If those scans got
672  * ahead of our cleanup scan, they might see a tuple before we kill it and
673  * wake up only after VACUUM has completed and the TID has been recycled for
674  * an unrelated tuple. To avoid that calamity, we prevent scans from passing
675  * our cleanup scan by locking the next page in the bucket chain before
676  * releasing the lock on the previous page. (This type of lock chaining is not
677  * ideal, so we might want to look for a better solution at some point.)
678  *
679  * We need to retain a pin on the primary bucket to ensure that no concurrent
680  * split can start.
681  */
682 void
683 hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
684  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
685  uint32 maxbucket, uint32 highmask, uint32 lowmask,
686  double *tuples_removed, double *num_index_tuples,
687  bool split_cleanup,
688  IndexBulkDeleteCallback callback, void *callback_state)
689 {
690  BlockNumber blkno;
691  Buffer buf;
692  Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
693  bool bucket_dirty = false;
694 
695  blkno = bucket_blkno;
696  buf = bucket_buf;
697 
698  if (split_cleanup)
699  new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
700  lowmask, maxbucket);
701 
702  /* Scan each page in bucket */
703  for (;;)
704  {
705  HashPageOpaque opaque;
706  OffsetNumber offno;
707  OffsetNumber maxoffno;
708  Buffer next_buf;
709  Page page;
710  OffsetNumber deletable[MaxOffsetNumber];
711  int ndeletable = 0;
712  bool retain_pin = false;
713  bool clear_dead_marking = false;
714 
715  vacuum_delay_point();
716 
717  page = BufferGetPage(buf);
718  opaque = (HashPageOpaque) PageGetSpecialPointer(page);
719 
720  /* Scan each tuple in page */
721  maxoffno = PageGetMaxOffsetNumber(page);
722  for (offno = FirstOffsetNumber;
723  offno <= maxoffno;
724  offno = OffsetNumberNext(offno))
725  {
726  ItemPointer htup;
727  IndexTuple itup;
728  Bucket bucket;
729  bool kill_tuple = false;
730 
731  itup = (IndexTuple) PageGetItem(page,
732  PageGetItemId(page, offno));
733  htup = &(itup->t_tid);
734 
735  /*
736  * To remove the dead tuples, we rely strictly on the results of the
737  * callback function; see btvacuumpage for the detailed reason.
738  */
739  if (callback && callback(htup, callback_state))
740  {
741  kill_tuple = true;
742  if (tuples_removed)
743  *tuples_removed += 1;
744  }
745  else if (split_cleanup)
746  {
747  /* delete the tuples that are moved by split. */
748  bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
749  maxbucket,
750  highmask,
751  lowmask);
752  /* mark the item for deletion */
753  if (bucket != cur_bucket)
754  {
755  /*
756  * We expect tuples to belong to either the current bucket or
757  * new_bucket. This is ensured because we don't allow
758  * further splits from a bucket that contains garbage. See
759  * comments in _hash_expandtable.
760  */
761  Assert(bucket == new_bucket);
762  kill_tuple = true;
763  }
764  }
765 
766  if (kill_tuple)
767  {
768  /* mark the item for deletion */
769  deletable[ndeletable++] = offno;
770  }
771  else
772  {
773  /* we're keeping it, so count it */
774  if (num_index_tuples)
775  *num_index_tuples += 1;
776  }
777  }
778 
779  /* retain the pin on primary bucket page till end of bucket scan */
780  if (blkno == bucket_blkno)
781  retain_pin = true;
782  else
783  retain_pin = false;
784 
785  blkno = opaque->hasho_nextblkno;
786 
787  /*
788  * Apply deletions, advance to next page and write page if needed.
789  */
790  if (ndeletable > 0)
791  {
792  /* No ereport(ERROR) until changes are logged */
793  START_CRIT_SECTION();
794 
795  PageIndexMultiDelete(page, deletable, ndeletable);
796  bucket_dirty = true;
797 
798  /*
799  * Mark the page as clean if vacuum removes the DEAD tuples
800  * from an index page. We do this by clearing the
801  * LH_PAGE_HAS_DEAD_TUPLES flag.
802  */
803  if (tuples_removed && *tuples_removed > 0 &&
804  H_HAS_DEAD_TUPLES(opaque))
805  {
806  opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
807  clear_dead_marking = true;
808  }
809 
810  MarkBufferDirty(buf);
811 
812  /* XLOG stuff */
813  if (RelationNeedsWAL(rel))
814  {
815  xl_hash_delete xlrec;
816  XLogRecPtr recptr;
817 
818  xlrec.clear_dead_marking = clear_dead_marking;
819  xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;
820 
821  XLogBeginInsert();
822  XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
823 
824  /*
825  * The bucket buffer needs to be registered to ensure that we can
826  * acquire a cleanup lock on it during replay.
827  */
828  if (!xlrec.is_primary_bucket_page)
829  XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
830 
831  XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
832  XLogRegisterBufData(1, (char *) deletable,
833  ndeletable * sizeof(OffsetNumber));
834 
835  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
836  PageSetLSN(BufferGetPage(buf), recptr);
837  }
838 
839  END_CRIT_SECTION();
840  }
841 
842  /* bail out if there are no more pages to scan. */
843  if (!BlockNumberIsValid(blkno))
844  break;
845 
846  next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
847  LH_OVERFLOW_PAGE,
848  bstrategy);
849 
850  /*
851  * release the lock on previous page after acquiring the lock on next
852  * page
853  */
854  if (retain_pin)
855  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
856  else
857  _hash_relbuf(rel, buf);
858 
859  buf = next_buf;
860  }
861 
862  /*
863  * Lock the bucket page to clear the garbage flag and squeeze the bucket.
864  * If the current buffer is the same as the bucket buffer, then we already
865  * have a lock on the bucket page.
866  */
867  if (buf != bucket_buf)
868  {
869  _hash_relbuf(rel, buf);
870  LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
871  }
872 
873  /*
874  * Clear the garbage flag from the bucket after deleting the tuples that
875  * were moved by a split. We purposefully clear the flag before squeezing
876  * the bucket, so that after a restart, vacuum doesn't again try to delete
877  * the moved-by-split tuples.
878  */
879  if (split_cleanup)
880  {
881  HashPageOpaque bucket_opaque;
882  Page page;
883 
884  page = BufferGetPage(bucket_buf);
885  bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
886 
887  /* No ereport(ERROR) until changes are logged */
888  START_CRIT_SECTION();
889 
890  bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
891  MarkBufferDirty(bucket_buf);
892 
893  /* XLOG stuff */
894  if (RelationNeedsWAL(rel))
895  {
896  XLogRecPtr recptr;
897 
898  XLogBeginInsert();
899  XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
900 
901  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
902  PageSetLSN(page, recptr);
903  }
904 
905  END_CRIT_SECTION();
906  }
907 
908  /*
909  * If we have deleted anything, try to compact free space. For squeezing
910  * the bucket, we must have a cleanup lock; otherwise it can affect the
911  * ordering of tuples seen by a scan that started before the squeeze.
912  */
913  if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
914  _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
915  bstrategy);
916  else
917  LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
918 }