PostgreSQL Source Code  git master
hash.c
1 /*-------------------------------------------------------------------------
2  *
3  * hash.c
4  * Implementation of Margo Seltzer's Hashing package for postgres.
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/hash/hash.c
12  *
13  * NOTES
14  * This file contains only the public interface routines.
15  *
16  *-------------------------------------------------------------------------
17  */
18 
19 #include "postgres.h"
20 
21 #include "access/hash.h"
22 #include "access/hash_xlog.h"
23 #include "access/relscan.h"
24 #include "catalog/index.h"
25 #include "commands/vacuum.h"
26 #include "miscadmin.h"
27 #include "optimizer/plancat.h"
28 #include "utils/builtins.h"
29 #include "utils/index_selfuncs.h"
30 #include "utils/rel.h"
31 #include "miscadmin.h"
32 
33 
34 /* Working state for hashbuild and its callback */
35 typedef struct
36 {
37  HSpool *spool; /* NULL if not using spooling */
38  double indtuples; /* # tuples accepted into index */
39  Relation heapRel; /* heap relation descriptor */
40 } HashBuildState;
41 
42 static void hashbuildCallback(Relation index,
43  HeapTuple htup,
44  Datum *values,
45  bool *isnull,
46  bool tupleIsAlive,
47  void *state);
48 
49 
50 /*
51  * Hash handler function: return IndexAmRoutine with access method parameters
52  * and callbacks.
53  */
54 Datum
55 hashhandler(PG_FUNCTION_ARGS)
56 {
57  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
58 
59  amroutine->amstrategies = HTMaxStrategyNumber;
60  amroutine->amsupport = HASHNProcs;
61  amroutine->amcanorder = false;
62  amroutine->amcanorderbyop = false;
63  amroutine->amcanbackward = true;
64  amroutine->amcanunique = false;
65  amroutine->amcanmulticol = false;
66  amroutine->amoptionalkey = false;
67  amroutine->amsearcharray = false;
68  amroutine->amsearchnulls = false;
69  amroutine->amstorage = false;
70  amroutine->amclusterable = false;
71  amroutine->ampredlocks = false;
72  amroutine->amcanparallel = false;
73  amroutine->amkeytype = INT4OID;
74 
75  amroutine->ambuild = hashbuild;
76  amroutine->ambuildempty = hashbuildempty;
77  amroutine->aminsert = hashinsert;
78  amroutine->ambulkdelete = hashbulkdelete;
79  amroutine->amvacuumcleanup = hashvacuumcleanup;
80  amroutine->amcanreturn = NULL;
81  amroutine->amcostestimate = hashcostestimate;
82  amroutine->amoptions = hashoptions;
83  amroutine->amproperty = NULL;
84  amroutine->amvalidate = hashvalidate;
85  amroutine->ambeginscan = hashbeginscan;
86  amroutine->amrescan = hashrescan;
87  amroutine->amgettuple = hashgettuple;
88  amroutine->amgetbitmap = hashgetbitmap;
89  amroutine->amendscan = hashendscan;
90  amroutine->ammarkpos = NULL;
91  amroutine->amrestrpos = NULL;
92  amroutine->amestimateparallelscan = NULL;
93  amroutine->aminitparallelscan = NULL;
94  amroutine->amparallelrescan = NULL;
95 
96  PG_RETURN_POINTER(amroutine);
97 }
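 /*
  * Illustration: the IndexAmRoutine filled in above is what the core system
  * consults whenever a hash index is created or scanned, e.g. for a
  * hypothetical statement such as
  *
  *     CREATE INDEX idx_hash ON tab USING hash (col);
  *
  * Because amcanunique, amcanmulticol and amcanorder are all false, hash
  * indexes cannot back UNIQUE constraints, cover multiple columns, or
  * supply ordered scans.
  */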
98 
99 /*
100  * hashbuild() -- build a new hash index.
101  */
102 IndexBuildResult *
103 hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
104 {
105  IndexBuildResult *result;
106  BlockNumber relpages;
107  double reltuples;
108  double allvisfrac;
109  uint32 num_buckets;
110  long sort_threshold;
111  HashBuildState buildstate;
112 
113  /*
114  * We expect to be called exactly once for any index relation. If that's
115  * not the case, big trouble's what we have.
116  */
117  if (RelationGetNumberOfBlocks(index) != 0)
118  elog(ERROR, "index \"%s\" already contains data",
119  RelationGetRelationName(index));
120 
121  /* Estimate the number of rows currently present in the table */
122  estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
123 
124  /* Initialize the hash index metadata page and initial buckets */
125  num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
126 
127  /*
128  * If we just insert the tuples into the index in scan order, then
129  * (assuming their hash codes are pretty random) there will be no locality
130  * of access to the index, and if the index is bigger than available RAM
131  * then we'll thrash horribly. To prevent that scenario, we can sort the
132  * tuples by (expected) bucket number. However, such a sort is useless
133  * overhead when the index does fit in RAM. We choose to sort if the
134  * initial index size exceeds maintenance_work_mem, or the number of
135  * buffers usable for the index, whichever is less. (Limiting by the
136  * number of buffers should reduce thrashing between PG buffers and kernel
137  * buffers, which seems useful even if no physical I/O results. Limiting
138  * by maintenance_work_mem is useful to allow easy testing of the sort
139  * code path, and may be useful to DBAs as an additional control knob.)
140  *
141  * NOTE: this test will need adjustment if a bucket is ever different from
142  * one page. Also, "initial index size" accounting does not include the
143  * metapage, nor the first bitmap page.
144  */
145  sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
146  if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
147  sort_threshold = Min(sort_threshold, NBuffers);
148  else
149  sort_threshold = Min(sort_threshold, NLocBuffer);
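 /*
  * Worked example, assuming default settings: with maintenance_work_mem =
  * 64MB (65536 kB) and BLCKSZ = 8192, the starting threshold is
  * 65536 * 1024 / 8192 = 8192 buckets, which is then clamped to NBuffers
  * (shared_buffers, in pages) for a permanent index. Only builds whose
  * initial bucket count reaches the threshold take the spool-and-sort path
  * below.
  */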
150 
151  if (num_buckets >= (uint32) sort_threshold)
152  buildstate.spool = _h_spoolinit(heap, index, num_buckets);
153  else
154  buildstate.spool = NULL;
155 
156  /* prepare to build the index */
157  buildstate.indtuples = 0;
158  buildstate.heapRel = heap;
159 
160  /* do the heap scan */
161  reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
162  hashbuildCallback, (void *) &buildstate);
163 
164  if (buildstate.spool)
165  {
166  /* sort the tuples and insert them into the index */
167  _h_indexbuild(buildstate.spool, buildstate.heapRel);
168  _h_spooldestroy(buildstate.spool);
169  }
170 
171  /*
172  * Return statistics
173  */
174  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
175 
176  result->heap_tuples = reltuples;
177  result->index_tuples = buildstate.indtuples;
178 
179  return result;
180 }
181 
182 /*
183  * hashbuildempty() -- build an empty hash index in the initialization fork
184  */
185 void
186 hashbuildempty(Relation index)
187 {
188  _hash_init(index, 0, INIT_FORKNUM);
189 }
190 
191 /*
192  * Per-tuple callback from IndexBuildHeapScan
193  */
194 static void
195 hashbuildCallback(Relation index,
196  HeapTuple htup,
197  Datum *values,
198  bool *isnull,
199  bool tupleIsAlive,
200  void *state)
201 {
202  HashBuildState *buildstate = (HashBuildState *) state;
203  Datum index_values[1];
204  bool index_isnull[1];
205  IndexTuple itup;
206 
207  /* convert data to a hash key; on failure, do not insert anything */
208  if (!_hash_convert_tuple(index,
209  values, isnull,
210  index_values, index_isnull))
211  return;
212 
213  /* Either spool the tuple for sorting, or just put it into the index */
214  if (buildstate->spool)
215  _h_spool(buildstate->spool, &htup->t_self,
216  index_values, index_isnull);
217  else
218  {
219  /* form an index tuple and point it at the heap tuple */
220  itup = index_form_tuple(RelationGetDescr(index),
221  index_values, index_isnull);
222  itup->t_tid = htup->t_self;
223  _hash_doinsert(index, itup, buildstate->heapRel);
224  pfree(itup);
225  }
226 
227  buildstate->indtuples += 1;
228 }
229 
230 /*
231  * hashinsert() -- insert an index tuple into a hash table.
232  *
233  * Hash on the heap tuple's key, form an index tuple with hash code.
234  * Find the appropriate location for the new tuple, and put it there.
235  */
236 bool
237 hashinsert(Relation rel, Datum *values, bool *isnull,
238  ItemPointer ht_ctid, Relation heapRel,
239  IndexUniqueCheck checkUnique,
240  IndexInfo *indexInfo)
241 {
242  Datum index_values[1];
243  bool index_isnull[1];
244  IndexTuple itup;
245 
246  /* convert data to a hash key; on failure, do not insert anything */
247  if (!_hash_convert_tuple(rel,
248  values, isnull,
249  index_values, index_isnull))
250  return false;
251 
252  /* form an index tuple and point it at the heap tuple */
253  itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
254  itup->t_tid = *ht_ctid;
255 
256  _hash_doinsert(rel, itup, heapRel);
257 
258  pfree(itup);
259 
260  return false;
261 }
262 
263 
264 /*
265  * hashgettuple() -- Get the next tuple in the scan.
266  */
267 bool
268 hashgettuple(IndexScanDesc scan, ScanDirection dir)
269 {
270  HashScanOpaque so = (HashScanOpaque) scan->opaque;
271  bool res;
272 
273  /* Hash indexes are always lossy since we store only the hash code */
274  scan->xs_recheck = true;
275 
276  /*
277  * If we've already initialized this scan, we can just advance it in the
278  * appropriate direction. If we haven't done so yet, we call a routine to
279  * get the first item in the scan.
280  */
281  if (!HashScanPosIsValid(so->currPos))
282  res = _hash_first(scan, dir);
283  else
284  {
285  /*
286  * Check to see if we should kill the previously-fetched tuple.
287  */
288  if (scan->kill_prior_tuple)
289  {
290  /*
291  * Yes, so remember it for later. (We'll deal with all such tuples
292  * at once right after leaving the index page or at end of scan.)
293  * If the caller reverses the indexscan direction, it is quite
294  * possible that the same item might get entered multiple times.
295  * But we don't detect that; instead, we just forget any excess
296  * entries.
297  */
298  if (so->killedItems == NULL)
299  so->killedItems = (int *)
300  palloc(MaxIndexTuplesPerPage * sizeof(int));
301 
302  if (so->numKilled < MaxIndexTuplesPerPage)
303  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
304  }
305 
306  /*
307  * Now continue the scan.
308  */
309  res = _hash_next(scan, dir);
310  }
311 
312  return res;
313 }
314 
315 
316 /*
317  * hashgetbitmap() -- get all tuples at once
318  */
319 int64
320 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
321 {
322  HashScanOpaque so = (HashScanOpaque) scan->opaque;
323  bool res;
324  int64 ntids = 0;
325  HashScanPosItem *currItem;
326 
327  res = _hash_first(scan, ForwardScanDirection);
328 
329  while (res)
330  {
331  currItem = &so->currPos.items[so->currPos.itemIndex];
332 
333  /*
334  * _hash_first and _hash_next handle eliminating dead index entries
335  * whenever scan->ignore_killed_tuples is true. Therefore, there's
336  * nothing to do here except add the results to the TIDBitmap.
337  */
338  tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
339  ntids++;
340 
341  res = _hash_next(scan, ForwardScanDirection);
342  }
343 
344  return ntids;
345 }
346 
347 
348 /*
349  * hashbeginscan() -- start a scan on a hash index
350  */
351 IndexScanDesc
352 hashbeginscan(Relation rel, int nkeys, int norderbys)
353 {
354  IndexScanDesc scan;
355  HashScanOpaque so;
356 
357  /* no order by operators allowed */
358  Assert(norderbys == 0);
359 
360  scan = RelationGetIndexScan(rel, nkeys, norderbys);
361 
362  so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
363  HashScanPosInvalidate(so->currPos);
364  so->hashso_bucket_buf = InvalidBuffer;
365  so->hashso_split_bucket_buf = InvalidBuffer;
366 
367  so->hashso_buc_populated = false;
368  so->hashso_buc_split = false;
369 
370  so->killedItems = NULL;
371  so->numKilled = 0;
372 
373  scan->opaque = so;
374 
375  return scan;
376 }
377 
378 /*
379  * hashrescan() -- rescan an index relation
380  */
381 void
382 hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
383  ScanKey orderbys, int norderbys)
384 {
385  HashScanOpaque so = (HashScanOpaque) scan->opaque;
386  Relation rel = scan->indexRelation;
387 
388  if (HashScanPosIsValid(so->currPos))
389  {
390  /* Before leaving current page, deal with any killed items */
391  if (so->numKilled > 0)
392  _hash_kill_items(scan);
393  }
394 
395  _hash_dropscanbuf(rel, so);
396 
397  /* set position invalid (this will cause _hash_first call) */
398  HashScanPosInvalidate(so->currPos);
399 
400  /* Update scan key, if a new one is given */
401  if (scankey && scan->numberOfKeys > 0)
402  {
403  memmove(scan->keyData,
404  scankey,
405  scan->numberOfKeys * sizeof(ScanKeyData));
406  }
407 
408  so->hashso_buc_populated = false;
409  so->hashso_buc_split = false;
410 }
411 
412 /*
413  * hashendscan() -- close down a scan
414  */
415 void
416 hashendscan(IndexScanDesc scan)
417 {
418  HashScanOpaque so = (HashScanOpaque) scan->opaque;
419  Relation rel = scan->indexRelation;
420 
421  if (HashScanPosIsValid(so->currPos))
422  {
423  /* Before leaving current page, deal with any killed items */
424  if (so->numKilled > 0)
425  _hash_kill_items(scan);
426  }
427 
428  _hash_dropscanbuf(rel, so);
429 
430  if (so->killedItems != NULL)
431  pfree(so->killedItems);
432  pfree(so);
433  scan->opaque = NULL;
434 }
435 
436 /*
437  * Bulk deletion of all index entries pointing to a set of heap tuples.
438  * The set of target tuples is specified via a callback routine that tells
439  * whether any given heap tuple (identified by ItemPointer) is being deleted.
440  *
441  * This function also deletes the tuples that were moved by a split to
442  * another bucket.
443  *
444  * Result: a palloc'd struct containing statistical info for VACUUM displays.
445  */
446 IndexBulkDeleteResult *
447 hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
448  IndexBulkDeleteCallback callback, void *callback_state)
449 {
450  Relation rel = info->index;
451  double tuples_removed;
452  double num_index_tuples;
453  double orig_ntuples;
454  Bucket orig_maxbucket;
455  Bucket cur_maxbucket;
456  Bucket cur_bucket;
457  Buffer metabuf = InvalidBuffer;
458  HashMetaPage metap;
459  HashMetaPage cachedmetap;
460 
461  tuples_removed = 0;
462  num_index_tuples = 0;
463 
464  /*
465  * We need a copy of the metapage so that we can use its hashm_spares[]
466  * values to compute bucket page addresses, but a cached copy should be
467  * good enough. (If not, we'll detect that further down and refresh the
468  * cache as necessary.)
469  */
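 /*
  * (Illustration: BUCKET_TO_BLKNO maps bucket 0 to block 1, just past the
  * metapage at block 0; higher-numbered buckets are additionally offset by
  * the overflow pages allocated before them, which is the information
  * hashm_spares[] records.)
  */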
470  cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
471  Assert(cachedmetap != NULL);
472 
473  orig_maxbucket = cachedmetap->hashm_maxbucket;
474  orig_ntuples = cachedmetap->hashm_ntuples;
475 
476  /* Scan the buckets that we know exist */
477  cur_bucket = 0;
478  cur_maxbucket = orig_maxbucket;
479 
480 loop_top:
481  while (cur_bucket <= cur_maxbucket)
482  {
483  BlockNumber bucket_blkno;
484  BlockNumber blkno;
485  Buffer bucket_buf;
486  Buffer buf;
487  HashPageOpaque bucket_opaque;
488  Page page;
489  bool split_cleanup = false;
490 
491  /* Get address of bucket's start page */
492  bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
493 
494  blkno = bucket_blkno;
495 
496  /*
497  * We need to acquire a cleanup lock on the primary bucket page to wait
498  * out concurrent scans before deleting the dead tuples.
499  */
500  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
501  LockBufferForCleanup(buf);
502  _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
503 
504  page = BufferGetPage(buf);
505  bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
506 
507  /*
508  * If the bucket contains tuples that were moved by a split, then we need
509  * to delete such tuples. We can't delete them while the split operation
510  * on the bucket is unfinished, as they are still needed by scans.
511  */
512  if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
513  H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
514  {
515  split_cleanup = true;
516 
517  /*
518  * This bucket might have been split since we last held a lock on
519  * the metapage. If so, hashm_maxbucket, hashm_highmask and
520  * hashm_lowmask might be old enough to cause us to fail to remove
521  * tuples left behind by the most recent split. To prevent that,
522  * now that the primary page of the target bucket has been locked
523  * (and thus can't be further split), check whether we need to
524  * update our cached metapage data.
525  */
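 /*
  * (As of this code, for a primary bucket page hasho_prevblkno does not hold
  * a block number; it caches the hashm_maxbucket value in effect when the
  * bucket was last split, which is why it can be compared against the cached
  * metapage's hashm_maxbucket below.)
  */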
526  Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
527  if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
528  {
529  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
530  Assert(cachedmetap != NULL);
531  }
532  }
533 
534  bucket_buf = buf;
535 
536  hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
537  cachedmetap->hashm_maxbucket,
538  cachedmetap->hashm_highmask,
539  cachedmetap->hashm_lowmask, &tuples_removed,
540  &num_index_tuples, split_cleanup,
541  callback, callback_state);
542 
543  _hash_dropbuf(rel, bucket_buf);
544 
545  /* Advance to next bucket */
546  cur_bucket++;
547  }
548 
549  if (BufferIsInvalid(metabuf))
550  metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
551 
552  /* Write-lock metapage and check for split since we started */
553  LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
554  metap = HashPageGetMeta(BufferGetPage(metabuf));
555 
556  if (cur_maxbucket != metap->hashm_maxbucket)
557  {
558  /* There's been a split, so process the additional bucket(s) */
559  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
560  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
561  Assert(cachedmetap != NULL);
562  cur_maxbucket = cachedmetap->hashm_maxbucket;
563  goto loop_top;
564  }
565 
566  /* Okay, we're really done. Update tuple count in metapage. */
567  START_CRIT_SECTION();
568 
569  if (orig_maxbucket == metap->hashm_maxbucket &&
570  orig_ntuples == metap->hashm_ntuples)
571  {
572  /*
573  * No one has split or inserted anything since start of scan, so
574  * believe our count as gospel.
575  */
576  metap->hashm_ntuples = num_index_tuples;
577  }
578  else
579  {
580  /*
581  * Otherwise, our count is untrustworthy since we may have
582  * double-scanned tuples in split buckets. Proceed by dead-reckoning.
583  * (Note: we still return estimated_count = false, because using this
584  * count is better than not updating reltuples at all.)
585  */
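 /*
  * For illustration: if the metapage said 10000 tuples and this scan removed
  * 1500, we record 8500 here, accepting that the true count may differ
  * because of concurrent inserts or double-scanned split buckets.
  */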
586  if (metap->hashm_ntuples > tuples_removed)
587  metap->hashm_ntuples -= tuples_removed;
588  else
589  metap->hashm_ntuples = 0;
590  num_index_tuples = metap->hashm_ntuples;
591  }
592 
593  MarkBufferDirty(metabuf);
594 
595  /* XLOG stuff */
596  if (RelationNeedsWAL(rel))
597  {
598  xl_hash_update_meta_page xlrec;
599  XLogRecPtr recptr;
600 
601  xlrec.ntuples = metap->hashm_ntuples;
602 
603  XLogBeginInsert();
604  XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
605 
606  XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
607 
608  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
609  PageSetLSN(BufferGetPage(metabuf), recptr);
610  }
611 
612  END_CRIT_SECTION();
613 
614  _hash_relbuf(rel, metabuf);
615 
616  /* return statistics */
617  if (stats == NULL)
618  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
619  stats->estimated_count = false;
620  stats->num_index_tuples = num_index_tuples;
621  stats->tuples_removed += tuples_removed;
622  /* hashvacuumcleanup will fill in num_pages */
623 
624  return stats;
625 }
626 
627 /*
628  * Post-VACUUM cleanup.
629  *
630  * Result: a palloc'd struct containing statistical info for VACUUM displays.
631  */
632 IndexBulkDeleteResult *
633 hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
634 {
635  Relation rel = info->index;
636  BlockNumber num_pages;
637 
638  /* If hashbulkdelete wasn't called, return NULL signifying no change */
639  /* Note: this covers the analyze_only case too */
640  if (stats == NULL)
641  return NULL;
642 
643  /* update statistics */
644  num_pages = RelationGetNumberOfBlocks(rel);
645  stats->num_pages = num_pages;
646 
647  return stats;
648 }
649 
650 /*
651  * Helper function to perform deletion of index entries from a bucket.
652  *
653  * This function expects that the caller has acquired a cleanup lock on the
654  * primary bucket page, and will return with a write lock again held on the
655  * primary bucket page. The lock won't necessarily be held continuously,
656  * though, because we'll release it when visiting overflow pages.
657  *
658  * There can't be any concurrent scans in progress when we first enter this
659  * function because of the cleanup lock we hold on the primary bucket page,
660  * but as soon as we release that lock, there might be. If those scans got
661  * ahead of our cleanup scan, they might see a tuple before we kill it and
662  * wake up only after VACUUM has completed and the TID has been recycled for
663  * an unrelated tuple. To avoid that calamity, we prevent scans from passing
664  * our cleanup scan by locking the next page in the bucket chain before
665  * releasing the lock on the previous page. (This type of lock chaining is not
666  * ideal, so we might want to look for a better solution at some point.)
667  *
668  * We need to retain a pin on the primary bucket to ensure that no concurrent
669  * split can start.
670  */
671 void
672 hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
673  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
674  uint32 maxbucket, uint32 highmask, uint32 lowmask,
675  double *tuples_removed, double *num_index_tuples,
676  bool split_cleanup,
677  IndexBulkDeleteCallback callback, void *callback_state)
678 {
679  BlockNumber blkno;
680  Buffer buf;
681  Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
682  bool bucket_dirty = false;
683 
684  blkno = bucket_blkno;
685  buf = bucket_buf;
686 
687  if (split_cleanup)
688  new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
689  lowmask, maxbucket);
690 
691  /* Scan each page in bucket */
692  for (;;)
693  {
694  HashPageOpaque opaque;
695  OffsetNumber offno;
696  OffsetNumber maxoffno;
697  Buffer next_buf;
698  Page page;
699  OffsetNumber deletable[MaxOffsetNumber];
700  int ndeletable = 0;
701  bool retain_pin = false;
702  bool clear_dead_marking = false;
703 
704  vacuum_delay_point();
705 
706  page = BufferGetPage(buf);
707  opaque = (HashPageOpaque) PageGetSpecialPointer(page);
708 
709  /* Scan each tuple in page */
710  maxoffno = PageGetMaxOffsetNumber(page);
711  for (offno = FirstOffsetNumber;
712  offno <= maxoffno;
713  offno = OffsetNumberNext(offno))
714  {
715  ItemPointer htup;
716  IndexTuple itup;
717  Bucket bucket;
718  bool kill_tuple = false;
719 
720  itup = (IndexTuple) PageGetItem(page,
721  PageGetItemId(page, offno));
722  htup = &(itup->t_tid);
723 
724  /*
725  * To remove the dead tuples, we strictly want to rely on the results
726  * of the callback function. Refer to btvacuumpage for the detailed reason.
727  */
728  if (callback && callback(htup, callback_state))
729  {
730  kill_tuple = true;
731  if (tuples_removed)
732  *tuples_removed += 1;
733  }
734  else if (split_cleanup)
735  {
736  /* delete the tuples that are moved by split. */
737  bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
738  maxbucket,
739  highmask,
740  lowmask);
741  /* mark the item for deletion */
742  if (bucket != cur_bucket)
743  {
744  /*
745  * We expect tuples to belong to either the current bucket or
746  * new_bucket. This is ensured because we don't allow
747  * further splits from a bucket that contains garbage. See
748  * comments in _hash_expandtable.
749  */
750  Assert(bucket == new_bucket);
751  kill_tuple = true;
752  }
753  }
754 
755  if (kill_tuple)
756  {
757  /* mark the item for deletion */
758  deletable[ndeletable++] = offno;
759  }
760  else
761  {
762  /* we're keeping it, so count it */
763  if (num_index_tuples)
764  *num_index_tuples += 1;
765  }
766  }
767 
768  /* retain the pin on primary bucket page till end of bucket scan */
769  if (blkno == bucket_blkno)
770  retain_pin = true;
771  else
772  retain_pin = false;
773 
774  blkno = opaque->hasho_nextblkno;
775 
776  /*
777  * Apply deletions, advance to next page and write page if needed.
778  */
779  if (ndeletable > 0)
780  {
781  /* No ereport(ERROR) until changes are logged */
782  START_CRIT_SECTION();
783 
784  PageIndexMultiDelete(page, deletable, ndeletable);
785  bucket_dirty = true;
786 
787  /*
788  * Mark the page as clean if vacuum removes the DEAD tuples
789  * from an index page. We do this by clearing the
790  * LH_PAGE_HAS_DEAD_TUPLES flag.
791  */
792  if (tuples_removed && *tuples_removed > 0 &&
793  H_HAS_DEAD_TUPLES(opaque))
794  {
795  opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
796  clear_dead_marking = true;
797  }
798 
799  MarkBufferDirty(buf);
800 
801  /* XLOG stuff */
802  if (RelationNeedsWAL(rel))
803  {
804  xl_hash_delete xlrec;
805  XLogRecPtr recptr;
806 
807  xlrec.clear_dead_marking = clear_dead_marking;
808  xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;
809 
810  XLogBeginInsert();
811  XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
812 
813  /*
814  * bucket buffer needs to be registered to ensure that we can
815  * acquire a cleanup lock on it during replay.
816  */
817  if (!xlrec.is_primary_bucket_page)
818  XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
819 
820  XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
821  XLogRegisterBufData(1, (char *) deletable,
822  ndeletable * sizeof(OffsetNumber));
823 
824  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
825  PageSetLSN(BufferGetPage(buf), recptr);
826  }
827 
828  END_CRIT_SECTION();
829  }
830 
831  /* bail out if there are no more pages to scan. */
832  if (!BlockNumberIsValid(blkno))
833  break;
834 
835  next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
836  LH_OVERFLOW_PAGE,
837  bstrategy);
838 
839  /*
840  * release the lock on previous page after acquiring the lock on next
841  * page
842  */
843  if (retain_pin)
844  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
845  else
846  _hash_relbuf(rel, buf);
847 
848  buf = next_buf;
849  }
850 
851  /*
852  * Lock the bucket page to clear the garbage flag and squeeze the bucket.
853  * If the current buffer is the same as the bucket buffer, then we already
854  * have a lock on the bucket page.
855  */
856  if (buf != bucket_buf)
857  {
858  _hash_relbuf(rel, buf);
859  LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
860  }
861 
862  /*
863  * Clear the garbage flag from the bucket after deleting the tuples that
864  * were moved by a split. We purposely clear the flag before squeezing the
865  * bucket, so that after a restart, vacuum doesn't again try to delete the
866  * moved-by-split tuples.
867  */
868  if (split_cleanup)
869  {
870  HashPageOpaque bucket_opaque;
871  Page page;
872 
873  page = BufferGetPage(bucket_buf);
874  bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
875 
876  /* No ereport(ERROR) until changes are logged */
877  START_CRIT_SECTION();
878 
879  bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
880  MarkBufferDirty(bucket_buf);
881 
882  /* XLOG stuff */
883  if (RelationNeedsWAL(rel))
884  {
885  XLogRecPtr recptr;
886 
887  XLogBeginInsert();
888  XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
889 
890  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
891  PageSetLSN(page, recptr);
892  }
893 
894  END_CRIT_SECTION();
895  }
896 
897  /*
898  * If we have deleted anything, try to compact free space. For squeezing
899  * the bucket, we must have a cleanup lock, else it can impact the
900  * ordering of tuples for a scan that has started before it.
901  */
902  if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
903  _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
904  bstrategy);
905  else
906  LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
907 }