PostgreSQL Source Code  git master
hash.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * hash.c
4  * Implementation of Margo Seltzer's Hashing package for postgres.
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/hash/hash.c
12  *
13  * NOTES
14  * This file contains only the public interface routines.
15  *
16  *-------------------------------------------------------------------------
17  */
18 
19 #include "postgres.h"
20 
21 #include "access/hash.h"
22 #include "access/hash_xlog.h"
23 #include "access/relscan.h"
24 #include "access/tableam.h"
25 #include "catalog/index.h"
26 #include "commands/progress.h"
27 #include "commands/vacuum.h"
28 #include "miscadmin.h"
29 #include "optimizer/plancat.h"
30 #include "pgstat.h"
31 #include "utils/builtins.h"
32 #include "utils/index_selfuncs.h"
33 #include "utils/rel.h"
34 
35 /* Working state for hashbuild and its callback */
36 typedef struct
37 {
38  HSpool *spool; /* NULL if not using spooling */
39  double indtuples; /* # tuples accepted into index */
40  Relation heapRel; /* heap relation descriptor */
42 
/* Forward declaration of the per-tuple callback for table_index_build_scan. */
43 static void hashbuildCallback(Relation index,
44  ItemPointer tid,
45  Datum *values,
46  bool *isnull,
47  bool tupleIsAlive,
48  void *state);
49 
50 
51 /*
52  * Hash handler function: return IndexAmRoutine with access method parameters
53  * and callbacks.
54  */
55 Datum
57 {
59 
60  amroutine->amstrategies = HTMaxStrategyNumber;
61  amroutine->amsupport = HASHNProcs;
62  amroutine->amcanorder = false;
63  amroutine->amcanorderbyop = false;
64  amroutine->amcanbackward = true;
65  amroutine->amcanunique = false;
66  amroutine->amcanmulticol = false;
67  amroutine->amoptionalkey = false;
68  amroutine->amsearcharray = false;
69  amroutine->amsearchnulls = false;
70  amroutine->amstorage = false;
71  amroutine->amclusterable = false;
72  amroutine->ampredlocks = true;
73  amroutine->amcanparallel = false;
74  amroutine->amcaninclude = false;
75  amroutine->amkeytype = INT4OID;
76 
77  amroutine->ambuild = hashbuild;
78  amroutine->ambuildempty = hashbuildempty;
79  amroutine->aminsert = hashinsert;
80  amroutine->ambulkdelete = hashbulkdelete;
82  amroutine->amcanreturn = NULL;
83  amroutine->amcostestimate = hashcostestimate;
84  amroutine->amoptions = hashoptions;
85  amroutine->amproperty = NULL;
86  amroutine->ambuildphasename = NULL;
87  amroutine->amvalidate = hashvalidate;
88  amroutine->ambeginscan = hashbeginscan;
89  amroutine->amrescan = hashrescan;
90  amroutine->amgettuple = hashgettuple;
91  amroutine->amgetbitmap = hashgetbitmap;
92  amroutine->amendscan = hashendscan;
93  amroutine->ammarkpos = NULL;
94  amroutine->amrestrpos = NULL;
95  amroutine->amestimateparallelscan = NULL;
96  amroutine->aminitparallelscan = NULL;
97  amroutine->amparallelrescan = NULL;
98 
99  PG_RETURN_POINTER(amroutine);
100 }
101 
102 /*
103  * hashbuild() -- build a new hash index.
104  */
107 {
108  IndexBuildResult *result;
110  double reltuples;
111  double allvisfrac;
112  uint32 num_buckets;
113  long sort_threshold;
114  HashBuildState buildstate;
115 
116  /*
117  * We expect to be called exactly once for any index relation. If that's
118  * not the case, big trouble's what we have.
119  */
120  if (RelationGetNumberOfBlocks(index) != 0)
121  elog(ERROR, "index \"%s\" already contains data",
122  RelationGetRelationName(index));
123 
124  /* Estimate the number of rows currently present in the table */
125  estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
126 
127  /* Initialize the hash index metadata page and initial buckets */
128  num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
129 
130  /*
131  * If we just insert the tuples into the index in scan order, then
132  * (assuming their hash codes are pretty random) there will be no locality
133  * of access to the index, and if the index is bigger than available RAM
134  * then we'll thrash horribly. To prevent that scenario, we can sort the
135  * tuples by (expected) bucket number. However, such a sort is useless
136  * overhead when the index does fit in RAM. We choose to sort if the
137  * initial index size exceeds maintenance_work_mem, or the number of
138  * buffers usable for the index, whichever is less. (Limiting by the
139  * number of buffers should reduce thrashing between PG buffers and kernel
140  * buffers, which seems useful even if no physical I/O results. Limiting
141  * by maintenance_work_mem is useful to allow easy testing of the sort
142  * code path, and may be useful to DBAs as an additional control knob.)
143  *
144  * NOTE: this test will need adjustment if a bucket is ever different from
145  * one page. Also, "initial index size" accounting does not include the
146  * metapage, nor the first bitmap page.
147  */
148  sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
149  if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
150  sort_threshold = Min(sort_threshold, NBuffers);
151  else
152  sort_threshold = Min(sort_threshold, NLocBuffer);
153 
154  if (num_buckets >= (uint32) sort_threshold)
155  buildstate.spool = _h_spoolinit(heap, index, num_buckets);
156  else
157  buildstate.spool = NULL;
158 
159  /* prepare to build the index */
160  buildstate.indtuples = 0;
161  buildstate.heapRel = heap;
162 
163  /* do the heap scan */
164  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
166  (void *) &buildstate, NULL);
168  buildstate.indtuples);
169 
170  if (buildstate.spool)
171  {
172  /* sort the tuples and insert them into the index */
173  _h_indexbuild(buildstate.spool, buildstate.heapRel);
174  _h_spooldestroy(buildstate.spool);
175  }
176 
177  /*
178  * Return statistics
179  */
180  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
181 
182  result->heap_tuples = reltuples;
183  result->index_tuples = buildstate.indtuples;
184 
185  return result;
186 }
187 
188 /*
189  * hashbuildempty() -- build an empty hash index in the initialization fork
190  */
191 void
193 {
194  _hash_init(index, 0, INIT_FORKNUM);
195 }
196 
197 /*
198  * Per-tuple callback for table_index_build_scan
199  */
200 static void
202  ItemPointer tid,
203  Datum *values,
204  bool *isnull,
205  bool tupleIsAlive,
206  void *state)
207 {
208  HashBuildState *buildstate = (HashBuildState *) state;
209  Datum index_values[1];
210  bool index_isnull[1];
211  IndexTuple itup;
212 
213  /* convert data to a hash key; on failure, do not insert anything */
214  if (!_hash_convert_tuple(index,
215  values, isnull,
216  index_values, index_isnull))
217  return;
218 
219  /* Either spool the tuple for sorting, or just put it into the index */
220  if (buildstate->spool)
221  _h_spool(buildstate->spool, tid, index_values, index_isnull);
222  else
223  {
224  /* form an index tuple and point it at the heap tuple */
225  itup = index_form_tuple(RelationGetDescr(index),
226  index_values, index_isnull);
227  itup->t_tid = *tid;
228  _hash_doinsert(index, itup, buildstate->heapRel);
229  pfree(itup);
230  }
231 
232  buildstate->indtuples += 1;
233 }
234 
235 /*
236  * hashinsert() -- insert an index tuple into a hash table.
237  *
238  * Hash on the heap tuple's key, form an index tuple with hash code.
239  * Find the appropriate location for the new tuple, and put it there.
240  */
241 bool
242 hashinsert(Relation rel, Datum *values, bool *isnull,
243  ItemPointer ht_ctid, Relation heapRel,
244  IndexUniqueCheck checkUnique,
245  IndexInfo *indexInfo)
246 {
247  Datum index_values[1];
248  bool index_isnull[1];
249  IndexTuple itup;
250 
251  /* convert data to a hash key; on failure, do not insert anything */
252  if (!_hash_convert_tuple(rel,
253  values, isnull,
254  index_values, index_isnull))
255  return false;
256 
257  /* form an index tuple and point it at the heap tuple */
258  itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
259  itup->t_tid = *ht_ctid;
260 
261  _hash_doinsert(rel, itup, heapRel);
262 
263  pfree(itup);
264 
265  return false;
266 }
267 
268 
269 /*
270  * hashgettuple() -- Get the next tuple in the scan.
271  */
272 bool
274 {
275  HashScanOpaque so = (HashScanOpaque) scan->opaque;
276  bool res;
277 
278  /* Hash indexes are always lossy since we store only the hash code */
279  scan->xs_recheck = true;
280 
281  /*
282  * If we've already initialized this scan, we can just advance it in the
283  * appropriate direction. If we haven't done so yet, we call a routine to
284  * get the first item in the scan.
285  */
286  if (!HashScanPosIsValid(so->currPos))
287  res = _hash_first(scan, dir);
288  else
289  {
290  /*
291  * Check to see if we should kill the previously-fetched tuple.
292  */
293  if (scan->kill_prior_tuple)
294  {
295  /*
296  * Yes, so remember it for later. (We'll deal with all such tuples
297  * at once right after leaving the index page or at end of scan.)
298  * If the caller reverses the indexscan direction, it is quite
299  * possible that the same item might get entered multiple times.
300  * But, we don't detect that; instead, we just forget any excess
301  * entries.
302  */
303  if (so->killedItems == NULL)
304  so->killedItems = (int *)
305  palloc(MaxIndexTuplesPerPage * sizeof(int));
306 
308  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
309  }
310 
311  /*
312  * Now continue the scan.
313  */
314  res = _hash_next(scan, dir);
315  }
316 
317  return res;
318 }
319 
320 
321 /*
322  * hashgetbitmap() -- get all tuples at once
323  */
324 int64
326 {
327  HashScanOpaque so = (HashScanOpaque) scan->opaque;
328  bool res;
329  int64 ntids = 0;
330  HashScanPosItem *currItem;
331 
332  res = _hash_first(scan, ForwardScanDirection);
333 
334  while (res)
335  {
336  currItem = &so->currPos.items[so->currPos.itemIndex];
337 
338  /*
339  * _hash_first and _hash_next eliminate dead index entries
340  * whenever scan->ignore_killed_tuples is true. Therefore, there's
341  * nothing to do here except add the results to the TIDBitmap.
342  */
343  tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
344  ntids++;
345 
346  res = _hash_next(scan, ForwardScanDirection);
347  }
348 
349  return ntids;
350 }
351 
352 
353 /*
354  * hashbeginscan() -- start a scan on a hash index
355  */
357 hashbeginscan(Relation rel, int nkeys, int norderbys)
358 {
359  IndexScanDesc scan;
360  HashScanOpaque so;
361 
362  /* no order by operators allowed */
363  Assert(norderbys == 0);
364 
365  scan = RelationGetIndexScan(rel, nkeys, norderbys);
366 
367  so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
371 
372  so->hashso_buc_populated = false;
373  so->hashso_buc_split = false;
374 
375  so->killedItems = NULL;
376  so->numKilled = 0;
377 
378  scan->opaque = so;
379 
380  return scan;
381 }
382 
383 /*
384  * hashrescan() -- rescan an index relation
385  */
386 void
387 hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
388  ScanKey orderbys, int norderbys)
389 {
390  HashScanOpaque so = (HashScanOpaque) scan->opaque;
391  Relation rel = scan->indexRelation;
392 
393  if (HashScanPosIsValid(so->currPos))
394  {
395  /* Before leaving current page, deal with any killed items */
396  if (so->numKilled > 0)
397  _hash_kill_items(scan);
398  }
399 
400  _hash_dropscanbuf(rel, so);
401 
402  /* set position invalid (this will cause _hash_first call) */
404 
405  /* Update scan key, if a new one is given */
406  if (scankey && scan->numberOfKeys > 0)
407  {
408  memmove(scan->keyData,
409  scankey,
410  scan->numberOfKeys * sizeof(ScanKeyData));
411  }
412 
413  so->hashso_buc_populated = false;
414  so->hashso_buc_split = false;
415 }
416 
417 /*
418  * hashendscan() -- close down a scan
419  */
420 void
422 {
423  HashScanOpaque so = (HashScanOpaque) scan->opaque;
424  Relation rel = scan->indexRelation;
425 
426  if (HashScanPosIsValid(so->currPos))
427  {
428  /* Before leaving current page, deal with any killed items */
429  if (so->numKilled > 0)
430  _hash_kill_items(scan);
431  }
432 
433  _hash_dropscanbuf(rel, so);
434 
435  if (so->killedItems != NULL)
436  pfree(so->killedItems);
437  pfree(so);
438  scan->opaque = NULL;
439 }
440 
441 /*
442  * Bulk deletion of all index entries pointing to a set of heap tuples.
443  * The set of target tuples is specified via a callback routine that tells
444  * whether any given heap tuple (identified by ItemPointer) is being deleted.
445  *
446  * This function also deletes the tuples that are moved by split to another
447  * bucket.
448  *
449  * Result: a palloc'd struct containing statistical info for VACUUM displays.
450  */
453  IndexBulkDeleteCallback callback, void *callback_state)
454 {
455  Relation rel = info->index;
456  double tuples_removed;
457  double num_index_tuples;
458  double orig_ntuples;
459  Bucket orig_maxbucket;
460  Bucket cur_maxbucket;
461  Bucket cur_bucket;
462  Buffer metabuf = InvalidBuffer;
463  HashMetaPage metap;
464  HashMetaPage cachedmetap;
465 
466  tuples_removed = 0;
467  num_index_tuples = 0;
468 
469  /*
470  * We need a copy of the metapage so that we can use its hashm_spares[]
471  * values to compute bucket page addresses, but a cached copy should be
472  * good enough. (If not, we'll detect that further down and refresh the
473  * cache as necessary.)
474  */
475  cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
476  Assert(cachedmetap != NULL);
477 
478  orig_maxbucket = cachedmetap->hashm_maxbucket;
479  orig_ntuples = cachedmetap->hashm_ntuples;
480 
481  /* Scan the buckets that we know exist */
482  cur_bucket = 0;
483  cur_maxbucket = orig_maxbucket;
484 
485 loop_top:
486  while (cur_bucket <= cur_maxbucket)
487  {
488  BlockNumber bucket_blkno;
490  Buffer bucket_buf;
491  Buffer buf;
492  HashPageOpaque bucket_opaque;
493  Page page;
494  bool split_cleanup = false;
495 
496  /* Get address of bucket's start page */
497  bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
498 
499  blkno = bucket_blkno;
500 
501  /*
502  * We need to acquire a cleanup lock on the primary bucket page to wait
503  * out concurrent scans before deleting the dead tuples.
504  */
505  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
507  _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
508 
509  page = BufferGetPage(buf);
510  bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
511 
512  /*
513  * If the bucket contains tuples that are moved by split, then we need
514  * to delete such tuples. We can't delete such tuples if the split
515  * operation on bucket is not finished as those are needed by scans.
516  */
517  if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
518  H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
519  {
520  split_cleanup = true;
521 
522  /*
523  * This bucket might have been split since we last held a lock on
524  * the metapage. If so, hashm_maxbucket, hashm_highmask and
525  * hashm_lowmask might be old enough to cause us to fail to remove
526  * tuples left behind by the most recent split. To prevent that,
527  * now that the primary page of the target bucket has been locked
528  * (and thus can't be further split), check whether we need to
529  * update our cached metapage data.
530  */
531  Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
532  if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
533  {
534  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
535  Assert(cachedmetap != NULL);
536  }
537  }
538 
539  bucket_buf = buf;
540 
541  hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
542  cachedmetap->hashm_maxbucket,
543  cachedmetap->hashm_highmask,
544  cachedmetap->hashm_lowmask, &tuples_removed,
545  &num_index_tuples, split_cleanup,
546  callback, callback_state);
547 
548  _hash_dropbuf(rel, bucket_buf);
549 
550  /* Advance to next bucket */
551  cur_bucket++;
552  }
553 
554  if (BufferIsInvalid(metabuf))
556 
557  /* Write-lock metapage and check for split since we started */
559  metap = HashPageGetMeta(BufferGetPage(metabuf));
560 
561  if (cur_maxbucket != metap->hashm_maxbucket)
562  {
563  /* There's been a split, so process the additional bucket(s) */
564  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
565  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
566  Assert(cachedmetap != NULL);
567  cur_maxbucket = cachedmetap->hashm_maxbucket;
568  goto loop_top;
569  }
570 
571  /* Okay, we're really done. Update tuple count in metapage. */
573 
574  if (orig_maxbucket == metap->hashm_maxbucket &&
575  orig_ntuples == metap->hashm_ntuples)
576  {
577  /*
578  * No one has split or inserted anything since start of scan, so
579  * believe our count as gospel.
580  */
581  metap->hashm_ntuples = num_index_tuples;
582  }
583  else
584  {
585  /*
586  * Otherwise, our count is untrustworthy since we may have
587  * double-scanned tuples in split buckets. Proceed by dead-reckoning.
588  * (Note: we still return estimated_count = false, because using this
589  * count is better than not updating reltuples at all.)
590  */
591  if (metap->hashm_ntuples > tuples_removed)
592  metap->hashm_ntuples -= tuples_removed;
593  else
594  metap->hashm_ntuples = 0;
595  num_index_tuples = metap->hashm_ntuples;
596  }
597 
598  MarkBufferDirty(metabuf);
599 
600  /* XLOG stuff */
601  if (RelationNeedsWAL(rel))
602  {
604  XLogRecPtr recptr;
605 
606  xlrec.ntuples = metap->hashm_ntuples;
607 
608  XLogBeginInsert();
609  XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
610 
611  XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
612 
613  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
614  PageSetLSN(BufferGetPage(metabuf), recptr);
615  }
616 
618 
619  _hash_relbuf(rel, metabuf);
620 
621  /* return statistics */
622  if (stats == NULL)
624  stats->estimated_count = false;
625  stats->num_index_tuples = num_index_tuples;
626  stats->tuples_removed += tuples_removed;
627  /* hashvacuumcleanup will fill in num_pages */
628 
629  return stats;
630 }
631 
632 /*
633  * Post-VACUUM cleanup.
634  *
635  * Result: a palloc'd struct containing statistical info for VACUUM displays.
636  */
639 {
640  Relation rel = info->index;
641  BlockNumber num_pages;
642 
643  /* If hashbulkdelete wasn't called, return NULL signifying no change */
644  /* Note: this covers the analyze_only case too */
645  if (stats == NULL)
646  return NULL;
647 
648  /* update statistics */
649  num_pages = RelationGetNumberOfBlocks(rel);
650  stats->num_pages = num_pages;
651 
652  return stats;
653 }
654 
655 /*
656  * Helper function to perform deletion of index entries from a bucket.
657  *
658  * This function expects that the caller has acquired a cleanup lock on the
659  * primary bucket page, and will return with a write lock again held on the
660  * primary bucket page. The lock won't necessarily be held continuously,
661  * though, because we'll release it when visiting overflow pages.
662  *
663  * There can't be any concurrent scans in progress when we first enter this
664  * function because of the cleanup lock we hold on the primary bucket page,
665  * but as soon as we release that lock, there might be. If those scans got
666  * ahead of our cleanup scan, they might see a tuple before we kill it and
667  * wake up only after VACUUM has completed and the TID has been recycled for
668  * an unrelated tuple. To avoid that calamity, we prevent scans from passing
669  * our cleanup scan by locking the next page in the bucket chain before
670  * releasing the lock on the previous page. (This type of lock chaining is not
671  * ideal, so we might want to look for a better solution at some point.)
672  *
673  * We need to retain a pin on the primary bucket to ensure that no concurrent
674  * split can start.
675  */
676 void
677 hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
678  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
679  uint32 maxbucket, uint32 highmask, uint32 lowmask,
680  double *tuples_removed, double *num_index_tuples,
681  bool split_cleanup,
682  IndexBulkDeleteCallback callback, void *callback_state)
683 {
685  Buffer buf;
687  bool bucket_dirty = false;
688 
689  blkno = bucket_blkno;
690  buf = bucket_buf;
691 
692  if (split_cleanup)
693  new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
694  lowmask, maxbucket);
695 
696  /* Scan each page in bucket */
697  for (;;)
698  {
699  HashPageOpaque opaque;
700  OffsetNumber offno;
701  OffsetNumber maxoffno;
702  Buffer next_buf;
703  Page page;
704  OffsetNumber deletable[MaxOffsetNumber];
705  int ndeletable = 0;
706  bool retain_pin = false;
707  bool clear_dead_marking = false;
708 
710 
711  page = BufferGetPage(buf);
712  opaque = (HashPageOpaque) PageGetSpecialPointer(page);
713 
714  /* Scan each tuple in page */
715  maxoffno = PageGetMaxOffsetNumber(page);
716  for (offno = FirstOffsetNumber;
717  offno <= maxoffno;
718  offno = OffsetNumberNext(offno))
719  {
720  ItemPointer htup;
721  IndexTuple itup;
722  Bucket bucket;
723  bool kill_tuple = false;
724 
725  itup = (IndexTuple) PageGetItem(page,
726  PageGetItemId(page, offno));
727  htup = &(itup->t_tid);
728 
729  /*
730  * To remove the dead tuples, we strictly want to rely on the results
731  * of the callback function. Refer to btvacuumpage for the detailed reason.
732  */
733  if (callback && callback(htup, callback_state))
734  {
735  kill_tuple = true;
736  if (tuples_removed)
737  *tuples_removed += 1;
738  }
739  else if (split_cleanup)
740  {
741  /* delete the tuples that are moved by split. */
743  maxbucket,
744  highmask,
745  lowmask);
746  /* mark the item for deletion */
747  if (bucket != cur_bucket)
748  {
749  /*
750  * We expect tuples to either belong to current bucket or
751  * new_bucket. This is ensured because we don't allow
752  * further splits from bucket that contains garbage. See
753  * comments in _hash_expandtable.
754  */
755  Assert(bucket == new_bucket);
756  kill_tuple = true;
757  }
758  }
759 
760  if (kill_tuple)
761  {
762  /* mark the item for deletion */
763  deletable[ndeletable++] = offno;
764  }
765  else
766  {
767  /* we're keeping it, so count it */
768  if (num_index_tuples)
769  *num_index_tuples += 1;
770  }
771  }
772 
773  /* retain the pin on primary bucket page till end of bucket scan */
774  if (blkno == bucket_blkno)
775  retain_pin = true;
776  else
777  retain_pin = false;
778 
779  blkno = opaque->hasho_nextblkno;
780 
781  /*
782  * Apply deletions, advance to next page and write page if needed.
783  */
784  if (ndeletable > 0)
785  {
786  /* No ereport(ERROR) until changes are logged */
788 
789  PageIndexMultiDelete(page, deletable, ndeletable);
790  bucket_dirty = true;
791 
792  /*
793  * Let us mark the page as clean if vacuum removes the DEAD tuples
794  * from an index page. We do this by clearing
795  * LH_PAGE_HAS_DEAD_TUPLES flag.
796  */
797  if (tuples_removed && *tuples_removed > 0 &&
798  H_HAS_DEAD_TUPLES(opaque))
799  {
801  clear_dead_marking = true;
802  }
803 
804  MarkBufferDirty(buf);
805 
806  /* XLOG stuff */
807  if (RelationNeedsWAL(rel))
808  {
809  xl_hash_delete xlrec;
810  XLogRecPtr recptr;
811 
812  xlrec.clear_dead_marking = clear_dead_marking;
813  xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;
814 
815  XLogBeginInsert();
816  XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
817 
818  /*
819  * bucket buffer needs to be registered to ensure that we can
820  * acquire a cleanup lock on it during replay.
821  */
822  if (!xlrec.is_primary_bucket_page)
824 
826  XLogRegisterBufData(1, (char *) deletable,
827  ndeletable * sizeof(OffsetNumber));
828 
829  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
830  PageSetLSN(BufferGetPage(buf), recptr);
831  }
832 
834  }
835 
836  /* bail out if there are no more pages to scan. */
837  if (!BlockNumberIsValid(blkno))
838  break;
839 
840  next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
842  bstrategy);
843 
844  /*
845  * release the lock on previous page after acquiring the lock on next
846  * page
847  */
848  if (retain_pin)
850  else
851  _hash_relbuf(rel, buf);
852 
853  buf = next_buf;
854  }
855 
856  /*
857  * lock the bucket page to clear the garbage flag and squeeze the bucket.
858  * if the current buffer is same as bucket buffer, then we already have
859  * lock on bucket page.
860  */
861  if (buf != bucket_buf)
862  {
863  _hash_relbuf(rel, buf);
864  LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
865  }
866 
867  /*
868  * Clear the garbage flag from bucket after deleting the tuples that are
869  * moved by split. We purposefully clear the flag before squeeze bucket,
870  * so that after restart, vacuum shouldn't again try to delete the moved
871  * by split tuples.
872  */
873  if (split_cleanup)
874  {
875  HashPageOpaque bucket_opaque;
876  Page page;
877 
878  page = BufferGetPage(bucket_buf);
879  bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
880 
881  /* No ereport(ERROR) until changes are logged */
883 
884  bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
885  MarkBufferDirty(bucket_buf);
886 
887  /* XLOG stuff */
888  if (RelationNeedsWAL(rel))
889  {
890  XLogRecPtr recptr;
891 
892  XLogBeginInsert();
893  XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
894 
895  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
896  PageSetLSN(page, recptr);
897  }
898 
900  }
901 
902  /*
903  * If we have deleted anything, try to compact free space. For squeezing
904  * the bucket, we must have a cleanup lock, else it can impact the
905  * ordering of tuples for a scan that has started before it.
906  */
907  if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
908  _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
909  bstrategy);
910  else
911  LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
912 }
ambeginscan_function ambeginscan
Definition: amapi.h:221
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:361
bytea * hashoptions(Datum reloptions, bool validate)
Definition: hashutil.c:290
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:351
HashScanOpaqueData * HashScanOpaque
Definition: hash.h:189
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:3659
ambulkdelete_function ambulkdelete
Definition: amapi.h:213
bool hashgettuple(IndexScanDesc scan, ScanDirection dir)
Definition: hash.c:273
bool amcanmulticol
Definition: amapi.h:183
void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
Definition: hashinsert.c:36
uint16 amsupport
Definition: amapi.h:173
double tuples_removed
Definition: genam.h:78
#define HTMaxStrategyNumber
Definition: stratnum.h:43
void _hash_dropscanbuf(Relation rel, HashScanOpaque so)
Definition: hashpage.c:287
#define HASHNProcs
Definition: hash.h:339
amgettuple_function amgettuple
Definition: amapi.h:223
#define RelationGetDescr(relation)
Definition: rel.h:449
Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask)
Definition: hashutil.c:125
bool amcanorderbyop
Definition: amapi.h:177
#define LH_META_PAGE
Definition: hash.h:56
IndexBulkDeleteResult * hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition: hash.c:452
amproperty_function amproperty
Definition: amapi.h:218
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1458
#define SizeOfHashDelete
Definition: hash_xlog.h:192
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define HashScanPosIsValid(scanpos)
Definition: hash.h:134
#define MaxOffsetNumber
Definition: off.h:28
Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, int access, int flags, BufferAccessStrategy bstrategy)
Definition: hashpage.c:237
void pgstat_progress_update_param(int index, int64 val)
Definition: pgstat.c:3218
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:376
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:642
ItemPointerData t_tid
Definition: itup.h:37
amparallelrescan_function amparallelrescan
Definition: amapi.h:232
#define Min(x, y)
Definition: c.h:905
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
BufferAccessStrategy strategy
Definition: genam.h:52
void hashcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition: selfuncs.c:6069
bool amstorage
Definition: amapi.h:191
double indtuples
Definition: hash.c:39
uint32 hashm_highmask
Definition: hash.h:252
#define InvalidBuffer
Definition: buf.h:25
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
Relation index
Definition: genam.h:46
#define XLOG_HASH_SPLIT_CLEANUP
Definition: hash_xlog.h:40
bool ampredlocks
Definition: amapi.h:195
bool clear_dead_marking
Definition: hash_xlog.h:186
void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, uint32 maxbucket, uint32 highmask, uint32 lowmask, double *tuples_removed, double *num_index_tuples, bool split_cleanup, IndexBulkDeleteCallback callback, void *callback_state)
Definition: hash.c:677
#define PROGRESS_CREATEIDX_TUPLES_TOTAL
Definition: progress.h:67
uint32 BlockNumber
Definition: block.h:31
IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys)
Definition: hash.c:357
aminsert_function aminsert
Definition: amapi.h:212
void _hash_dropbuf(Relation rel, Buffer buf)
Definition: hashpage.c:275
Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
Definition: hashpage.c:68
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:88
bool hashinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, IndexInfo *indexInfo)
Definition: hash.c:242
#define LH_BUCKET_NEEDS_SPLIT_CLEANUP
Definition: hash.h:59
Form_pg_class rd_rel
Definition: rel.h:83
Oid amkeytype
Definition: amapi.h:201
uint32 hashm_lowmask
Definition: hash.h:253
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
bool amoptionalkey
Definition: amapi.h:185
amvalidate_function amvalidate
Definition: amapi.h:220
void hashbuildempty(Relation index)
Definition: hash.c:192
int32 relpages
Definition: pg_class.h:60
#define BUCKET_TO_BLKNO(metap, B)
Definition: hash.h:38
Relation indexRelation
Definition: relscan.h:103
uint16 OffsetNumber
Definition: off.h:24
Definition: type.h:89
uint32 Bucket
Definition: hash.h:34
IndexUniqueCheck
Definition: genam.h:112
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:47
int * killedItems
Definition: hash.h:179
void pfree(void *pointer)
Definition: mcxt.c:1056
#define InvalidBucket
Definition: hash.h:36
amgetbitmap_function amgetbitmap
Definition: amapi.h:224
#define H_NEEDS_SPLIT_CLEANUP(opaque)
Definition: hash.h:87
BlockNumber hasho_prevblkno
Definition: hash.h:78
#define ERROR
Definition: elog.h:43
ambuild_function ambuild
Definition: amapi.h:210
uint32 _hash_get_indextuple_hashkey(IndexTuple itup)
Definition: hashutil.c:299
bool _hash_first(IndexScanDesc scan, ScanDirection dir)
Definition: hashsearch.c:292
amoptions_function amoptions
Definition: amapi.h:217
static double table_index_build_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1499
IndexBuildResult * hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: hash.c:106
BlockNumber num_pages
Definition: genam.h:74
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
Definition: test_ifaddrs.c:48
bool amcaninclude
Definition: amapi.h:199
int NLocBuffer
Definition: localbuf.c:41
bool _hash_convert_tuple(Relation index, Datum *user_values, bool *user_isnull, Datum *index_values, bool *index_isnull)
Definition: hashutil.c:326
BlockNumber blkno
Definition: ginvacuum.c:119
amcostestimate_function amcostestimate
Definition: amapi.h:216
bool amcanunique
Definition: amapi.h:181
bool is_primary_bucket_page
Definition: hash_xlog.h:188
bool _hash_next(IndexScanDesc scan, ScanDirection dir)
Definition: hashsearch.c:48
#define BufferIsInvalid(buffer)
Definition: buf.h:31
static char * buf
Definition: pg_test_fsync.c:67
amvacuumcleanup_function amvacuumcleanup
Definition: amapi.h:214
#define HASH_WRITE
Definition: hash.h:322
amendscan_function amendscan
Definition: amapi.h:225
#define memmove(d, s, c)
Definition: c.h:1261
#define HASH_NOLOCK
Definition: hash.h:323
bool amcanbackward
Definition: amapi.h:179
#define FirstOffsetNumber
Definition: off.h:27
IndexTupleData * IndexTuple
Definition: itup.h:53
#define REGBUF_STANDARD
Definition: xloginsert.h:35
Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, uint32 lowmask, uint32 maxbucket)
Definition: hashutil.c:502
void _h_spool(HSpool *hspool, ItemPointer self, Datum *values, bool *isnull)
Definition: hashsort.c:107
#define XLOG_HASH_DELETE
Definition: hash_xlog.h:39
ScanDirection
Definition: sdir.h:22
void _h_indexbuild(HSpool *hspool, Relation heapRel)
Definition: hashsort.c:118
#define RelationGetRelationName(relation)
Definition: rel.h:457
unsigned int uint32
Definition: c.h:359
void estimate_rel_size(Relation rel, int32 *attr_widths, BlockNumber *pages, double *tuples, double *allvisfrac)
Definition: plancat.c:946
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
amrescan_function amrescan
Definition: amapi.h:222
bool amcanparallel
Definition: amapi.h:197
Buffer hashso_bucket_buf
Definition: hash.h:161
int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition: hash.c:325
bool IsBufferCleanupOK(Buffer buffer)
Definition: bufmgr.c:3830
#define SizeOfHashUpdateMetaPage
Definition: hash_xlog.h:206
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
bool amsearchnulls
Definition: amapi.h:189
void hashendscan(IndexScanDesc scan)
Definition: hash.c:421
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
void _hash_checkpage(Relation rel, Buffer buf, int flags)
Definition: hashutil.c:225
HSpool * spool
Definition: hash.c:38
void * palloc0(Size size)
Definition: mcxt.c:980
#define HASH_METAPAGE
Definition: hash.h:195
uintptr_t Datum
Definition: postgres.h:367
#define HashScanPosInvalidate(scanpos)
Definition: hash.h:141
double hashm_ntuples
Definition: hash.h:245
bool hashso_buc_populated
Definition: hash.h:171
#define LH_OVERFLOW_PAGE
Definition: hash.h:53
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3602
bool amclusterable
Definition: amapi.h:193
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:198
bool amsearcharray
Definition: amapi.h:187
HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
Definition: hashpage.c:1499
void _hash_relbuf(Relation rel, Buffer buf)
Definition: hashpage.c:264
int maintenance_work_mem
Definition: globals.c:122
Relation heapRel
Definition: hash.c:40
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define LH_BUCKET_PAGE
Definition: hash.h:54
IndexBulkDeleteResult * hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition: hash.c:638
#define H_BUCKET_BEING_SPLIT(opaque)
Definition: hash.h:88
Datum hashhandler(PG_FUNCTION_ARGS)
Definition: hash.c:56
#define makeNode(_type_)
Definition: nodes.h:573
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:733
Definition: regguts.h:298
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:835
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define REGBUF_NO_IMAGE
Definition: xloginsert.h:32
#define InvalidBlockNumber
Definition: block.h:33
HashPageOpaqueData * HashPageOpaque
Definition: hash.h:85
void _hash_kill_items(IndexScanDesc scan)
Definition: hashutil.c:544
void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Buffer bucket_buf, BufferAccessStrategy bstrategy)
Definition: hashovfl.c:805
ammarkpos_function ammarkpos
Definition: amapi.h:226
bool amcanorder
Definition: amapi.h:175
ambuildphasename_function ambuildphasename
Definition: amapi.h:219
#define RelationNeedsWAL(relation)
Definition: rel.h:525
amestimateparallelscan_function amestimateparallelscan
Definition: amapi.h:230
uint32 hashm_maxbucket
Definition: hash.h:251
struct ScanKeyData * keyData
Definition: relscan.h:107
HashScanPosData currPos
Definition: hash.h:186
uint16 hasho_flag
Definition: hash.h:81
bool hashso_buc_split
Definition: hash.h:177
uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
Definition: hashpage.c:325
bool hashvalidate(Oid opclassoid)
Definition: hashvalidate.c:44
uint16 amstrategies
Definition: amapi.h:171
static Datum values[MAXATTR]
Definition: bootstrap.c:167
void _h_spooldestroy(HSpool *hspool)
Definition: hashsort.c:97
#define MaxIndexTuplesPerPage
Definition: itup.h:145
#define HashPageGetMeta(page)
Definition: hash.h:305
void * palloc(Size size)
Definition: mcxt.c:949
#define elog(elevel,...)
Definition: elog.h:228
ambuildempty_function ambuildempty
Definition: amapi.h:211
bool kill_prior_tuple
Definition: relscan.h:113
Buffer hashso_split_bucket_buf
Definition: hash.h:168
int NBuffers
Definition: globals.c:131
BlockNumber hasho_nextblkno
Definition: hash.h:79
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:80
static void hashbuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition: hash.c:201
#define PG_FUNCTION_ARGS
Definition: fmgr.h:188
int itemIndex
Definition: hash.h:122
HSpool * _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
Definition: hashsort.c:58
void vacuum_delay_point(void)
Definition: vacuum.c:1942
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:123
void XLogBeginInsert(void)
Definition: xloginsert.c:120
#define XLOG_HASH_UPDATE_META_PAGE
Definition: hash_xlog.h:43
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
double num_index_tuples
Definition: genam.h:77
int Buffer
Definition: buf.h:23
amcanreturn_function amcanreturn
Definition: amapi.h:215
#define H_HAS_DEAD_TUPLES(opaque)
Definition: hash.h:90
bool estimated_count
Definition: genam.h:76
float4 reltuples
Definition: pg_class.h:63
void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition: hash.c:387
#define LH_PAGE_HAS_DEAD_TUPLES
Definition: hash.h:60
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
double index_tuples
Definition: genam.h:33
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition: genam.h:84
aminitparallelscan_function aminitparallelscan
Definition: amapi.h:231
HashScanPosItem items[MaxIndexTuplesPerPage]
Definition: hash.h:124
double heap_tuples
Definition: genam.h:32
amrestrpos_function amrestrpos
Definition: amapi.h:227