PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
hash.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * hash.c
4  * Implementation of Margo Seltzer's Hashing package for postgres.
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/hash/hash.c
12  *
13  * NOTES
14  * This file contains only the public interface routines.
15  *
16  *-------------------------------------------------------------------------
17  */
18 
19 #include "postgres.h"
20 
21 #include "access/hash.h"
22 #include "access/hash_xlog.h"
23 #include "access/relscan.h"
24 #include "catalog/index.h"
25 #include "commands/vacuum.h"
26 #include "miscadmin.h"
27 #include "optimizer/plancat.h"
28 #include "utils/builtins.h"
29 #include "utils/index_selfuncs.h"
30 #include "utils/rel.h"
31 
32 
33 /* Working state for hashbuild and its callback */
34 typedef struct
35 {
36  HSpool *spool; /* NULL if not using spooling */
37  double indtuples; /* # tuples accepted into index */
39 
40 static void hashbuildCallback(Relation index,
41  HeapTuple htup,
42  Datum *values,
43  bool *isnull,
44  bool tupleIsAlive,
45  void *state);
46 
47 
48 /*
49  * Hash handler function: return IndexAmRoutine with access method parameters
50  * and callbacks.
51  */
52 Datum
54 {
56 
57  amroutine->amstrategies = HTMaxStrategyNumber;
58  amroutine->amsupport = HASHNProcs;
59  amroutine->amcanorder = false;
60  amroutine->amcanorderbyop = false;
61  amroutine->amcanbackward = true;
62  amroutine->amcanunique = false;
63  amroutine->amcanmulticol = false;
64  amroutine->amoptionalkey = false;
65  amroutine->amsearcharray = false;
66  amroutine->amsearchnulls = false;
67  amroutine->amstorage = false;
68  amroutine->amclusterable = false;
69  amroutine->ampredlocks = false;
70  amroutine->amcanparallel = false;
71  amroutine->amkeytype = INT4OID;
72 
73  amroutine->ambuild = hashbuild;
74  amroutine->ambuildempty = hashbuildempty;
75  amroutine->aminsert = hashinsert;
76  amroutine->ambulkdelete = hashbulkdelete;
78  amroutine->amcanreturn = NULL;
79  amroutine->amcostestimate = hashcostestimate;
80  amroutine->amoptions = hashoptions;
81  amroutine->amproperty = NULL;
82  amroutine->amvalidate = hashvalidate;
83  amroutine->ambeginscan = hashbeginscan;
84  amroutine->amrescan = hashrescan;
85  amroutine->amgettuple = hashgettuple;
86  amroutine->amgetbitmap = hashgetbitmap;
87  amroutine->amendscan = hashendscan;
88  amroutine->ammarkpos = NULL;
89  amroutine->amrestrpos = NULL;
90  amroutine->amestimateparallelscan = NULL;
91  amroutine->aminitparallelscan = NULL;
92  amroutine->amparallelrescan = NULL;
93 
94  PG_RETURN_POINTER(amroutine);
95 }
96 
97 /*
98  * hashbuild() -- build a new hash index.
99  */
102 {
103  IndexBuildResult *result;
104  BlockNumber relpages;
105  double reltuples;
106  double allvisfrac;
107  uint32 num_buckets;
108  long sort_threshold;
109  HashBuildState buildstate;
110 
111  /*
112  * We expect to be called exactly once for any index relation. If that's
113  * not the case, big trouble's what we have.
114  */
115  if (RelationGetNumberOfBlocks(index) != 0)
116  elog(ERROR, "index \"%s\" already contains data",
117  RelationGetRelationName(index));
118 
119  /* Estimate the number of rows currently present in the table */
120  estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
121 
122  /* Initialize the hash index metadata page and initial buckets */
123  num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM);
124 
125  /*
126  * If we just insert the tuples into the index in scan order, then
127  * (assuming their hash codes are pretty random) there will be no locality
128  * of access to the index, and if the index is bigger than available RAM
129  * then we'll thrash horribly. To prevent that scenario, we can sort the
130  * tuples by (expected) bucket number. However, such a sort is useless
131  * overhead when the index does fit in RAM. We choose to sort if the
132  * initial index size exceeds maintenance_work_mem, or the number of
133  * buffers usable for the index, whichever is less. (Limiting by the
134  * number of buffers should reduce thrashing between PG buffers and kernel
135  * buffers, which seems useful even if no physical I/O results. Limiting
136  * by maintenance_work_mem is useful to allow easy testing of the sort
137  * code path, and may be useful to DBAs as an additional control knob.)
138  *
139  * NOTE: this test will need adjustment if a bucket is ever different from
140  * one page. Also, "initial index size" accounting does not include the
141  * metapage, nor the first bitmap page.
142  */
143  sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
144  if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
145  sort_threshold = Min(sort_threshold, NBuffers);
146  else
147  sort_threshold = Min(sort_threshold, NLocBuffer);
148 
149  if (num_buckets >= (uint32) sort_threshold)
150  buildstate.spool = _h_spoolinit(heap, index, num_buckets);
151  else
152  buildstate.spool = NULL;
153 
154  /* prepare to build the index */
155  buildstate.indtuples = 0;
156 
157  /* do the heap scan */
158  reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
159  hashbuildCallback, (void *) &buildstate);
160 
161  if (buildstate.spool)
162  {
163  /* sort the tuples and insert them into the index */
164  _h_indexbuild(buildstate.spool);
165  _h_spooldestroy(buildstate.spool);
166  }
167 
168  /*
169  * Return statistics
170  */
171  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
172 
173  result->heap_tuples = reltuples;
174  result->index_tuples = buildstate.indtuples;
175 
176  return result;
177 }
178 
179 /*
180  * hashbuildempty() -- build an empty hash index in the initialization fork
181  */
182 void
184 {
185  _hash_metapinit(index, 0, INIT_FORKNUM);
186 }
187 
188 /*
189  * Per-tuple callback from IndexBuildHeapScan
190  */
191 static void
193  HeapTuple htup,
194  Datum *values,
195  bool *isnull,
196  bool tupleIsAlive,
197  void *state)
198 {
199  HashBuildState *buildstate = (HashBuildState *) state;
200  Datum index_values[1];
201  bool index_isnull[1];
202  IndexTuple itup;
203 
204  /* convert data to a hash key; on failure, do not insert anything */
205  if (!_hash_convert_tuple(index,
206  values, isnull,
207  index_values, index_isnull))
208  return;
209 
210  /* Either spool the tuple for sorting, or just put it into the index */
211  if (buildstate->spool)
212  _h_spool(buildstate->spool, &htup->t_self,
213  index_values, index_isnull);
214  else
215  {
216  /* form an index tuple and point it at the heap tuple */
217  itup = index_form_tuple(RelationGetDescr(index),
218  index_values, index_isnull);
219  itup->t_tid = htup->t_self;
220  _hash_doinsert(index, itup);
221  pfree(itup);
222  }
223 
224  buildstate->indtuples += 1;
225 }
226 
227 /*
228  * hashinsert() -- insert an index tuple into a hash table.
229  *
230  * Hash on the heap tuple's key, form an index tuple with hash code.
231  * Find the appropriate location for the new tuple, and put it there.
     *
     * rel is the index relation handed to _hash_convert_tuple() and
     * _hash_doinsert(); values[]/isnull[] carry the key to be converted;
     * ht_ctid is the heap TID that the new index entry is pointed at.
     * heapRel, checkUnique and indexInfo are not referenced in this function.
     *
     * Returns false on every path, whether or not an entry was inserted.
232  */
233 bool
234 hashinsert(Relation rel, Datum *values, bool *isnull,
235  ItemPointer ht_ctid, Relation heapRel,
236  IndexUniqueCheck checkUnique,
237  IndexInfo *indexInfo)
238 {
239  Datum index_values[1]; /* one-element output of the key conversion */
240  bool index_isnull[1]; /* matching null flag for index_values[0] */
241  IndexTuple itup;
242 
243  /* convert data to a hash key; on failure, do not insert anything */
244  if (!_hash_convert_tuple(rel,
245  values, isnull,
246  index_values, index_isnull))
247  return false; /* conversion failed: leave the index untouched */
248 
249  /* form an index tuple and point it at the heap tuple */
250  itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
251  itup->t_tid = *ht_ctid;
252 
253  _hash_doinsert(rel, itup);
254 
     /* release the tuple formed above now that _hash_doinsert() has returned */
255  pfree(itup);
256 
257  return false; /* same result as the no-insert path above */
258 }
259 
260 
261 /*
262  * hashgettuple() -- Get the next tuple in the scan.
263  */
264 bool
266 {
267  HashScanOpaque so = (HashScanOpaque) scan->opaque;
268  Relation rel = scan->indexRelation;
269  Buffer buf;
270  Page page;
271  OffsetNumber offnum;
272  ItemPointer current;
273  bool res;
274 
275  /* Hash indexes are always lossy since we store only the hash code */
276  scan->xs_recheck = true;
277 
278  /*
279  * We hold pin but not lock on current buffer while outside the hash AM.
280  * Reacquire the read lock here.
281  */
282  if (BufferIsValid(so->hashso_curbuf))
284 
285  /*
286  * If we've already initialized this scan, we can just advance it in the
287  * appropriate direction. If we haven't done so yet, we call a routine to
288  * get the first item in the scan.
289  */
290  current = &(so->hashso_curpos);
291  if (ItemPointerIsValid(current))
292  {
293  /*
294  * An insertion into the current index page could have happened while
295  * we didn't have read lock on it. Re-find our position by looking
296  * for the TID we previously returned. (Because we hold a pin on the
297  * primary bucket page, no deletions or splits could have occurred;
298  * therefore we can expect that the TID still exists in the current
299  * index page, at an offset >= where we were.)
300  */
301  OffsetNumber maxoffnum;
302 
303  buf = so->hashso_curbuf;
305  page = BufferGetPage(buf);
306  maxoffnum = PageGetMaxOffsetNumber(page);
307  for (offnum = ItemPointerGetOffsetNumber(current);
308  offnum <= maxoffnum;
309  offnum = OffsetNumberNext(offnum))
310  {
311  IndexTuple itup;
312 
313  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
314  if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid)))
315  break;
316  }
317  if (offnum > maxoffnum)
318  elog(ERROR, "failed to re-find scan position within index \"%s\"",
320  ItemPointerSetOffsetNumber(current, offnum);
321 
322  /*
323  * Check to see if we should kill the previously-fetched tuple.
324  */
325  if (scan->kill_prior_tuple)
326  {
327  /*
328  * Yes, so mark it by setting the LP_DEAD state in the item flags.
329  */
330  ItemIdMarkDead(PageGetItemId(page, offnum));
331 
332  /*
333  * Since this can be redone later if needed, mark as a hint.
334  */
335  MarkBufferDirtyHint(buf, true);
336  }
337 
338  /*
339  * Now continue the scan.
340  */
341  res = _hash_next(scan, dir);
342  }
343  else
344  res = _hash_first(scan, dir);
345 
346  /*
347  * Skip killed tuples if asked to.
348  */
349  if (scan->ignore_killed_tuples)
350  {
351  while (res)
352  {
353  offnum = ItemPointerGetOffsetNumber(current);
354  page = BufferGetPage(so->hashso_curbuf);
355  if (!ItemIdIsDead(PageGetItemId(page, offnum)))
356  break;
357  res = _hash_next(scan, dir);
358  }
359  }
360 
361  /* Release read lock on current buffer, but keep it pinned */
362  if (BufferIsValid(so->hashso_curbuf))
364 
365  /* Return current heap TID on success */
366  scan->xs_ctup.t_self = so->hashso_heappos;
367 
368  return res;
369 }
370 
371 
372 /*
373  * hashgetbitmap() -- get all tuples at once
374  */
375 int64
377 {
378  HashScanOpaque so = (HashScanOpaque) scan->opaque;
379  bool res;
380  int64 ntids = 0;
381 
382  res = _hash_first(scan, ForwardScanDirection);
383 
384  while (res)
385  {
386  bool add_tuple;
387 
388  /*
389  * Skip killed tuples if asked to.
390  */
391  if (scan->ignore_killed_tuples)
392  {
393  Page page;
394  OffsetNumber offnum;
395 
396  offnum = ItemPointerGetOffsetNumber(&(so->hashso_curpos));
397  page = BufferGetPage(so->hashso_curbuf);
398  add_tuple = !ItemIdIsDead(PageGetItemId(page, offnum));
399  }
400  else
401  add_tuple = true;
402 
403  /* Save tuple ID, and continue scanning */
404  if (add_tuple)
405  {
406  /* Note we mark the tuple ID as requiring recheck */
407  tbm_add_tuples(tbm, &(so->hashso_heappos), 1, true);
408  ntids++;
409  }
410 
411  res = _hash_next(scan, ForwardScanDirection);
412  }
413 
414  return ntids;
415 }
416 
417 
418 /*
419  * hashbeginscan() -- start a scan on a hash index
420  */
422 hashbeginscan(Relation rel, int nkeys, int norderbys)
423 {
424  IndexScanDesc scan;
425  HashScanOpaque so;
426 
427  /* no order by operators allowed */
428  Assert(norderbys == 0);
429 
430  scan = RelationGetIndexScan(rel, nkeys, norderbys);
431 
432  so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
436  /* set position invalid (this will cause _hash_first call) */
439 
440  so->hashso_buc_populated = false;
441  so->hashso_buc_split = false;
442 
443  scan->opaque = so;
444 
445  return scan;
446 }
447 
448 /*
449  * hashrescan() -- rescan an index relation
450  */
451 void
452 hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
453  ScanKey orderbys, int norderbys)
454 {
455  HashScanOpaque so = (HashScanOpaque) scan->opaque;
456  Relation rel = scan->indexRelation;
457 
458  _hash_dropscanbuf(rel, so);
459 
460  /* set position invalid (this will cause _hash_first call) */
463 
464  /* Update scan key, if a new one is given */
465  if (scankey && scan->numberOfKeys > 0)
466  {
467  memmove(scan->keyData,
468  scankey,
469  scan->numberOfKeys * sizeof(ScanKeyData));
470  }
471 
472  so->hashso_buc_populated = false;
473  so->hashso_buc_split = false;
474 }
475 
476 /*
477  * hashendscan() -- close down a scan
478  */
479 void
481 {
482  HashScanOpaque so = (HashScanOpaque) scan->opaque;
483  Relation rel = scan->indexRelation;
484 
485  _hash_dropscanbuf(rel, so);
486 
487  pfree(so);
488  scan->opaque = NULL;
489 }
490 
491 /*
492  * Bulk deletion of all index entries pointing to a set of heap tuples.
493  * The set of target tuples is specified via a callback routine that tells
494  * whether any given heap tuple (identified by ItemPointer) is being deleted.
495  *
496  * This function also deletes the tuples that are moved by split to another
497  * bucket.
498  *
499  * Result: a palloc'd struct containing statistical info for VACUUM displays.
500  */
503  IndexBulkDeleteCallback callback, void *callback_state)
504 {
505  Relation rel = info->index;
506  double tuples_removed;
507  double num_index_tuples;
508  double orig_ntuples;
509  Bucket orig_maxbucket;
510  Bucket cur_maxbucket;
511  Bucket cur_bucket;
512  Buffer metabuf = InvalidBuffer;
513  HashMetaPage metap;
514  HashMetaPage cachedmetap;
515 
516  tuples_removed = 0;
517  num_index_tuples = 0;
518 
519  /*
520  * We need a copy of the metapage so that we can use its hashm_spares[]
521  * values to compute bucket page addresses, but a cached copy should be
522  * good enough. (If not, we'll detect that further down and refresh the
523  * cache as necessary.)
524  */
525  cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
526  Assert(cachedmetap != NULL);
527 
528  orig_maxbucket = cachedmetap->hashm_maxbucket;
529  orig_ntuples = cachedmetap->hashm_ntuples;
530 
531  /* Scan the buckets that we know exist */
532  cur_bucket = 0;
533  cur_maxbucket = orig_maxbucket;
534 
535 loop_top:
536  while (cur_bucket <= cur_maxbucket)
537  {
538  BlockNumber bucket_blkno;
540  Buffer bucket_buf;
541  Buffer buf;
542  HashPageOpaque bucket_opaque;
543  Page page;
544  bool split_cleanup = false;
545 
546  /* Get address of bucket's start page */
547  bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
548 
549  blkno = bucket_blkno;
550 
551  /*
552  * We need to acquire a cleanup lock on the primary bucket page to wait
553  * out concurrent scans before deleting the dead tuples.
554  */
555  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
557  _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
558 
559  page = BufferGetPage(buf);
560  bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
561 
562  /*
563  * If the bucket contains tuples that are moved by split, then we need
564  * to delete such tuples. We can't delete such tuples if the split
565  * operation on bucket is not finished as those are needed by scans.
566  */
567  if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
568  H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
569  {
570  split_cleanup = true;
571 
572  /*
573  * This bucket might have been split since we last held a lock on
574  * the metapage. If so, hashm_maxbucket, hashm_highmask and
575  * hashm_lowmask might be old enough to cause us to fail to remove
576  * tuples left behind by the most recent split. To prevent that,
577  * now that the primary page of the target bucket has been locked
578  * (and thus can't be further split), check whether we need to
579  * update our cached metapage data.
580  *
581  * NB: The check for InvalidBlockNumber is only needed for
582  * on-disk compatibility with indexes created before we started
583  * storing hashm_maxbucket in the primary page's hasho_prevblkno.
584  */
585  if (bucket_opaque->hasho_prevblkno != InvalidBlockNumber &&
586  bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
587  {
588  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
589  Assert(cachedmetap != NULL);
590  }
591  }
592 
593  bucket_buf = buf;
594 
595  hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
596  cachedmetap->hashm_maxbucket,
597  cachedmetap->hashm_highmask,
598  cachedmetap->hashm_lowmask, &tuples_removed,
599  &num_index_tuples, split_cleanup,
600  callback, callback_state);
601 
602  _hash_dropbuf(rel, bucket_buf);
603 
604  /* Advance to next bucket */
605  cur_bucket++;
606  }
607 
608  if (BufferIsInvalid(metabuf))
610 
611  /* Write-lock metapage and check for split since we started */
613  metap = HashPageGetMeta(BufferGetPage(metabuf));
614 
615  if (cur_maxbucket != metap->hashm_maxbucket)
616  {
617  /* There's been a split, so process the additional bucket(s) */
618  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
619  cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
620  Assert(cachedmetap != NULL);
621  cur_maxbucket = cachedmetap->hashm_maxbucket;
622  goto loop_top;
623  }
624 
625  /* Okay, we're really done. Update tuple count in metapage. */
626 
627  if (orig_maxbucket == metap->hashm_maxbucket &&
628  orig_ntuples == metap->hashm_ntuples)
629  {
630  /*
631  * No one has split or inserted anything since start of scan, so
632  * believe our count as gospel.
633  */
634  metap->hashm_ntuples = num_index_tuples;
635  }
636  else
637  {
638  /*
639  * Otherwise, our count is untrustworthy since we may have
640  * double-scanned tuples in split buckets. Proceed by dead-reckoning.
641  * (Note: we still return estimated_count = false, because using this
642  * count is better than not updating reltuples at all.)
643  */
644  if (metap->hashm_ntuples > tuples_removed)
645  metap->hashm_ntuples -= tuples_removed;
646  else
647  metap->hashm_ntuples = 0;
648  num_index_tuples = metap->hashm_ntuples;
649  }
650 
651  MarkBufferDirty(metabuf);
652  _hash_relbuf(rel, metabuf);
653 
654  /* return statistics */
655  if (stats == NULL)
657  stats->estimated_count = false;
658  stats->num_index_tuples = num_index_tuples;
659  stats->tuples_removed += tuples_removed;
660  /* hashvacuumcleanup will fill in num_pages */
661 
662  return stats;
663 }
664 
665 /*
666  * Post-VACUUM cleanup.
667  *
668  * Result: a palloc'd struct containing statistical info for VACUUM displays.
669  */
672 {
673  Relation rel = info->index;
674  BlockNumber num_pages;
675 
676  /* If hashbulkdelete wasn't called, return NULL signifying no change */
677  /* Note: this covers the analyze_only case too */
678  if (stats == NULL)
679  return NULL;
680 
681  /* update statistics */
682  num_pages = RelationGetNumberOfBlocks(rel);
683  stats->num_pages = num_pages;
684 
685  return stats;
686 }
687 
688 /*
689  * Helper function to perform deletion of index entries from a bucket.
690  *
691  * This function expects that the caller has acquired a cleanup lock on the
692  * primary bucket page, and will return with a write lock again held on the
693  * primary bucket page. The lock won't necessarily be held continuously,
694  * though, because we'll release it when visiting overflow pages.
695  *
696  * It would be very bad if this function cleaned a page while some other
697  * backend was in the midst of scanning it, because hashgettuple assumes
698  * that the next valid TID will be greater than or equal to the current
699  * valid TID. There can't be any concurrent scans in progress when we first
700  * enter this function because of the cleanup lock we hold on the primary
701  * bucket page, but as soon as we release that lock, there might be. We
702  * handle that by conspiring to prevent those scans from passing our cleanup
703  * scan. To do that, we lock the next page in the bucket chain before
704  * releasing the lock on the previous page. (This type of lock chaining is
705  * not ideal, so we might want to look for a better solution at some point.)
706  *
707  * We need to retain a pin on the primary bucket to ensure that no concurrent
708  * split can start.
709  */
710 void
711 hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
712  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
713  uint32 maxbucket, uint32 highmask, uint32 lowmask,
714  double *tuples_removed, double *num_index_tuples,
715  bool split_cleanup,
716  IndexBulkDeleteCallback callback, void *callback_state)
717 {
719  Buffer buf;
721  bool bucket_dirty = false;
722 
723  blkno = bucket_blkno;
724  buf = bucket_buf;
725 
726  if (split_cleanup)
727  new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
728  lowmask, maxbucket);
729 
730  /* Scan each page in bucket */
731  for (;;)
732  {
733  HashPageOpaque opaque;
734  OffsetNumber offno;
735  OffsetNumber maxoffno;
736  Buffer next_buf;
737  Page page;
738  OffsetNumber deletable[MaxOffsetNumber];
739  int ndeletable = 0;
740  bool retain_pin = false;
741 
743 
744  page = BufferGetPage(buf);
745  opaque = (HashPageOpaque) PageGetSpecialPointer(page);
746 
747  /* Scan each tuple in page */
748  maxoffno = PageGetMaxOffsetNumber(page);
749  for (offno = FirstOffsetNumber;
750  offno <= maxoffno;
751  offno = OffsetNumberNext(offno))
752  {
753  ItemPointer htup;
754  IndexTuple itup;
755  Bucket bucket;
756  bool kill_tuple = false;
757 
758  itup = (IndexTuple) PageGetItem(page,
759  PageGetItemId(page, offno));
760  htup = &(itup->t_tid);
761 
762  /*
763  * To remove the dead tuples, we strictly want to rely on results
764  * of callback function. Refer to btvacuumpage for the detailed reason.
765  */
766  if (callback && callback(htup, callback_state))
767  {
768  kill_tuple = true;
769  if (tuples_removed)
770  *tuples_removed += 1;
771  }
772  else if (split_cleanup)
773  {
774  /* delete the tuples that are moved by split. */
776  maxbucket,
777  highmask,
778  lowmask);
779  /* mark the item for deletion */
780  if (bucket != cur_bucket)
781  {
782  /*
783  * We expect tuples to either belong to current bucket or
784  * new_bucket. This is ensured because we don't allow
785  * further splits from bucket that contains garbage. See
786  * comments in _hash_expandtable.
787  */
788  Assert(bucket == new_bucket);
789  kill_tuple = true;
790  }
791  }
792 
793  if (kill_tuple)
794  {
795  /* mark the item for deletion */
796  deletable[ndeletable++] = offno;
797  }
798  else
799  {
800  /* we're keeping it, so count it */
801  if (num_index_tuples)
802  *num_index_tuples += 1;
803  }
804  }
805 
806  /* retain the pin on primary bucket page till end of bucket scan */
807  if (blkno == bucket_blkno)
808  retain_pin = true;
809  else
810  retain_pin = false;
811 
812  blkno = opaque->hasho_nextblkno;
813 
814  /*
815  * Apply deletions, advance to next page and write page if needed.
816  */
817  if (ndeletable > 0)
818  {
819  PageIndexMultiDelete(page, deletable, ndeletable);
820  bucket_dirty = true;
821  MarkBufferDirty(buf);
822  }
823 
824  /* bail out if there are no more pages to scan. */
825  if (!BlockNumberIsValid(blkno))
826  break;
827 
828  next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
830  bstrategy);
831 
832  /*
833  * release the lock on previous page after acquiring the lock on next
834  * page
835  */
836  if (retain_pin)
838  else
839  _hash_relbuf(rel, buf);
840 
841  buf = next_buf;
842  }
843 
844  /*
845  * lock the bucket page to clear the garbage flag and squeeze the bucket.
846  * if the current buffer is same as bucket buffer, then we already have
847  * lock on bucket page.
848  */
849  if (buf != bucket_buf)
850  {
851  _hash_relbuf(rel, buf);
852  LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
853  }
854 
855  /*
856  * Clear the garbage flag from bucket after deleting the tuples that are
857  * moved by split. We purposefully clear the flag before squeeze bucket,
858  * so that after restart, vacuum shouldn't again try to delete the moved
859  * by split tuples.
860  */
861  if (split_cleanup)
862  {
863  HashPageOpaque bucket_opaque;
864  Page page;
865 
866  page = BufferGetPage(bucket_buf);
867  bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
868 
869  bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
870  MarkBufferDirty(bucket_buf);
871  }
872 
873  /*
874  * If we have deleted anything, try to compact free space. For squeezing
875  * the bucket, we must have a cleanup lock, else it can impact the
876  * ordering of tuples for a scan that has started before it.
877  */
878  if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
879  _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
880  bstrategy);
881  else
882  LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
883 }
884 
885 void
887 {
888  elog(PANIC, "hash_redo: unimplemented");
889 }
#define ItemPointerIsValid(pointer)
Definition: itemptr.h:59
ambeginscan_function ambeginscan
Definition: amapi.h:208
static void hashbuildCallback(Relation index, HeapTuple htup, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition: hash.c:192
bytea * hashoptions(Datum reloptions, bool validate)
Definition: hashutil.c:223
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:305
HashScanOpaqueData * HashScanOpaque
Definition: hash.h:140
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:3586
ambulkdelete_function ambulkdelete
Definition: amapi.h:201
bool hashgettuple(IndexScanDesc scan, ScanDirection dir)
Definition: hash.c:265
bool amcanmulticol
Definition: amapi.h:179
uint16 amsupport
Definition: amapi.h:169
double tuples_removed
Definition: genam.h:77
void _hash_dropscanbuf(Relation rel, HashScanOpaque so)
Definition: hashpage.c:268
#define HASHNProcs
Definition: hash.h:270
amgettuple_function amgettuple
Definition: amapi.h:210
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:3362
#define RelationGetDescr(relation)
Definition: rel.h:425
#define ItemIdMarkDead(itemId)
Definition: itemid.h:178
Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask)
Definition: hashutil.c:124
bool amcanorderbyop
Definition: amapi.h:173
#define LH_META_PAGE
Definition: hash.h:56
IndexBulkDeleteResult * hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition: hash.c:502
amproperty_function amproperty
Definition: amapi.h:206
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1445
#define MaxOffsetNumber
Definition: off.h:28
Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, int access, int flags, BufferAccessStrategy bstrategy)
Definition: hashpage.c:218
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:290
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:640
ItemPointerData t_tid
Definition: itup.h:37
amparallelrescan_function amparallelrescan
Definition: amapi.h:219
#define Min(x, y)
Definition: c.h:802
BufferAccessStrategy strategy
Definition: genam.h:51
void hashcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition: selfuncs.c:6750
bool amstorage
Definition: amapi.h:187
double indtuples
Definition: hash.c:37
uint32 hashm_highmask
Definition: hash.h:185
#define INT4OID
Definition: pg_type.h:316
#define InvalidBuffer
Definition: buf.h:25
Relation index
Definition: genam.h:46
bool ampredlocks
Definition: amapi.h:191
void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, uint32 maxbucket, uint32 highmask, uint32 lowmask, double *tuples_removed, double *num_index_tuples, bool split_cleanup, IndexBulkDeleteCallback callback, void *callback_state)
Definition: hash.c:711
uint32 BlockNumber
Definition: block.h:31
IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys)
Definition: hash.c:422
#define HTMaxStrategyNumber
Definition: hash.h:262
aminsert_function aminsert
Definition: amapi.h:200
void _hash_dropbuf(Relation rel, Buffer buf)
Definition: hashpage.c:256
Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
Definition: hashpage.c:79
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:89
bool hashinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, IndexInfo *indexInfo)
Definition: hash.c:234
#define LH_BUCKET_NEEDS_SPLIT_CLEANUP
Definition: hash.h:59
Form_pg_class rd_rel
Definition: rel.h:113
#define ItemIdIsDead(itemId)
Definition: itemid.h:112
Oid amkeytype
Definition: amapi.h:195
#define PANIC
Definition: elog.h:53
uint32 hashm_lowmask
Definition: hash.h:186
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:354
void hash_redo(XLogReaderState *record)
Definition: hash.c:886
bool amoptionalkey
Definition: amapi.h:181
amvalidate_function amvalidate
Definition: amapi.h:207
void hashbuildempty(Relation index)
Definition: hash.c:183
bool ignore_killed_tuples
Definition: relscan.h:100
#define BUCKET_TO_BLKNO(metap, B)
Definition: hash.h:38
Relation indexRelation
Definition: relscan.h:89
uint16 OffsetNumber
Definition: off.h:24
Definition: type.h:90
uint32 Bucket
Definition: hash.h:34
IndexUniqueCheck
Definition: genam.h:111
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:37
void pfree(void *pointer)
Definition: mcxt.c:992
#define InvalidBucket
Definition: hash.h:36
amgetbitmap_function amgetbitmap
Definition: amapi.h:211
#define H_NEEDS_SPLIT_CLEANUP(opaque)
Definition: hash.h:86
BlockNumber hasho_prevblkno
Definition: hash.h:77
#define ERROR
Definition: elog.h:43
ambuild_function ambuild
Definition: amapi.h:198
uint32 _hash_get_indextuple_hashkey(IndexTuple itup)
Definition: hashutil.c:232
bool _hash_first(IndexScanDesc scan, ScanDirection dir)
Definition: hashsearch.c:220
amoptions_function amoptions
Definition: amapi.h:205
IndexBuildResult * hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: hash.c:101
BlockNumber num_pages
Definition: genam.h:73
ItemPointerData t_self
Definition: htup.h:65
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
Definition: test_ifaddrs.c:49
int NLocBuffer
Definition: localbuf.c:41
bool _hash_convert_tuple(Relation index, Datum *user_values, bool *user_isnull, Datum *index_values, bool *index_isnull)
Definition: hashutil.c:259
amcostestimate_function amcostestimate
Definition: amapi.h:204
bool amcanunique
Definition: amapi.h:177
bool _hash_next(IndexScanDesc scan, ScanDirection dir)
Definition: hashsearch.c:34
#define BufferIsInvalid(buffer)
Definition: buf.h:31
static char * buf
Definition: pg_test_fsync.c:65
amvacuumcleanup_function amvacuumcleanup
Definition: amapi.h:202
#define HASH_WRITE
Definition: hash.h:255
amendscan_function amendscan
Definition: amapi.h:212
#define memmove(d, s, c)
Definition: c.h:1058
#define HASH_NOLOCK
Definition: hash.h:256
bool amcanbackward
Definition: amapi.h:175
#define FirstOffsetNumber
Definition: off.h:27
IndexTupleData * IndexTuple
Definition: itup.h:53
Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, uint32 lowmask, uint32 maxbucket)
Definition: hashutil.c:435
void _h_spool(HSpool *hspool, ItemPointer self, Datum *values, bool *isnull)
Definition: hashsort.c:93
ScanDirection
Definition: sdir.h:22
#define RelationGetRelationName(relation)
Definition: rel.h:433
unsigned int uint32
Definition: c.h:265
void estimate_rel_size(Relation rel, int32 *attr_widths, BlockNumber *pages, double *tuples, double *allvisfrac)
Definition: plancat.c:901
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
amrescan_function amrescan
Definition: amapi.h:209
bool amcanparallel
Definition: amapi.h:193
Buffer hashso_bucket_buf
Definition: hash.h:115
int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition: hash.c:376
bool IsBufferCleanupOK(Buffer buffer)
Definition: bufmgr.c:3757
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:232
void _hash_doinsert(Relation rel, IndexTuple itup)
Definition: hashinsert.c:29
bool amsearchnulls
Definition: amapi.h:185
void hashendscan(IndexScanDesc scan)
Definition: hash.c:480
void _hash_checkpage(Relation rel, Buffer buf, int flags)
Definition: hashutil.c:158
HSpool * spool
Definition: hash.c:36
void * palloc0(Size size)
Definition: mcxt.c:920
BlockNumber blkno
Definition: gistvacuum.c:105
#define HASH_METAPAGE
Definition: hash.h:146
uintptr_t Datum
Definition: postgres.h:374
double hashm_ntuples
Definition: hash.h:178
bool hashso_buc_populated
Definition: hash.h:131
#define LH_OVERFLOW_PAGE
Definition: hash.h:53
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3529
bool amclusterable
Definition: amapi.h:189
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:199
bool amsearcharray
Definition: amapi.h:183
HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
Definition: hashpage.c:1216
void _hash_relbuf(Relation rel, Buffer buf)
Definition: hashpage.c:245
int maintenance_work_mem
Definition: globals.c:113
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define LH_BUCKET_PAGE
Definition: hash.h:54
IndexBulkDeleteResult * hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition: hash.c:671
#define H_BUCKET_BEING_SPLIT(opaque)
Definition: hash.h:87
Datum hashhandler(PG_FUNCTION_ARGS)
Definition: hash.c:53
#define makeNode(_type_)
Definition: nodes.h:556
#define NULL
Definition: c.h:226
#define Assert(condition)
Definition: c.h:671
Definition: regguts.h:298
HeapTupleData xs_ctup
Definition: relscan.h:112
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:809
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define InvalidBlockNumber
Definition: block.h:33
HashPageOpaqueData * HashPageOpaque
Definition: hash.h:84
void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Buffer bucket_buf, BufferAccessStrategy bstrategy)
Definition: hashovfl.c:629
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
ItemPointerData hashso_curpos
Definition: hash.h:125
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:76
ammarkpos_function ammarkpos
Definition: amapi.h:213
bool amcanorder
Definition: amapi.h:171
ScanKey keyData
Definition: relscan.h:93
amestimateparallelscan_function amestimateparallelscan
Definition: amapi.h:217
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
Definition: itemptr.c:29
uint32 hashm_maxbucket
Definition: hash.h:184
void _h_indexbuild(HSpool *hspool)
Definition: hashsort.c:104
uint16 hasho_flag
Definition: hash.h:80
bool hashso_buc_split
Definition: hash.h:137
#define ItemPointerSetOffsetNumber(pointer, offsetNumber)
Definition: itemptr.h:107
bool hashvalidate(Oid opclassoid)
Definition: hashvalidate.c:44
uint16 amstrategies
Definition: amapi.h:167
static Datum values[MAXATTR]
Definition: bootstrap.c:162
uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
Definition: hashpage.c:306
void _h_spooldestroy(HSpool *hspool)
Definition: hashsort.c:83
#define ItemPointerSetInvalid(pointer)
Definition: itemptr.h:131
double IndexBuildHeapScan(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo, bool allow_sync, IndexBuildCallback callback, void *callback_state)
Definition: index.c:2169
#define HashPageGetMeta(page)
Definition: hash.h:238
void * palloc(Size size)
Definition: mcxt.c:891
ambuildempty_function ambuildempty
Definition: amapi.h:199
bool kill_prior_tuple
Definition: relscan.h:99
Buffer hashso_split_bucket_buf
Definition: hash.h:122
int NBuffers
Definition: globals.c:122
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:88
BlockNumber hasho_nextblkno
Definition: hash.h:78
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:78
#define PG_FUNCTION_ARGS
Definition: fmgr.h:150
#define elog
Definition: elog.h:219
HSpool * _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
Definition: hashsort.c:48
#define RELPERSISTENCE_TEMP
Definition: pg_class.h:172
void vacuum_delay_point(void)
Definition: vacuum.c:1515
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:986
double num_index_tuples
Definition: genam.h:76
int Buffer
Definition: buf.h:23
amcanreturn_function amcanreturn
Definition: amapi.h:203
bool estimated_count
Definition: genam.h:75
ItemPointerData hashso_heappos
Definition: hash.h:128
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition: genam.h:83
Buffer hashso_curbuf
Definition: hash.h:112
void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition: hash.c:452
#define PageGetItem(page, itemId)
Definition: bufpage.h:337
Pointer Page
Definition: bufpage.h:74
double index_tuples
Definition: genam.h:33
aminitparallelscan_function aminitparallelscan
Definition: amapi.h:218
double heap_tuples
Definition: genam.h:32
amrestrpos_function amrestrpos
Definition: amapi.h:214