hashpage.c
1 /*-------------------------------------------------------------------------
2  *
3  * hashpage.c
4  * Hash table page management code for the Postgres hash access method
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/hash/hashpage.c
12  *
13  * NOTES
14  * Postgres hash pages look like ordinary relation pages. The opaque
15  * data at high addresses includes information about the page including
16  * whether a page is an overflow page or a true bucket, the bucket
17  * number, and the block numbers of the preceding and following pages
18  * in the same bucket.
19  *
20  * The first page in a hash relation, page zero, is special -- it stores
21  * information describing the hash table; it is referred to as the
22  * "meta page." Pages one and higher store the actual data.
23  *
24  * There are also bitmap pages, which are not manipulated here;
25  * see hashovfl.c.
26  *
27  *-------------------------------------------------------------------------
28  */
29 #include "postgres.h"
30 
31 #include "access/hash.h"
32 #include "miscadmin.h"
33 #include "storage/lmgr.h"
34 #include "storage/smgr.h"
35 
36 
37 static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
38  uint32 nblocks);
39 static void _hash_splitbucket(Relation rel, Buffer metabuf,
40  Bucket obucket, Bucket nbucket,
41  Buffer obuf,
42  Buffer nbuf,
43  HTAB *htab,
44  uint32 maxbucket,
45  uint32 highmask, uint32 lowmask);
46 
47 
48 /*
49  * We use high-concurrency locking on hash indexes (see README for an overview
50  * of the locking rules). However, we can skip taking lmgr locks when the
51  * index is local to the current backend (ie, either temp or new in the
52  * current transaction). No one else can see it, so there's no reason to
53  * take locks. We still take buffer-level locks, but not lmgr locks.
54  */
55 #define USELOCKING(rel) (!RELATION_IS_LOCAL(rel))
56 
57 
58 /*
59  * _hash_getbuf() -- Get a buffer by block number for read or write.
60  *
61  * 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
62  * 'flags' is a bitwise OR of the allowed page types.
63  *
64  * This must be used only to fetch pages that are expected to be valid
65  * already. _hash_checkpage() is applied using the given flags.
66  *
67  * When this routine returns, the appropriate lock is set on the
68  * requested buffer and its reference count has been incremented
69  * (ie, the buffer is "locked and pinned").
70  *
71  * P_NEW is disallowed because this routine can only be used
72  * to access pages that are known to be before the filesystem EOF.
73  * Extending the index should be done with _hash_getnewbuf.
74  */
75 Buffer
76 _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
77 {
78  Buffer buf;
79 
80  if (blkno == P_NEW)
81  elog(ERROR, "hash AM does not use P_NEW");
82 
83  buf = ReadBuffer(rel, blkno);
84 
85  if (access != HASH_NOLOCK)
86  LockBuffer(buf, access);
87 
88  /* ref count and lock type are correct */
89 
90  _hash_checkpage(rel, buf, flags);
91 
92  return buf;
93 }
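For illustration, the usual calling pattern pairs _hash_getbuf() with _hash_relbuf(). The sketch below is not part of hashpage.c; example_read_bucket_page is a hypothetical helper and assumes the includes already present in this file.

static void
example_read_bucket_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf;
	Page		page;

	/* pin and share-lock the block, verifying it is a primary bucket page */
	buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
	page = BufferGetPage(buf);

	/* ... examine tuples on "page" while the lock is held ... */
	(void) page;

	/* drop both the lock and the pin */
	_hash_relbuf(rel, buf);
}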
94 
95 /*
96  * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup.
97  *
98  * We read the page and try to acquire a cleanup lock. If we get it,
99  * we return the buffer; otherwise, we return InvalidBuffer.
100  */
101 Buffer
102 _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)
103 {
104  Buffer buf;
105 
106  if (blkno == P_NEW)
107  elog(ERROR, "hash AM does not use P_NEW");
108 
109  buf = ReadBuffer(rel, blkno);
110 
111  if (!ConditionalLockBufferForCleanup(buf))
112  {
113  ReleaseBuffer(buf);
114  return InvalidBuffer;
115  }
116 
117  /* ref count and lock type are correct */
118 
119  _hash_checkpage(rel, buf, flags);
120 
121  return buf;
122 }
123 
124 /*
125  * _hash_getinitbuf() -- Get and initialize a buffer by block number.
126  *
127  * This must be used only to fetch pages that are known to be before
128  * the index's filesystem EOF, but are to be filled from scratch.
129  * _hash_pageinit() is applied automatically. Otherwise it has
130  * effects similar to _hash_getbuf() with access = HASH_WRITE.
131  *
132  * When this routine returns, a write lock is set on the
133  * requested buffer and its reference count has been incremented
134  * (ie, the buffer is "locked and pinned").
135  *
136  * P_NEW is disallowed because this routine can only be used
137  * to access pages that are known to be before the filesystem EOF.
138  * Extending the index should be done with _hash_getnewbuf.
139  */
140 Buffer
141 _hash_getinitbuf(Relation rel, BlockNumber blkno)
142 {
143  Buffer buf;
144 
145  if (blkno == P_NEW)
146  elog(ERROR, "hash AM does not use P_NEW");
147 
148  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
149  NULL);
150 
151  /* ref count and lock type are correct */
152 
153  /* initialize the page */
154  _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
155 
156  return buf;
157 }
158 
159 /*
160  * _hash_getnewbuf() -- Get a new page at the end of the index.
161  *
162  * This has the same API as _hash_getinitbuf, except that we are adding
163  * a page to the index, and hence expect the page to be past the
164  * logical EOF. (However, we have to support the case where it isn't,
165  * since a prior try might have crashed after extending the filesystem
166  * EOF but before updating the metapage to reflect the added page.)
167  *
168  * It is caller's responsibility to ensure that only one process can
169  * extend the index at a time. In practice, this function is called
170  * only while holding write lock on the metapage, because adding a page
171  * is always associated with an update of metapage data.
172  */
173 Buffer
174 _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
175 {
176  BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum);
177  Buffer buf;
178 
179  if (blkno == P_NEW)
180  elog(ERROR, "hash AM does not use P_NEW");
181  if (blkno > nblocks)
182  elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
183  RelationGetRelationName(rel));
184 
185  /* smgr insists we use P_NEW to extend the relation */
186  if (blkno == nblocks)
187  {
188  buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL);
189  if (BufferGetBlockNumber(buf) != blkno)
190  elog(ERROR, "unexpected hash relation size: %u, should be %u",
191  BufferGetBlockNumber(buf), blkno);
192  LockBuffer(buf, HASH_WRITE);
193  }
194  else
195  {
196  buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
197  NULL);
198  }
199 
200  /* ref count and lock type are correct */
201 
202  /* initialize the page */
203  _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
204 
205  return buf;
206 }
207 
208 /*
209  * _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
210  *
211  * This is identical to _hash_getbuf() but also allows a buffer access
212  * strategy to be specified. We use this for VACUUM operations.
213  */
214 Buffer
215 _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
216  int access, int flags,
217  BufferAccessStrategy bstrategy)
218 {
219  Buffer buf;
220 
221  if (blkno == P_NEW)
222  elog(ERROR, "hash AM does not use P_NEW");
223 
224  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
225 
226  if (access != HASH_NOLOCK)
227  LockBuffer(buf, access);
228 
229  /* ref count and lock type are correct */
230 
231  _hash_checkpage(rel, buf, flags);
232 
233  return buf;
234 }
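For illustration, a VACUUM-style caller might supply a bulk-read buffer strategy obtained from the buffer manager. This sketch is not part of hashpage.c (the real VACUUM path receives its strategy from the bulk-delete info); example_vacuum_scan_page is hypothetical.

static void
example_vacuum_scan_page(Relation rel, BlockNumber blkno)
{
	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
	Buffer		buf;

	/* read with a ring-buffer strategy so we don't flood shared buffers */
	buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
									 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
									 bstrategy);

	/* ... prune dead tuples from the page here ... */

	_hash_relbuf(rel, buf);
	FreeAccessStrategy(bstrategy);
}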
235 
236 /*
237  * _hash_relbuf() -- release a locked buffer.
238  *
239  * Lock and pin (refcount) are both dropped.
240  */
241 void
242 _hash_relbuf(Relation rel, Buffer buf)
243 {
244  UnlockReleaseBuffer(buf);
245 }
246 
247 /*
248  * _hash_dropbuf() -- release an unlocked buffer.
249  *
250  * This is used to unpin a buffer on which we hold no lock.
251  */
252 void
253 _hash_dropbuf(Relation rel, Buffer buf)
254 {
255  ReleaseBuffer(buf);
256 }
257 
258 /*
259  * _hash_dropscanbuf() -- release buffers used in scan.
260  *
261  * This routine unpins the buffers used during scan on which we
262  * hold no lock.
263  */
264 void
265 _hash_dropscanbuf(Relation rel, HashScanOpaque so)
266 {
267  /* release pin we hold on primary bucket page */
268  if (BufferIsValid(so->hashso_bucket_buf) &&
269  so->hashso_bucket_buf != so->hashso_curbuf)
270  _hash_dropbuf(rel, so->hashso_bucket_buf);
271  so->hashso_bucket_buf = InvalidBuffer;
272 
273  /* release pin we hold on primary bucket page of bucket being split */
274  if (BufferIsValid(so->hashso_split_bucket_buf) &&
275  so->hashso_split_bucket_buf != so->hashso_curbuf)
276  _hash_dropbuf(rel, so->hashso_split_bucket_buf);
277  so->hashso_split_bucket_buf = InvalidBuffer;
278 
279  /* release any pin we still hold */
280  if (BufferIsValid(so->hashso_curbuf))
281  _hash_dropbuf(rel, so->hashso_curbuf);
282  so->hashso_curbuf = InvalidBuffer;
283 
284  /* reset split scan */
285  so->hashso_buc_populated = false;
286  so->hashso_buc_split = false;
287 }
288 
289 
290 /*
291  * _hash_metapinit() -- Initialize the metadata page of a hash index,
292  * the initial buckets, and the initial bitmap page.
293  *
294  * The initial number of buckets is dependent on num_tuples, an estimate
295  * of the number of tuples to be loaded into the index initially. The
296  * chosen number of buckets is returned.
297  *
298  * We are fairly cavalier about locking here, since we know that no one else
299  * could be accessing this index. In particular the rule about not holding
300  * multiple buffer locks is ignored.
301  */
302 uint32
303 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
304 {
305  HashMetaPage metap;
306  HashPageOpaque pageopaque;
307  Buffer metabuf;
308  Buffer buf;
309  Page pg;
310  int32 data_width;
311  int32 item_width;
312  int32 ffactor;
313  double dnumbuckets;
314  uint32 num_buckets;
315  uint32 log2_num_buckets;
316  uint32 i;
317 
318  /* safety check */
319  if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
320  elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
321  RelationGetRelationName(rel));
322 
323  /*
324  * Determine the target fill factor (in tuples per bucket) for this index.
325  * The idea is to make the fill factor correspond to pages about as full
326  * as the user-settable fillfactor parameter says. We can compute it
327  * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
328  */
329  data_width = sizeof(uint32);
330  item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
331  sizeof(ItemIdData); /* include the line pointer */
332  ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
333  /* keep to a sane range */
334  if (ffactor < 10)
335  ffactor = 10;
336 
337  /*
338  * Choose the number of initial bucket pages to match the fill factor
339  * given the estimated number of tuples. We round up the result to the
340  * next power of 2, however, and always force at least 2 bucket pages. The
341  * upper limit is determined by considerations explained in
342  * _hash_expandtable().
343  */
344  dnumbuckets = num_tuples / ffactor;
345  if (dnumbuckets <= 2.0)
346  num_buckets = 2;
347  else if (dnumbuckets >= (double) 0x40000000)
348  num_buckets = 0x40000000;
349  else
350  num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);
351 
352  log2_num_buckets = _hash_log2(num_buckets);
353  Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
354  Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);
355 
356  /*
357  * We initialize the metapage, the first N bucket pages, and the first
358  * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
359  * calls to occur. This ensures that the smgr level has the right idea of
360  * the physical index length.
361  */
362  metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
363  pg = BufferGetPage(metabuf);
364 
365  pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
366  pageopaque->hasho_prevblkno = InvalidBlockNumber;
367  pageopaque->hasho_nextblkno = InvalidBlockNumber;
368  pageopaque->hasho_bucket = -1;
369  pageopaque->hasho_flag = LH_META_PAGE;
370  pageopaque->hasho_page_id = HASHO_PAGE_ID;
371 
372  metap = HashPageGetMeta(pg);
373 
374  metap->hashm_magic = HASH_MAGIC;
375  metap->hashm_version = HASH_VERSION;
376  metap->hashm_ntuples = 0;
377  metap->hashm_nmaps = 0;
378  metap->hashm_ffactor = ffactor;
379  metap->hashm_bsize = HashGetMaxBitmapSize(pg);
380  /* find largest bitmap array size that will fit in page size */
381  for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
382  {
383  if ((1 << i) <= metap->hashm_bsize)
384  break;
385  }
386  Assert(i > 0);
387  metap->hashm_bmsize = 1 << i;
388  metap->hashm_bmshift = i + BYTE_TO_BIT;
389  Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
390 
391  /*
392  * Label the index with its primary hash support function's OID. This is
393  * pretty useless for normal operation (in fact, hashm_procid is not used
394  * anywhere), but it might be handy for forensic purposes so we keep it.
395  */
396  metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
397 
398  /*
399  * We initialize the index with N buckets, 0 .. N-1, occupying physical
400  * blocks 1 to N. The first freespace bitmap page is in block N+1. Since
401  * N is a power of 2, we can set the masks this way:
402  */
403  metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
404  metap->hashm_highmask = (num_buckets << 1) - 1;
405 
406  MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
407  MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
408 
409  /* Set up mapping for one spare page after the initial splitpoints */
410  metap->hashm_spares[log2_num_buckets] = 1;
411  metap->hashm_ovflpoint = log2_num_buckets;
412  metap->hashm_firstfree = 0;
413 
414  /*
415  * Release buffer lock on the metapage while we initialize buckets.
416  * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
417  * won't accomplish anything. It's a bad idea to hold buffer locks for
418  * long intervals in any case, since that can block the bgwriter.
419  */
420  MarkBufferDirty(metabuf);
421  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
422 
423  /*
424  * Initialize the first N buckets
425  */
426  for (i = 0; i < num_buckets; i++)
427  {
428  /* Allow interrupts, in case N is huge */
429  CHECK_FOR_INTERRUPTS();
430 
431  buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum);
432  pg = BufferGetPage(buf);
433  pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
434 
435  /*
436  * Set hasho_prevblkno with current hashm_maxbucket. This value will
437  * be used to validate cached HashMetaPageData. See
438  * _hash_getbucketbuf_from_hashkey().
439  */
440  pageopaque->hasho_prevblkno = metap->hashm_maxbucket;
441  pageopaque->hasho_nextblkno = InvalidBlockNumber;
442  pageopaque->hasho_bucket = i;
443  pageopaque->hasho_flag = LH_BUCKET_PAGE;
444  pageopaque->hasho_page_id = HASHO_PAGE_ID;
445  MarkBufferDirty(buf);
446  _hash_relbuf(rel, buf);
447  }
448 
449  /* Now reacquire buffer lock on metapage */
450  LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
451 
452  /*
453  * Initialize first bitmap page
454  */
455  _hash_initbitmap(rel, metap, num_buckets + 1, forkNum);
456 
457  /* all done */
458  MarkBufferDirty(metabuf);
459  _hash_relbuf(rel, metabuf);
460 
461  return num_buckets;
462 }
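As a worked example of the sizing arithmetic in _hash_metapinit(), the standalone sketch below reproduces the fill-factor and initial-bucket calculation under assumed values (8192-byte pages, a default fillfactor of 75, MAXALIGN of 8, and an estimate of one million tuples); it is not part of hashpage.c and it ignores the lower bound of 2 and the 0x40000000 cap applied by the real code.

#include <stdio.h>

int
main(void)
{
	/* assumptions: BLCKSZ = 8192, fillfactor = 75, MAXALIGN = 8 */
	int			page_usage = 8192 * 75 / 100;	/* 6144 usable bytes */
	int			item_width = 8		/* MAXALIGN(sizeof(IndexTupleData)) */
		+ 8						/* MAXALIGN(sizeof(uint32) hash key) */
		+ 4;					/* sizeof(ItemIdData) line pointer */
	int			ffactor = page_usage / item_width;	/* 307 tuples per bucket */
	double		num_tuples = 1000000.0;
	double		dnumbuckets = num_tuples / ffactor; /* about 3257 */
	unsigned int num_buckets = 1;

	/* round up to the next power of 2, as _hash_log2() does */
	while (num_buckets < dnumbuckets)
		num_buckets <<= 1;

	printf("ffactor = %d, initial buckets = %u\n", ffactor, num_buckets);
	return 0;
}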
463 
464 /*
465  * _hash_pageinit() -- Initialize a new hash index page.
466  */
467 void
468 _hash_pageinit(Page page, Size size)
469 {
470  PageInit(page, size, sizeof(HashPageOpaqueData));
471 }
472 
473 /*
474  * Attempt to expand the hash table by creating one new bucket.
475  *
476  * This will silently do nothing if we don't get cleanup lock on old or
477  * new bucket.
478  *
479  * Complete the pending splits and remove the tuples from old bucket,
480  * if there are any left over from the previous split.
481  *
482  * The caller must hold a pin, but no lock, on the metapage buffer.
483  * The buffer is returned in the same state.
484  */
485 void
486 _hash_expandtable(Relation rel, Buffer metabuf)
487 {
488  HashMetaPage metap;
489  Bucket old_bucket;
490  Bucket new_bucket;
491  uint32 spare_ndx;
492  BlockNumber start_oblkno;
493  BlockNumber start_nblkno;
494  Buffer buf_nblkno;
495  Buffer buf_oblkno;
496  Page opage;
497  Page npage;
498  HashPageOpaque oopaque;
499  HashPageOpaque nopaque;
500  uint32 maxbucket;
501  uint32 highmask;
502  uint32 lowmask;
503 
504 restart_expand:
505 
506  /*
507  * Write-lock the meta page. It used to be necessary to acquire a
508  * heavyweight lock to begin a split, but that is no longer required.
509  */
510  LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
511 
512  _hash_checkpage(rel, metabuf, LH_META_PAGE);
513  metap = HashPageGetMeta(BufferGetPage(metabuf));
514 
515  /*
516  * Check to see if split is still needed; someone else might have already
517  * done one while we waited for the lock.
518  *
519  * Make sure this stays in sync with _hash_doinsert()
520  */
521  if (metap->hashm_ntuples <=
522  (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
523  goto fail;
524 
525  /*
526  * Can't split anymore if maxbucket has reached its maximum possible
527  * value.
528  *
529  * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
530  * the calculation maxbucket+1 mustn't overflow). Currently we restrict
531  * to half that because of overflow looping in _hash_log2() and
532  * insufficient space in hashm_spares[]. It's moot anyway because an
533  * index with 2^32 buckets would certainly overflow BlockNumber and hence
534  * _hash_alloc_buckets() would fail, but if we supported buckets smaller
535  * than a disk block then this would be an independent constraint.
536  *
537  * If you change this, see also the maximum initial number of buckets in
538  * _hash_metapinit().
539  */
540  if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
541  goto fail;
542 
543  /*
544  * Determine which bucket is to be split, and attempt to take cleanup lock
545  * on the old bucket. If we can't get the lock, give up.
546  *
547  * The cleanup lock protects us not only against other backends, but
548  * against our own backend as well.
549  *
550  * The cleanup lock is mainly to protect the split from concurrent
551  * inserts. See src/backend/access/hash/README, Lock Definitions for
552  * further details. Due to this locking restriction, if there is any
553  * pending scan, the split will give up which is not good, but harmless.
554  */
555  new_bucket = metap->hashm_maxbucket + 1;
556 
557  old_bucket = (new_bucket & metap->hashm_lowmask);
558 
559  start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
560 
561  buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE);
562  if (!buf_oblkno)
563  goto fail;
564 
565  opage = BufferGetPage(buf_oblkno);
566  oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
567 
568  /*
569  * We want to finish any pending split of this bucket before starting a
570  * new one: there is no apparent benefit in deferring it, and finishing a
571  * split that involves multiple buckets (in case the new split also
572  * failed) would complicate the code. We need not consider the new bucket
573  * for completing the split here, since a re-split of the new bucket
574  * cannot start while a split from the old bucket is still pending.
575  */
576  if (H_BUCKET_BEING_SPLIT(oopaque))
577  {
578  /*
579  * Copy bucket mapping info now; refer the comment in code below where
580  * we copy this information before calling _hash_splitbucket to see
581  * why this is okay.
582  */
583  maxbucket = metap->hashm_maxbucket;
584  highmask = metap->hashm_highmask;
585  lowmask = metap->hashm_lowmask;
586 
587  /*
588  * Release the lock on metapage and old_bucket, before completing the
589  * split.
590  */
591  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
592  LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK);
593 
594  _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket,
595  highmask, lowmask);
596 
597  /* release the pin on old buffer and retry for expand. */
598  _hash_dropbuf(rel, buf_oblkno);
599 
600  goto restart_expand;
601  }
602 
603  /*
604  * Clean the tuples remaining from the previous split. This operation
605  * requires a cleanup lock, and we already have one on the old bucket, so
606  * let's do it. We also don't want to allow further splits from the bucket
607  * until the garbage of the previous split is cleaned. This has two
608  * advantages: first, it helps avoid bloat due to the garbage, and
609  * second, during cleanup of a bucket we can always be sure that the
610  * garbage tuples belong to the most recently split bucket. By contrast,
611  * if we allowed cleanup of a bucket after the meta page had been updated
612  * to indicate the new split but before the actual split, the cleanup
613  * operation could not tell whether a tuple had been moved to the newly
614  * created bucket, and might end up deleting such tuples.
615  */
616  if (H_NEEDS_SPLIT_CLEANUP(oopaque))
617  {
618  /*
619  * Copy bucket mapping info now; refer to the comment in code below
620  * where we copy this information before calling _hash_splitbucket
621  * to see why this is okay.
622  */
623  maxbucket = metap->hashm_maxbucket;
624  highmask = metap->hashm_highmask;
625  lowmask = metap->hashm_lowmask;
626 
627  /* Release the metapage lock. */
628  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
629 
630  hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL,
631  maxbucket, highmask, lowmask, NULL, NULL, true,
632  NULL, NULL);
633 
634  _hash_dropbuf(rel, buf_oblkno);
635 
636  goto restart_expand;
637  }
638 
639  /*
640  * There shouldn't be any active scan on new bucket.
641  *
642  * Note: it is safe to compute the new bucket's blkno here, even though we
643  * may still need to update the BUCKET_TO_BLKNO mapping. This is because
644  * the current value of hashm_spares[hashm_ovflpoint] correctly shows
645  * where we are going to put a new splitpoint's worth of buckets.
646  */
647  start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
648 
649  /*
650  * If the split point is increasing (hashm_maxbucket's log base 2
651  * increases), we need to allocate a new batch of bucket pages.
652  */
653  spare_ndx = _hash_log2(new_bucket + 1);
654  if (spare_ndx > metap->hashm_ovflpoint)
655  {
656  Assert(spare_ndx == metap->hashm_ovflpoint + 1);
657 
658  /*
659  * The number of buckets in the new splitpoint is equal to the total
660  * number already in existence, i.e. new_bucket. Currently this maps
661  * one-to-one to blocks required, but someday we may need a more
662  * complicated calculation here.
663  */
664  if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
665  {
666  /* can't split due to BlockNumber overflow */
667  _hash_relbuf(rel, buf_oblkno);
668  goto fail;
669  }
670  }
671 
672  /*
673  * Physically allocate the new bucket's primary page. We want to do this
674  * before changing the metapage's mapping info, in case we can't get the
675  * disk space. Ideally, we don't need to check for cleanup lock on new
676  * bucket as no other backend could find this bucket unless meta page is
677  * updated. However, it is good to be consistent with old bucket locking.
678  */
679  buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM);
680  if (!IsBufferCleanupOK(buf_nblkno))
681  {
682  _hash_relbuf(rel, buf_oblkno);
683  _hash_relbuf(rel, buf_nblkno);
684  goto fail;
685  }
686 
687  /*
688  * Since we are scribbling on the pages in the shared buffers, establish a
689  * critical section. Any failure in this next code leaves us with a big
690  * problem: the metapage is effectively corrupt but could get written back
691  * to disk. We don't really expect any failure, but just to be sure,
692  * establish a critical section.
693  */
694  START_CRIT_SECTION();
695 
696  /*
697  * Okay to proceed with split. Update the metapage bucket mapping info.
698  */
699  metap->hashm_maxbucket = new_bucket;
700 
701  if (new_bucket > metap->hashm_highmask)
702  {
703  /* Starting a new doubling */
704  metap->hashm_lowmask = metap->hashm_highmask;
705  metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
706  }
707 
708  /*
709  * If the split point is increasing (hashm_maxbucket's log base 2
710  * increases), we need to adjust the hashm_spares[] array and
711  * hashm_ovflpoint so that future overflow pages will be created beyond
712  * this new batch of bucket pages.
713  */
714  if (spare_ndx > metap->hashm_ovflpoint)
715  {
716  metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
717  metap->hashm_ovflpoint = spare_ndx;
718  }
719 
720  MarkBufferDirty(metabuf);
721 
722  /*
723  * Copy bucket mapping info now; this saves re-accessing the meta page
724  * inside _hash_splitbucket's inner loop. Note that once we drop the
725  * split lock, other splits could begin, so these values might be out of
726  * date before _hash_splitbucket finishes. That's okay, since all it
727  * needs is to tell which of these two buckets to map hashkeys into.
728  */
729  maxbucket = metap->hashm_maxbucket;
730  highmask = metap->hashm_highmask;
731  lowmask = metap->hashm_lowmask;
732 
733  opage = BufferGetPage(buf_oblkno);
734  oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
735 
736  /*
737  * Mark the old bucket to indicate that split is in progress. (At
738  * operation end, we will clear the split-in-progress flag.) Also,
739  * for a primary bucket page, hasho_prevblkno stores the number of
740  * buckets that existed as of the last split, so we must update that
741  * value here.
742  */
743  oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
744  oopaque->hasho_prevblkno = maxbucket;
745 
746  MarkBufferDirty(buf_oblkno);
747 
748  npage = BufferGetPage(buf_nblkno);
749 
750  /*
751  * initialize the new bucket's primary page and mark it to indicate that
752  * split is in progress.
753  */
754  nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
755  nopaque->hasho_prevblkno = maxbucket;
756  nopaque->hasho_nextblkno = InvalidBlockNumber;
757  nopaque->hasho_bucket = new_bucket;
758  nopaque->hasho_flag = LH_BUCKET_BEING_POPULATED | LH_BUCKET_PAGE;
759  nopaque->hasho_page_id = HASHO_PAGE_ID;
760 
761  MarkBufferDirty(buf_nblkno);
762 
763  END_CRIT_SECTION();
764 
765  /* drop lock, but keep pin */
766  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
767 
768  /* Relocate records to the new bucket */
769  _hash_splitbucket(rel, metabuf,
770  old_bucket, new_bucket,
771  buf_oblkno, buf_nblkno, NULL,
772  maxbucket, highmask, lowmask);
773 
774  /* all done, now release the locks and pins on primary buckets. */
775  _hash_relbuf(rel, buf_oblkno);
776  _hash_relbuf(rel, buf_nblkno);
777 
778  return;
779 
780  /* Here if decide not to split or fail to acquire old bucket lock */
781 fail:
782 
783  /* We didn't write the metapage, so just drop lock */
784  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
785 }
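To see how maxbucket, highmask, and lowmask route keys between the old and new buckets after a split, here is a small standalone sketch of the mask arithmetic (mirroring _hash_hashkey2bucket() in hashutil.c). The concrete mask values are an assumed example state, not taken from this file.

#include <stdio.h>

/* Sketch of the bucket-mapping rule used by _hash_hashkey2bucket(). */
static unsigned int
key_to_bucket(unsigned int hashkey, unsigned int maxbucket,
			  unsigned int highmask, unsigned int lowmask)
{
	unsigned int bucket = hashkey & highmask;

	if (bucket > maxbucket)
		bucket &= lowmask;		/* that bucket hasn't been created yet */
	return bucket;
}

int
main(void)
{
	/*
	 * Assumed state just after bucket 4 was created by splitting bucket 0:
	 * maxbucket = 4, lowmask = 3, highmask = 7.
	 */
	printf("%u\n", key_to_bucket(0x0c, 4, 7, 3));	/* 4: goes to the new bucket */
	printf("%u\n", key_to_bucket(0x0d, 4, 7, 3));	/* 5 > maxbucket, so 5 & 3 = 1 */
	return 0;
}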
786 
787 
788 /*
789  * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
790  *
791  * This does not need to initialize the new bucket pages; we'll do that as
792  * each one is used by _hash_expandtable(). But we have to extend the logical
793  * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
794  * sync with ours, so that we don't get complaints from smgr.
795  *
796  * We do this by writing a page of zeroes at the end of the splitpoint range.
797  * We expect that the filesystem will ensure that the intervening pages read
798  * as zeroes too. On many filesystems this "hole" will not be allocated
799  * immediately, which means that the index file may end up more fragmented
800  * than if we forced it all to be allocated now; but since we don't scan
801  * hash indexes sequentially anyway, that probably doesn't matter.
802  *
803  * XXX It's annoying that this code is executed with the metapage lock held.
804  * We need to interlock against _hash_getovflpage() adding a new overflow page
805  * concurrently, but it'd likely be better to use LockRelationForExtension
806  * for the purpose. OTOH, adding a splitpoint is a very infrequent operation,
807  * so it may not be worth worrying about.
808  *
809  * Returns TRUE if successful, or FALSE if allocation failed due to
810  * BlockNumber overflow.
811  */
812 static bool
813 _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
814 {
815  BlockNumber lastblock;
816  char zerobuf[BLCKSZ];
817 
818  lastblock = firstblock + nblocks - 1;
819 
820  /*
821  * Check for overflow in block number calculation; if so, we cannot extend
822  * the index anymore.
823  */
824  if (lastblock < firstblock || lastblock == InvalidBlockNumber)
825  return false;
826 
827  MemSet(zerobuf, 0, sizeof(zerobuf));
828 
829  RelationOpenSmgr(rel);
830  smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false);
831 
832  return true;
833 }
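The "write one zero page at the far end and let the filesystem leave a hole" idea can be illustrated outside the buffer manager with plain POSIX I/O. This is only an analogy for what the smgrextend() call above achieves; extend_with_hole is a hypothetical helper.

#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Extend the file referenced by fd so that block "lastblock" exists,
 * leaving the intervening blocks as a (possibly unallocated) hole.
 */
static int
extend_with_hole(int fd, unsigned int lastblock, size_t blcksz)
{
	char	   *zerobuf = calloc(1, blcksz);
	off_t		offset = (off_t) lastblock * blcksz;
	int			ok;

	if (zerobuf == NULL)
		return -1;
	ok = (pwrite(fd, zerobuf, blcksz, offset) == (ssize_t) blcksz) ? 0 : -1;
	free(zerobuf);
	return ok;
}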
834 
835 
836 /*
837  * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
838  *
839  * This routine is used to partition the tuples between the old and new
840  * buckets, and also to finish an incomplete split operation. To finish a
841  * previously interrupted split, the caller must fill htab. If htab is set,
842  * we skip moving any tuple whose TID is present in htab; otherwise, a NULL
843  * htab means that all the tuples belonging to the new bucket are to be
844  * moved.
845  *
846  * We are splitting a bucket that consists of a base bucket page and zero
847  * or more overflow (bucket chain) pages. We must relocate tuples that
848  * belong in the new bucket.
849  *
850  * The caller must hold cleanup locks on both buckets to ensure that
851  * no one else is trying to access them (see README).
852  *
853  * The caller must hold a pin, but no lock, on the metapage buffer.
854  * The buffer is returned in the same state. (The metapage is only
855  * touched if it becomes necessary to add or remove overflow pages.)
856  *
857  * Split needs to retain pin on primary bucket pages of both old and new
858  * buckets till end of operation. This is to prevent vacuum from starting
859  * while a split is in progress.
860  *
861  * In addition, the caller must have created the new bucket's base page,
862  * which is passed in buffer nbuf, pinned and write-locked. That lock and
863  * pin are released here. (The API is set up this way because we must do
864  * _hash_getnewbuf() before releasing the metapage write lock. So instead of
865  * passing the new bucket's start block number, we pass an actual buffer.)
866  */
867 static void
868 _hash_splitbucket(Relation rel,
869  Buffer metabuf,
870  Bucket obucket,
871  Bucket nbucket,
872  Buffer obuf,
873  Buffer nbuf,
874  HTAB *htab,
875  uint32 maxbucket,
876  uint32 highmask,
877  uint32 lowmask)
878 {
879  Buffer bucket_obuf;
880  Buffer bucket_nbuf;
881  Page opage;
882  Page npage;
883  HashPageOpaque oopaque;
884  HashPageOpaque nopaque;
885 
886  bucket_obuf = obuf;
887  opage = BufferGetPage(obuf);
888  oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
889 
890  bucket_nbuf = nbuf;
891  npage = BufferGetPage(nbuf);
892  nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
893 
894  /*
895  * Partition the tuples in the old bucket between the old bucket and the
896  * new bucket, advancing along the old bucket's overflow bucket chain and
897  * adding overflow pages to the new bucket as needed. Outer loop iterates
898  * once per page in old bucket.
899  */
900  for (;;)
901  {
902  BlockNumber oblkno;
903  OffsetNumber ooffnum;
904  OffsetNumber omaxoffnum;
905 
906  /* Scan each tuple in old page */
907  omaxoffnum = PageGetMaxOffsetNumber(opage);
908  for (ooffnum = FirstOffsetNumber;
909  ooffnum <= omaxoffnum;
910  ooffnum = OffsetNumberNext(ooffnum))
911  {
912  IndexTuple itup;
913  Size itemsz;
914  Bucket bucket;
915  bool found = false;
916 
917  /* skip dead tuples */
918  if (ItemIdIsDead(PageGetItemId(opage, ooffnum)))
919  continue;
920 
921  /*
922  * Before inserting a tuple, probe the hash table containing TIDs
923  * of tuples belonging to new bucket, if we find a match, then
924  * skip that tuple, else fetch the item's hash key (conveniently
925  * stored in the item) and determine which bucket it now belongs
926  * in.
927  */
928  itup = (IndexTuple) PageGetItem(opage,
929  PageGetItemId(opage, ooffnum));
930 
931  if (htab)
932  (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found);
933 
934  if (found)
935  continue;
936 
937  bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
938  maxbucket, highmask, lowmask);
939 
940  if (bucket == nbucket)
941  {
942  IndexTuple new_itup;
943 
944  /*
945  * make a copy of index tuple as we have to scribble on it.
946  */
947  new_itup = CopyIndexTuple(itup);
948 
949  /*
950  * mark the index tuple as moved by split, such tuples are
951  * skipped by scan if there is split in progress for a bucket.
952  */
953  new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK;
954 
955  /*
956  * insert the tuple into the new bucket. if it doesn't fit on
957  * the current page in the new bucket, we must allocate a new
958  * overflow page and place the tuple on that page instead.
959  */
960  itemsz = IndexTupleDSize(*new_itup);
961  itemsz = MAXALIGN(itemsz);
962 
963  if (PageGetFreeSpace(npage) < itemsz)
964  {
965  /* write out nbuf and drop lock, but keep pin */
966  MarkBufferDirty(nbuf);
967  /* drop lock, but keep pin */
968  LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
969  /* chain to a new overflow page */
970  nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false);
971  npage = BufferGetPage(nbuf);
972  nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
973  }
974 
975  /*
976  * Insert tuple on new page, using _hash_pgaddtup to ensure
977  * correct ordering by hashkey. This is a tad inefficient
978  * since we may have to shuffle itempointers repeatedly.
979  * Possible future improvement: accumulate all the items for
980  * the new page and qsort them before insertion.
981  */
982  (void) _hash_pgaddtup(rel, nbuf, itemsz, new_itup);
983 
984  /* be tidy */
985  pfree(new_itup);
986  }
987  else
988  {
989  /*
990  * the tuple stays on this page, so nothing to do.
991  */
992  Assert(bucket == obucket);
993  }
994  }
995 
996  oblkno = oopaque->hasho_nextblkno;
997 
998  /* retain the pin on the old primary bucket */
999  if (obuf == bucket_obuf)
1000  LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
1001  else
1002  _hash_relbuf(rel, obuf);
1003 
1004  /* Exit loop if no more overflow pages in old bucket */
1005  if (!BlockNumberIsValid(oblkno))
1006  {
1007  MarkBufferDirty(nbuf);
1008  if (nbuf == bucket_nbuf)
1009  LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
1010  else
1011  _hash_relbuf(rel, nbuf);
1012  break;
1013  }
1014 
1015  /* Else, advance to next old page */
1016  obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE);
1017  opage = BufferGetPage(obuf);
1018  oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
1019  }
1020 
1021  /*
1022  * We're at the end of the old bucket chain, so we're done partitioning
1023  * the tuples. Mark the old and new buckets to indicate split is
1024  * finished.
1025  *
1026  * To avoid deadlocks due to locking order of buckets, first lock the old
1027  * bucket and then the new bucket.
1028  */
1029  LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE);
1030  opage = BufferGetPage(bucket_obuf);
1031  oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
1032 
1033  LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE);
1034  npage = BufferGetPage(bucket_nbuf);
1035  nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
1036 
1037  oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT;
1038  nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED;
1039 
1040  /*
1041  * After the split is finished, mark the old bucket to indicate that it
1042  * contains deletable tuples. Vacuum will clear split-cleanup flag after
1043  * deleting such tuples.
1044  */
1045  oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP;
1046 
1047  /*
1048  * now write the buffers, here we don't release the locks as caller is
1049  * responsible to release locks.
1050  */
1051  MarkBufferDirty(bucket_obuf);
1052  MarkBufferDirty(bucket_nbuf);
1053 }
1054 
1055 /*
1056  * _hash_finish_split() -- Finish the previously interrupted split operation
1057  *
1058  * To complete the split operation, we build a hash table of the TIDs
1059  * already present in the new bucket; the split then uses it to skip
1060  * tuples that were moved before the split operation was interrupted.
1061  *
1062  * The caller must hold a pin, but no lock, on the metapage and old bucket's
1063  * primary page buffer. The buffers are returned in the same state. (The
1064  * metapage is only touched if it becomes necessary to add or remove overflow
1065  * pages.)
1066  */
1067 void
1068 _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
1069  uint32 maxbucket, uint32 highmask, uint32 lowmask)
1070 {
1071  HASHCTL hash_ctl;
1072  HTAB *tidhtab;
1073  Buffer bucket_nbuf = InvalidBuffer;
1074  Buffer nbuf;
1075  Page npage;
1076  BlockNumber nblkno;
1077  BlockNumber bucket_nblkno;
1078  HashPageOpaque npageopaque;
1079  Bucket nbucket;
1080  bool found;
1081 
1082  /* Initialize hash tables used to track TIDs */
1083  memset(&hash_ctl, 0, sizeof(hash_ctl));
1084  hash_ctl.keysize = sizeof(ItemPointerData);
1085  hash_ctl.entrysize = sizeof(ItemPointerData);
1086  hash_ctl.hcxt = CurrentMemoryContext;
1087 
1088  tidhtab =
1089  hash_create("bucket ctids",
1090  256, /* arbitrary initial size */
1091  &hash_ctl,
1092  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1093 
1094  bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket);
1095 
1096  /*
1097  * Scan the new bucket and build hash table of TIDs
1098  */
1099  for (;;)
1100  {
1101  OffsetNumber noffnum;
1102  OffsetNumber nmaxoffnum;
1103 
1104  nbuf = _hash_getbuf(rel, nblkno, HASH_READ,
1105  LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
1106 
1107  /* remember the primary bucket buffer to acquire cleanup lock on it. */
1108  if (nblkno == bucket_nblkno)
1109  bucket_nbuf = nbuf;
1110 
1111  npage = BufferGetPage(nbuf);
1112  npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
1113 
1114  /* Scan each tuple in new page */
1115  nmaxoffnum = PageGetMaxOffsetNumber(npage);
1116  for (noffnum = FirstOffsetNumber;
1117  noffnum <= nmaxoffnum;
1118  noffnum = OffsetNumberNext(noffnum))
1119  {
1120  IndexTuple itup;
1121 
1122  /* Fetch the item's TID and insert it in hash table. */
1123  itup = (IndexTuple) PageGetItem(npage,
1124  PageGetItemId(npage, noffnum));
1125 
1126  (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);
1127 
1128  Assert(!found);
1129  }
1130 
1131  nblkno = npageopaque->hasho_nextblkno;
1132 
1133  /*
1134  * release our write lock without modifying buffer and ensure to
1135  * retain the pin on primary bucket.
1136  */
1137  if (nbuf == bucket_nbuf)
1138  LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
1139  else
1140  _hash_relbuf(rel, nbuf);
1141 
1142  /* Exit loop if no more overflow pages in new bucket */
1143  if (!BlockNumberIsValid(nblkno))
1144  break;
1145  }
1146 
1147  /*
1148  * Conditionally get the cleanup lock on old and new buckets to perform
1149  * the split operation. If we don't get the cleanup locks, silently give
1150  * up; the next insertion on the old bucket will try again to complete
1151  * the split.
1152  */
1153  if (!ConditionalLockBufferForCleanup(obuf))
1154  {
1155  hash_destroy(tidhtab);
1156  return;
1157  }
1158  if (!ConditionalLockBufferForCleanup(bucket_nbuf))
1159  {
1160  LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
1161  hash_destroy(tidhtab);
1162  return;
1163  }
1164 
1165  npage = BufferGetPage(bucket_nbuf);
1166  npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
1167  nbucket = npageopaque->hasho_bucket;
1168 
1169  _hash_splitbucket(rel, metabuf, obucket,
1170  nbucket, obuf, bucket_nbuf, tidhtab,
1171  maxbucket, highmask, lowmask);
1172 
1173  _hash_relbuf(rel, bucket_nbuf);
1174  LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
1175  hash_destroy(tidhtab);
1176 }
1177 
1178 /*
1179  * _hash_getcachedmetap() -- Returns cached metapage data.
1180  *
1181  * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on
1182  * the metapage. If not set, we'll set it before returning if we have to
1183  * refresh the cache, and return with a pin but no lock on it; caller is
1184  * responsible for releasing the pin.
1185  *
1186  * We refresh the cache if it's not initialized yet or force_refresh is true.
1187  */
1188 HashMetaPage
1189 _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
1190 {
1191  Page page;
1192 
1193  Assert(metabuf);
1194  if (force_refresh || rel->rd_amcache == NULL)
1195  {
1196  char *cache = NULL;
1197 
1198  /*
1199  * It's important that we don't set rd_amcache to an invalid
1200  * value. Either MemoryContextAlloc or _hash_getbuf could fail,
1201  * so don't install a pointer to the newly-allocated storage in the
1202  * actual relcache entry until both have succeeded.
1203  */
1204  if (rel->rd_amcache == NULL)
1205  cache = MemoryContextAlloc(rel->rd_indexcxt,
1206  sizeof(HashMetaPageData));
1207 
1208  /* Read the metapage. */
1209  if (BufferIsValid(*metabuf))
1210  LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
1211  else
1212  *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
1213  LH_META_PAGE);
1214  page = BufferGetPage(*metabuf);
1215 
1216  /* Populate the cache. */
1217  if (rel->rd_amcache == NULL)
1218  rel->rd_amcache = cache;
1219  memcpy(rel->rd_amcache, HashPageGetMeta(page),
1220  sizeof(HashMetaPageData));
1221 
1222  /* Release metapage lock, but keep the pin. */
1223  LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
1224  }
1225 
1226  return (HashMetaPage) rel->rd_amcache;
1227 }
1228 
1229 /*
1230  * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
1231  * hashkey.
1232  *
1233  * Bucket pages do not move or get removed once they are allocated. This gives
1234  * us an opportunity to use the previously saved metapage contents to reach
1235  * the target bucket buffer, instead of reading from the metapage every time.
1236  * This saves one buffer access every time we want to reach the target bucket
1237  * buffer, which is a very helpful saving in bufmgr traffic and contention.
1238  *
1239  * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
1240  * bucket buffer has to be locked for reading or writing.
1241  *
1242  * The out parameter cachedmetap is set with metapage contents used for
1243  * hashkey to bucket buffer mapping. Some callers need this info to reach the
1244  * old bucket in case of bucket split, see _hash_doinsert().
1245  */
1246 Buffer
1247 _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
1248  HashMetaPage *cachedmetap)
1249 {
1250  HashMetaPage metap;
1251  Buffer buf;
1252  Buffer metabuf = InvalidBuffer;
1253  Page page;
1254  Bucket bucket;
1255  BlockNumber blkno;
1256  HashPageOpaque opaque;
1257 
1258  /* We read from the target bucket buffer, hence locking is a must. */
1259  Assert(access == HASH_READ || access == HASH_WRITE);
1260 
1261  metap = _hash_getcachedmetap(rel, &metabuf, false);
1262  Assert(metap != NULL);
1263 
1264  /*
1265  * Loop until we get a lock on the correct target bucket.
1266  */
1267  for (;;)
1268  {
1269  /*
1270  * Compute the target bucket number, and convert to block number.
1271  */
1272  bucket = _hash_hashkey2bucket(hashkey,
1273  metap->hashm_maxbucket,
1274  metap->hashm_highmask,
1275  metap->hashm_lowmask);
1276 
1277  blkno = BUCKET_TO_BLKNO(metap, bucket);
1278 
1279  /* Fetch the primary bucket page for the bucket */
1280  buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
1281  page = BufferGetPage(buf);
1282  opaque = (HashPageOpaque) PageGetSpecialPointer(page);
1283  Assert(opaque->hasho_bucket == bucket);
1284 
1285  /*
1286  * If this bucket hasn't been split, we're done.
1287  *
1288  * NB: The check for InvalidBlockNumber is only needed for on-disk
1289  * compatibility with indexes created before we started storing
1290  * hashm_maxbucket in the primary page's hasho_prevblkno.
1291  */
1292  if (opaque->hasho_prevblkno == InvalidBlockNumber ||
1293  opaque->hasho_prevblkno <= metap->hashm_maxbucket)
1294  break;
1295 
1296  /* Drop lock on this buffer, update cached metapage, and retry. */
1297  _hash_relbuf(rel, buf);
1298  metap = _hash_getcachedmetap(rel, &metabuf, true);
1299  Assert(metap != NULL);
1300  }
1301 
1302  if (BufferIsValid(metabuf))
1303  _hash_dropbuf(rel, metabuf);
1304 
1305  if (cachedmetap)
1306  *cachedmetap = metap;
1307 
1308  return buf;
1309 }
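A caller such as an index scan or insert would typically use this function along the lines of the hedged sketch below (not part of hashpage.c; example_locate_bucket is hypothetical and assumes the hash key has already been computed).

static void
example_locate_bucket(Relation rel, uint32 hashkey)
{
	HashMetaPage cachedmetap;
	Buffer		buf;

	/* returns the primary bucket page, read-locked and pinned */
	buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ,
										  &cachedmetap);

	/* ... walk the bucket chain starting at "buf" ... */

	_hash_relbuf(rel, buf);
}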