PostgreSQL Source Code  git master
hashovfl.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * hashovfl.c
4  * Overflow page management code for the Postgres hash access method
5  *
6  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/hash/hashovfl.c
12  *
13  * NOTES
14  * Overflow pages look like ordinary relation pages.
15  *
16  *-------------------------------------------------------------------------
17  */
18 #include "postgres.h"
19 
20 #include "access/hash.h"
21 #include "access/hash_xlog.h"
22 #include "access/xloginsert.h"
23 #include "miscadmin.h"
24 #include "utils/rel.h"
25 
26 
27 static uint32 _hash_firstfreebit(uint32 map);
28 
29 
30 /*
31  * Convert overflow page bit number (its index in the free-page bitmaps)
32  * to block number within the index.
33  */
34 static BlockNumber
36 {
37  uint32 splitnum = metap->hashm_ovflpoint;
38  uint32 i;
39 
40  /* Convert zero-based bitnumber to 1-based page number */
41  ovflbitnum += 1;
42 
43  /* Determine the split number for this page (must be >= 1) */
44  for (i = 1;
45  i < splitnum && ovflbitnum > metap->hashm_spares[i];
46  i++)
47  /* loop */ ;
48 
49  /*
50  * Convert to absolute page number by adding the number of bucket pages
51  * that exist before this split point.
52  */
53  return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
54 }
55 
56 /*
57  * _hash_ovflblkno_to_bitno
58  *
59  * Convert overflow page block number to bit number for free-page bitmap.
60  */
61 uint32
63 {
64  uint32 splitnum = metap->hashm_ovflpoint;
65  uint32 i;
66  uint32 bitnum;
67 
68  /* Determine the split number containing this page */
69  for (i = 1; i <= splitnum; i++)
70  {
71  if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
72  break; /* oops */
73  bitnum = ovflblkno - _hash_get_totalbuckets(i);
74 
75  /*
76  * bitnum has to be greater than number of overflow page added in
77  * previous split point. The overflow page at this splitnum (i) if any
78  * should start from (_hash_get_totalbuckets(i) +
79  * metap->hashm_spares[i - 1] + 1).
80  */
81  if (bitnum > metap->hashm_spares[i - 1] &&
82  bitnum <= metap->hashm_spares[i])
83  return bitnum - 1; /* -1 to convert 1-based to 0-based */
84  }
85 
86  ereport(ERROR,
87  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
88  errmsg("invalid overflow block number %u", ovflblkno)));
89  return 0; /* keep compiler quiet */
90 }
91 
92 /*
93  * _hash_addovflpage
94  *
95  * Add an overflow page to the bucket whose last page is pointed to by 'buf'.
96  *
97  * On entry, the caller must hold a pin but no lock on 'buf'. The pin is
98  * dropped before exiting (we assume the caller is not interested in 'buf'
99  * anymore) if not asked to retain. The pin will be retained only for the
100  * primary bucket. The returned overflow page will be pinned and
101  * write-locked; it is guaranteed to be empty.
102  *
103  * The caller must hold a pin, but no lock, on the metapage buffer.
104  * That buffer is returned in the same state.
105  *
106  * NB: since this could be executed concurrently by multiple processes,
107  * one should not assume that the returned overflow page will be the
108  * immediate successor of the originally passed 'buf'. Additional overflow
109  * pages might have been added to the bucket chain in between.
110  */
111 Buffer
112 _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
113 {
114  Buffer ovflbuf;
115  Page page;
116  Page ovflpage;
117  HashPageOpaque pageopaque;
118  HashPageOpaque ovflopaque;
119  HashMetaPage metap;
120  Buffer mapbuf = InvalidBuffer;
121  Buffer newmapbuf = InvalidBuffer;
122  BlockNumber blkno;
123  uint32 orig_firstfree;
124  uint32 splitnum;
125  uint32 *freep = NULL;
126  uint32 max_ovflpg;
127  uint32 bit;
128  uint32 bitmap_page_bit;
129  uint32 first_page;
130  uint32 last_bit;
131  uint32 last_page;
132  uint32 i,
133  j;
134  bool page_found = false;
135 
136  /*
137  * Write-lock the tail page. Here, we need to maintain locking order such
138  * that, first acquire the lock on tail page of bucket, then on meta page
139  * to find and lock the bitmap page and if it is found, then lock on meta
140  * page is released, then finally acquire the lock on new overflow buffer.
141  * We need this locking order to avoid deadlock with backends that are
142  * doing inserts.
143  *
144  * Note: We could have avoided locking many buffers here if we made two
145  * WAL records for acquiring an overflow page (one to allocate an overflow
146  * page and another to add it to overflow bucket chain). However, doing
147  * so can leak an overflow page, if the system crashes after allocation.
148  * Needless to say, it is better to have a single record from a
149  * performance point of view as well.
150  */
152 
153  /* probably redundant... */
155 
156  /* loop to find current tail page, in case someone else inserted too */
157  for (;;)
158  {
159  BlockNumber nextblkno;
160 
161  page = BufferGetPage(buf);
162  pageopaque = HashPageGetOpaque(page);
163  nextblkno = pageopaque->hasho_nextblkno;
164 
165  if (!BlockNumberIsValid(nextblkno))
166  break;
167 
168  /* we assume we do not need to write the unmodified page */
169  if (retain_pin)
170  {
171  /* pin will be retained only for the primary bucket page */
172  Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE);
174  }
175  else
176  _hash_relbuf(rel, buf);
177 
178  retain_pin = false;
179 
180  buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
181  }
182 
183  /* Get exclusive lock on the meta page */
185 
186  _hash_checkpage(rel, metabuf, LH_META_PAGE);
187  metap = HashPageGetMeta(BufferGetPage(metabuf));
188 
189  /* start search at hashm_firstfree */
190  orig_firstfree = metap->hashm_firstfree;
191  first_page = orig_firstfree >> BMPG_SHIFT(metap);
192  bit = orig_firstfree & BMPG_MASK(metap);
193  i = first_page;
194  j = bit / BITS_PER_MAP;
195  bit &= ~(BITS_PER_MAP - 1);
196 
197  /* outer loop iterates once per bitmap page */
198  for (;;)
199  {
200  BlockNumber mapblkno;
201  Page mappage;
202  uint32 last_inpage;
203 
204  /* want to end search with the last existing overflow page */
205  splitnum = metap->hashm_ovflpoint;
206  max_ovflpg = metap->hashm_spares[splitnum] - 1;
207  last_page = max_ovflpg >> BMPG_SHIFT(metap);
208  last_bit = max_ovflpg & BMPG_MASK(metap);
209 
210  if (i > last_page)
211  break;
212 
213  Assert(i < metap->hashm_nmaps);
214  mapblkno = metap->hashm_mapp[i];
215 
216  if (i == last_page)
217  last_inpage = last_bit;
218  else
219  last_inpage = BMPGSZ_BIT(metap) - 1;
220 
221  /* Release exclusive lock on metapage while reading bitmap page */
222  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
223 
224  mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
225  mappage = BufferGetPage(mapbuf);
226  freep = HashPageGetBitmap(mappage);
227 
228  for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
229  {
230  if (freep[j] != ALL_SET)
231  {
232  page_found = true;
233 
234  /* Reacquire exclusive lock on the meta page */
236 
237  /* convert bit to bit number within page */
238  bit += _hash_firstfreebit(freep[j]);
239  bitmap_page_bit = bit;
240 
241  /* convert bit to absolute bit number */
242  bit += (i << BMPG_SHIFT(metap));
243  /* Calculate address of the recycled overflow page */
244  blkno = bitno_to_blkno(metap, bit);
245 
246  /* Fetch and init the recycled page */
247  ovflbuf = _hash_getinitbuf(rel, blkno);
248 
249  goto found;
250  }
251  }
252 
253  /* No free space here, try to advance to next map page */
254  _hash_relbuf(rel, mapbuf);
255  mapbuf = InvalidBuffer;
256  i++;
257  j = 0; /* scan from start of next map page */
258  bit = 0;
259 
260  /* Reacquire exclusive lock on the meta page */
262  }
263 
264  /*
265  * No free pages --- have to extend the relation to add an overflow page.
266  * First, check to see if we have to add a new bitmap page too.
267  */
268  if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
269  {
270  /*
271  * We create the new bitmap page with all pages marked "in use".
272  * Actually two pages in the new bitmap's range will exist
273  * immediately: the bitmap page itself, and the following page which
274  * is the one we return to the caller. Both of these are correctly
275  * marked "in use". Subsequent pages do not exist yet, but it is
276  * convenient to pre-mark them as "in use" too.
277  */
278  bit = metap->hashm_spares[splitnum];
279 
280  /* metapage already has a write lock */
281  if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
282  ereport(ERROR,
283  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
284  errmsg("out of overflow pages in hash index \"%s\"",
285  RelationGetRelationName(rel))));
286 
287  newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
288  }
289  else
290  {
291  /*
292  * Nothing to do here; since the page will be past the last used page,
293  * we know its bitmap bit was preinitialized to "in use".
294  */
295  }
296 
297  /* Calculate address of the new overflow page */
298  bit = BufferIsValid(newmapbuf) ?
299  metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
300  blkno = bitno_to_blkno(metap, bit);
301 
302  /*
303  * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
304  * relation length stays in sync with ours. XXX It's annoying to do this
305  * with metapage write lock held; would be better to use a lock that
306  * doesn't block incoming searches.
307  *
308  * It is okay to hold two buffer locks here (one on tail page of bucket
309  * and other on new overflow page) since there cannot be anyone else
310  * contending for access to ovflbuf.
311  */
312  ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
313 
314 found:
315 
316  /*
317  * Do the update. No ereport(ERROR) until changes are logged. We want to
318  * log the changes for bitmap page and overflow page together to avoid
319  * loss of pages in case the new page is added.
320  */
322 
323  if (page_found)
324  {
325  Assert(BufferIsValid(mapbuf));
326 
327  /* mark page "in use" in the bitmap */
328  SETBIT(freep, bitmap_page_bit);
329  MarkBufferDirty(mapbuf);
330  }
331  else
332  {
333  /* update the count to indicate new overflow page is added */
334  metap->hashm_spares[splitnum]++;
335 
336  if (BufferIsValid(newmapbuf))
337  {
338  _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
339  MarkBufferDirty(newmapbuf);
340 
341  /* add the new bitmap page to the metapage's list of bitmaps */
342  metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
343  metap->hashm_nmaps++;
344  metap->hashm_spares[splitnum]++;
345  }
346 
347  MarkBufferDirty(metabuf);
348 
349  /*
350  * for new overflow page, we don't need to explicitly set the bit in
351  * bitmap page, as by default that will be set to "in use".
352  */
353  }
354 
355  /*
356  * Adjust hashm_firstfree to avoid redundant searches. But don't risk
357  * changing it if someone moved it while we were searching bitmap pages.
358  */
359  if (metap->hashm_firstfree == orig_firstfree)
360  {
361  metap->hashm_firstfree = bit + 1;
362  MarkBufferDirty(metabuf);
363  }
364 
365  /* initialize new overflow page */
366  ovflpage = BufferGetPage(ovflbuf);
367  ovflopaque = HashPageGetOpaque(ovflpage);
369  ovflopaque->hasho_nextblkno = InvalidBlockNumber;
370  ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
371  ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
372  ovflopaque->hasho_page_id = HASHO_PAGE_ID;
373 
374  MarkBufferDirty(ovflbuf);
375 
376  /* logically chain overflow page to previous page */
377  pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
378 
380 
381  /* XLOG stuff */
382  if (RelationNeedsWAL(rel))
383  {
384  XLogRecPtr recptr;
385  xl_hash_add_ovfl_page xlrec;
386 
387  xlrec.bmpage_found = page_found;
388  xlrec.bmsize = metap->hashm_bmsize;
389 
390  XLogBeginInsert();
391  XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage);
392 
394  XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket));
395 
397 
398  if (BufferIsValid(mapbuf))
399  {
401  XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32));
402  }
403 
404  if (BufferIsValid(newmapbuf))
405  XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT);
406 
407  XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD);
408  XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32));
409 
410  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE);
411 
412  PageSetLSN(BufferGetPage(ovflbuf), recptr);
413  PageSetLSN(BufferGetPage(buf), recptr);
414 
415  if (BufferIsValid(mapbuf))
416  PageSetLSN(BufferGetPage(mapbuf), recptr);
417 
418  if (BufferIsValid(newmapbuf))
419  PageSetLSN(BufferGetPage(newmapbuf), recptr);
420 
421  PageSetLSN(BufferGetPage(metabuf), recptr);
422  }
423 
425 
426  if (retain_pin)
428  else
429  _hash_relbuf(rel, buf);
430 
431  if (BufferIsValid(mapbuf))
432  _hash_relbuf(rel, mapbuf);
433 
434  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
435 
436  if (BufferIsValid(newmapbuf))
437  _hash_relbuf(rel, newmapbuf);
438 
439  return ovflbuf;
440 }
441 
442 /*
443  * _hash_firstfreebit()
444  *
445  * Return the number of the first bit that is not set in the word 'map'.
446  */
447 static uint32
449 {
450  uint32 i,
451  mask;
452 
453  mask = 0x1;
454  for (i = 0; i < BITS_PER_MAP; i++)
455  {
456  if (!(mask & map))
457  return i;
458  mask <<= 1;
459  }
460 
461  elog(ERROR, "firstfreebit found no free bit");
462 
463  return 0; /* keep compiler quiet */
464 }
465 
466 /*
467  * _hash_freeovflpage() -
468  *
469  * Remove this overflow page from its bucket's chain, and mark the page as
470  * free. On entry, ovflbuf is write-locked; it is released before exiting.
471  *
472  * Add the tuples (itups) to wbuf in this function. We could do that in the
473  * caller as well, but the advantage of doing it here is we can easily write
474  * the WAL for XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and
475  * removal of overflow page has to done as an atomic operation, otherwise
476  * during replay on standby users might find duplicate records.
477  *
478  * Since this function is invoked in VACUUM, we provide an access strategy
479  * parameter that controls fetches of the bucket pages.
480  *
481  * Returns the block number of the page that followed the given page
482  * in the bucket, or InvalidBlockNumber if no following page.
483  *
484  * NB: caller must not hold lock on metapage, nor on page, that's next to
485  * ovflbuf in the bucket chain. We don't acquire the lock on page that's
486  * prior to ovflbuf in chain if it is same as wbuf because the caller already
487  * has a lock on same.
488  */
490 _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
491  Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
492  Size *tups_size, uint16 nitups,
493  BufferAccessStrategy bstrategy)
494 {
495  HashMetaPage metap;
496  Buffer metabuf;
497  Buffer mapbuf;
498  BlockNumber ovflblkno;
499  BlockNumber prevblkno;
500  BlockNumber blkno;
501  BlockNumber nextblkno;
502  BlockNumber writeblkno;
503  HashPageOpaque ovflopaque;
504  Page ovflpage;
505  Page mappage;
506  uint32 *freep;
507  uint32 ovflbitno;
508  int32 bitmappage,
509  bitmapbit;
511  Buffer prevbuf = InvalidBuffer;
512  Buffer nextbuf = InvalidBuffer;
513  bool update_metap = false;
514 
515  /* Get information from the doomed page */
516  _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
517  ovflblkno = BufferGetBlockNumber(ovflbuf);
518  ovflpage = BufferGetPage(ovflbuf);
519  ovflopaque = HashPageGetOpaque(ovflpage);
520  nextblkno = ovflopaque->hasho_nextblkno;
521  prevblkno = ovflopaque->hasho_prevblkno;
522  writeblkno = BufferGetBlockNumber(wbuf);
523  bucket = ovflopaque->hasho_bucket;
524 
525  /*
526  * Fix up the bucket chain. this is a doubly-linked list, so we must fix
527  * up the bucket chain members behind and ahead of the overflow page being
528  * deleted. Concurrency issues are avoided by using lock chaining as
529  * described atop hashbucketcleanup.
530  */
531  if (BlockNumberIsValid(prevblkno))
532  {
533  if (prevblkno == writeblkno)
534  prevbuf = wbuf;
535  else
536  prevbuf = _hash_getbuf_with_strategy(rel,
537  prevblkno,
538  HASH_WRITE,
540  bstrategy);
541  }
542  if (BlockNumberIsValid(nextblkno))
543  nextbuf = _hash_getbuf_with_strategy(rel,
544  nextblkno,
545  HASH_WRITE,
547  bstrategy);
548 
549  /* Note: bstrategy is intentionally not used for metapage and bitmap */
550 
551  /* Read the metapage so we can determine which bitmap page to use */
553  metap = HashPageGetMeta(BufferGetPage(metabuf));
554 
555  /* Identify which bit to set */
556  ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);
557 
558  bitmappage = ovflbitno >> BMPG_SHIFT(metap);
559  bitmapbit = ovflbitno & BMPG_MASK(metap);
560 
561  if (bitmappage >= metap->hashm_nmaps)
562  elog(ERROR, "invalid overflow bit number %u", ovflbitno);
563  blkno = metap->hashm_mapp[bitmappage];
564 
565  /* Release metapage lock while we access the bitmap page */
566  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
567 
568  /* read the bitmap page to clear the bitmap bit */
569  mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
570  mappage = BufferGetPage(mapbuf);
571  freep = HashPageGetBitmap(mappage);
572  Assert(ISSET(freep, bitmapbit));
573 
574  /* Get write-lock on metapage to update firstfree */
576 
577  /* This operation needs to log multiple tuples, prepare WAL for that */
578  if (RelationNeedsWAL(rel))
580 
582 
583  /*
584  * we have to insert tuples on the "write" page, being careful to preserve
585  * hashkey ordering. (If we insert many tuples into the same "write" page
586  * it would be worth qsort'ing them).
587  */
588  if (nitups > 0)
589  {
590  _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
591  MarkBufferDirty(wbuf);
592  }
593 
594  /*
595  * Reinitialize the freed overflow page. Just zeroing the page won't
596  * work, because WAL replay routines expect pages to be initialized. See
597  * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. We are
598  * careful to make the special space valid here so that tools like
599  * pageinspect won't get confused.
600  */
601  _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));
602 
603  ovflopaque = HashPageGetOpaque(ovflpage);
604 
605  ovflopaque->hasho_prevblkno = InvalidBlockNumber;
606  ovflopaque->hasho_nextblkno = InvalidBlockNumber;
607  ovflopaque->hasho_bucket = InvalidBucket;
608  ovflopaque->hasho_flag = LH_UNUSED_PAGE;
609  ovflopaque->hasho_page_id = HASHO_PAGE_ID;
610 
611  MarkBufferDirty(ovflbuf);
612 
613  if (BufferIsValid(prevbuf))
614  {
615  Page prevpage = BufferGetPage(prevbuf);
616  HashPageOpaque prevopaque = HashPageGetOpaque(prevpage);
617 
618  Assert(prevopaque->hasho_bucket == bucket);
619  prevopaque->hasho_nextblkno = nextblkno;
620  MarkBufferDirty(prevbuf);
621  }
622  if (BufferIsValid(nextbuf))
623  {
624  Page nextpage = BufferGetPage(nextbuf);
625  HashPageOpaque nextopaque = HashPageGetOpaque(nextpage);
626 
627  Assert(nextopaque->hasho_bucket == bucket);
628  nextopaque->hasho_prevblkno = prevblkno;
629  MarkBufferDirty(nextbuf);
630  }
631 
632  /* Clear the bitmap bit to indicate that this overflow page is free */
633  CLRBIT(freep, bitmapbit);
634  MarkBufferDirty(mapbuf);
635 
636  /* if this is now the first free page, update hashm_firstfree */
637  if (ovflbitno < metap->hashm_firstfree)
638  {
639  metap->hashm_firstfree = ovflbitno;
640  update_metap = true;
641  MarkBufferDirty(metabuf);
642  }
643 
644  /* XLOG stuff */
645  if (RelationNeedsWAL(rel))
646  {
647  xl_hash_squeeze_page xlrec;
648  XLogRecPtr recptr;
649  int i;
650 
651  xlrec.prevblkno = prevblkno;
652  xlrec.nextblkno = nextblkno;
653  xlrec.ntups = nitups;
654  xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf);
655  xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf);
656 
657  XLogBeginInsert();
658  XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage);
659 
660  /*
661  * bucket buffer needs to be registered to ensure that we can acquire
662  * a cleanup lock on it during replay.
663  */
664  if (!xlrec.is_prim_bucket_same_wrt)
666 
668  if (xlrec.ntups > 0)
669  {
670  XLogRegisterBufData(1, (char *) itup_offsets,
671  nitups * sizeof(OffsetNumber));
672  for (i = 0; i < nitups; i++)
673  XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
674  }
675 
676  XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);
677 
678  /*
679  * If prevpage and the writepage (block in which we are moving tuples
680  * from overflow) are same, then no need to separately register
681  * prevpage. During replay, we can directly update the nextblock in
682  * writepage.
683  */
684  if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
685  XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);
686 
687  if (BufferIsValid(nextbuf))
688  XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);
689 
691  XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32));
692 
693  if (update_metap)
694  {
695  XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
696  XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32));
697  }
698 
699  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);
700 
701  PageSetLSN(BufferGetPage(wbuf), recptr);
702  PageSetLSN(BufferGetPage(ovflbuf), recptr);
703 
704  if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
705  PageSetLSN(BufferGetPage(prevbuf), recptr);
706  if (BufferIsValid(nextbuf))
707  PageSetLSN(BufferGetPage(nextbuf), recptr);
708 
709  PageSetLSN(BufferGetPage(mapbuf), recptr);
710 
711  if (update_metap)
712  PageSetLSN(BufferGetPage(metabuf), recptr);
713  }
714 
716 
717  /* release previous bucket if it is not same as write bucket */
718  if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
719  _hash_relbuf(rel, prevbuf);
720 
721  if (BufferIsValid(ovflbuf))
722  _hash_relbuf(rel, ovflbuf);
723 
724  if (BufferIsValid(nextbuf))
725  _hash_relbuf(rel, nextbuf);
726 
727  _hash_relbuf(rel, mapbuf);
728  _hash_relbuf(rel, metabuf);
729 
730  return nextblkno;
731 }
732 
733 
734 /*
735  * _hash_initbitmapbuffer()
736  *
737  * Initialize a new bitmap page. All bits in the new bitmap page are set to
738  * "1", indicating "in use".
739  */
740 void
741 _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
742 {
743  Page pg;
744  HashPageOpaque op;
745  uint32 *freep;
746 
747  pg = BufferGetPage(buf);
748 
749  /* initialize the page */
750  if (initpage)
752 
753  /* initialize the page's special space */
754  op = HashPageGetOpaque(pg);
760 
761  /* set all of the bits to 1 */
762  freep = HashPageGetBitmap(pg);
763  MemSet(freep, 0xFF, bmsize);
764 
765  /*
766  * Set pd_lower just past the end of the bitmap page data. We could even
767  * set pd_lower equal to pd_upper, but this is more precise and makes the
768  * page look compressible to xlog.c.
769  */
770  ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg;
771 }
772 
773 
774 /*
775  * _hash_squeezebucket(rel, bucket)
776  *
777  * Try to squeeze the tuples onto pages occurring earlier in the
778  * bucket chain in an attempt to free overflow pages. When we start
779  * the "squeezing", the page from which we start taking tuples (the
780  * "read" page) is the last bucket in the bucket chain and the page
781  * onto which we start squeezing tuples (the "write" page) is the
782  * first page in the bucket chain. The read page works backward and
783  * the write page works forward; the procedure terminates when the
784  * read page and write page are the same page.
785  *
786  * At completion of this procedure, it is guaranteed that all pages in
787  * the bucket are nonempty, unless the bucket is totally empty (in
788  * which case all overflow pages will be freed). The original implementation
789  * required that to be true on entry as well, but it's a lot easier for
790  * callers to leave empty overflow pages and let this guy clean it up.
791  *
792  * Caller must acquire cleanup lock on the primary page of the target
793  * bucket to exclude any scans that are in progress, which could easily
794  * be confused into returning the same tuple more than once or some tuples
795  * not at all by the rearrangement we are performing here. To prevent
796  * any concurrent scan to cross the squeeze scan we use lock chaining
797  * similar to hashbucketcleanup. Refer comments atop hashbucketcleanup.
798  *
799  * We need to retain a pin on the primary bucket to ensure that no concurrent
800  * split can start.
801  *
802  * Since this function is invoked in VACUUM, we provide an access strategy
803  * parameter that controls fetches of the bucket pages.
804  */
805 void
807  Bucket bucket,
808  BlockNumber bucket_blkno,
809  Buffer bucket_buf,
810  BufferAccessStrategy bstrategy)
811 {
812  BlockNumber wblkno;
813  BlockNumber rblkno;
814  Buffer wbuf;
815  Buffer rbuf;
816  Page wpage;
817  Page rpage;
818  HashPageOpaque wopaque;
819  HashPageOpaque ropaque;
820 
821  /*
822  * start squeezing into the primary bucket page.
823  */
824  wblkno = bucket_blkno;
825  wbuf = bucket_buf;
826  wpage = BufferGetPage(wbuf);
827  wopaque = HashPageGetOpaque(wpage);
828 
829  /*
830  * if there aren't any overflow pages, there's nothing to squeeze. caller
831  * is responsible for releasing the pin on primary bucket page.
832  */
833  if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
834  {
836  return;
837  }
838 
839  /*
840  * Find the last page in the bucket chain by starting at the base bucket
841  * page and working forward. Note: we assume that a hash bucket chain is
842  * usually smaller than the buffer ring being used by VACUUM, else using
843  * the access strategy here would be counterproductive.
844  */
845  rbuf = InvalidBuffer;
846  ropaque = wopaque;
847  do
848  {
849  rblkno = ropaque->hasho_nextblkno;
850  if (rbuf != InvalidBuffer)
851  _hash_relbuf(rel, rbuf);
852  rbuf = _hash_getbuf_with_strategy(rel,
853  rblkno,
854  HASH_WRITE,
856  bstrategy);
857  rpage = BufferGetPage(rbuf);
858  ropaque = HashPageGetOpaque(rpage);
859  Assert(ropaque->hasho_bucket == bucket);
860  } while (BlockNumberIsValid(ropaque->hasho_nextblkno));
861 
862  /*
863  * squeeze the tuples.
864  */
865  for (;;)
866  {
867  OffsetNumber roffnum;
868  OffsetNumber maxroffnum;
869  OffsetNumber deletable[MaxOffsetNumber];
871  Size tups_size[MaxIndexTuplesPerPage];
872  OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
873  uint16 ndeletable = 0;
874  uint16 nitups = 0;
875  Size all_tups_size = 0;
876  int i;
877  bool retain_pin = false;
878 
879 readpage:
880  /* Scan each tuple in "read" page */
881  maxroffnum = PageGetMaxOffsetNumber(rpage);
882  for (roffnum = FirstOffsetNumber;
883  roffnum <= maxroffnum;
884  roffnum = OffsetNumberNext(roffnum))
885  {
886  IndexTuple itup;
887  Size itemsz;
888 
889  /* skip dead tuples */
890  if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
891  continue;
892 
893  itup = (IndexTuple) PageGetItem(rpage,
894  PageGetItemId(rpage, roffnum));
895  itemsz = IndexTupleSize(itup);
896  itemsz = MAXALIGN(itemsz);
897 
898  /*
899  * Walk up the bucket chain, looking for a page big enough for
900  * this item and all other accumulated items. Exit if we reach
901  * the read page.
902  */
903  while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
904  {
905  Buffer next_wbuf = InvalidBuffer;
906  bool tups_moved = false;
907 
908  Assert(!PageIsEmpty(wpage));
909 
910  if (wblkno == bucket_blkno)
911  retain_pin = true;
912 
913  wblkno = wopaque->hasho_nextblkno;
914  Assert(BlockNumberIsValid(wblkno));
915 
916  /* don't need to move to next page if we reached the read page */
917  if (wblkno != rblkno)
918  next_wbuf = _hash_getbuf_with_strategy(rel,
919  wblkno,
920  HASH_WRITE,
922  bstrategy);
923 
924  if (nitups > 0)
925  {
926  Assert(nitups == ndeletable);
927 
928  /*
929  * This operation needs to log multiple tuples, prepare
930  * WAL for that.
931  */
932  if (RelationNeedsWAL(rel))
933  XLogEnsureRecordSpace(0, 3 + nitups);
934 
936 
937  /*
938  * we have to insert tuples on the "write" page, being
939  * careful to preserve hashkey ordering. (If we insert
940  * many tuples into the same "write" page it would be
941  * worth qsort'ing them).
942  */
943  _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
944  MarkBufferDirty(wbuf);
945 
946  /* Delete tuples we already moved off read page */
947  PageIndexMultiDelete(rpage, deletable, ndeletable);
948  MarkBufferDirty(rbuf);
949 
950  /* XLOG stuff */
951  if (RelationNeedsWAL(rel))
952  {
953  XLogRecPtr recptr;
955 
956  xlrec.ntups = nitups;
957  xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf);
958 
959  XLogBeginInsert();
961 
962  /*
963  * bucket buffer needs to be registered to ensure that
964  * we can acquire a cleanup lock on it during replay.
965  */
966  if (!xlrec.is_prim_bucket_same_wrt)
968 
970  XLogRegisterBufData(1, (char *) itup_offsets,
971  nitups * sizeof(OffsetNumber));
972  for (i = 0; i < nitups; i++)
973  XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
974 
976  XLogRegisterBufData(2, (char *) deletable,
977  ndeletable * sizeof(OffsetNumber));
978 
979  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);
980 
981  PageSetLSN(BufferGetPage(wbuf), recptr);
982  PageSetLSN(BufferGetPage(rbuf), recptr);
983  }
984 
986 
987  tups_moved = true;
988  }
989 
990  /*
991  * release the lock on previous page after acquiring the lock
992  * on next page
993  */
994  if (retain_pin)
996  else
997  _hash_relbuf(rel, wbuf);
998 
999  /* nothing more to do if we reached the read page */
1000  if (rblkno == wblkno)
1001  {
1002  _hash_relbuf(rel, rbuf);
1003  return;
1004  }
1005 
1006  wbuf = next_wbuf;
1007  wpage = BufferGetPage(wbuf);
1008  wopaque = HashPageGetOpaque(wpage);
1009  Assert(wopaque->hasho_bucket == bucket);
1010  retain_pin = false;
1011 
1012  /* be tidy */
1013  for (i = 0; i < nitups; i++)
1014  pfree(itups[i]);
1015  nitups = 0;
1016  all_tups_size = 0;
1017  ndeletable = 0;
1018 
1019  /*
1020  * after moving the tuples, rpage would have been compacted,
1021  * so we need to rescan it.
1022  */
1023  if (tups_moved)
1024  goto readpage;
1025  }
1026 
1027  /* remember tuple for deletion from "read" page */
1028  deletable[ndeletable++] = roffnum;
1029 
1030  /*
1031  * we need a copy of index tuples as they can be freed as part of
1032  * overflow page, however we need them to write a WAL record in
1033  * _hash_freeovflpage.
1034  */
1035  itups[nitups] = CopyIndexTuple(itup);
1036  tups_size[nitups++] = itemsz;
1037  all_tups_size += itemsz;
1038  }
1039 
1040  /*
1041  * If we reach here, there are no live tuples on the "read" page ---
1042  * it was empty when we got to it, or we moved them all. So we can
1043  * just free the page without bothering with deleting tuples
1044  * individually. Then advance to the previous "read" page.
1045  *
1046  * Tricky point here: if our read and write pages are adjacent in the
1047  * bucket chain, our write lock on wbuf will conflict with
1048  * _hash_freeovflpage's attempt to update the sibling links of the
1049  * removed page. In that case, we don't need to lock it again.
1050  */
1051  rblkno = ropaque->hasho_prevblkno;
1052  Assert(BlockNumberIsValid(rblkno));
1053 
1054  /* free this overflow page (releases rbuf) */
1055  _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
1056  tups_size, nitups, bstrategy);
1057 
1058  /* be tidy */
1059  for (i = 0; i < nitups; i++)
1060  pfree(itups[i]);
1061 
1062  /* are we freeing the page adjacent to wbuf? */
1063  if (rblkno == wblkno)
1064  {
1065  /* retain the pin on primary bucket page till end of bucket scan */
1066  if (wblkno == bucket_blkno)
1068  else
1069  _hash_relbuf(rel, wbuf);
1070  return;
1071  }
1072 
1073  rbuf = _hash_getbuf_with_strategy(rel,
1074  rblkno,
1075  HASH_WRITE,
1077  bstrategy);
1078  rpage = BufferGetPage(rbuf);
1079  ropaque = HashPageGetOpaque(rpage);
1080  Assert(ropaque->hasho_bucket == bucket);
1081  }
1082 
1083  /* NOTREACHED */
1084 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define CLRBIT(x, i)
Definition: blutils.c:31
#define SETBIT(x, i)
Definition: blutils.c:32
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2755
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1573
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4156
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:96
#define BufferGetPageSize(buffer)
Definition: bufmgr.h:156
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:98
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
Size PageGetFreeSpaceForMultipleTuples(Page page, int ntups)
Definition: bufpage.c:934
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1161
PageHeaderData * PageHeader
Definition: bufpage.h:166
Pointer Page
Definition: bufpage.h:78
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:356
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:234
#define PageIsEmpty(page)
Definition: bufpage.h:221
#define PageGetItem(page, itemId)
Definition: bufpage.h:339
#define PageSetLSN(page, lsn)
Definition: bufpage.h:367
unsigned short uint16
Definition: c.h:440
unsigned int uint32
Definition: c.h:441
#define MAXALIGN(LEN)
Definition: c.h:757
signed int int32
Definition: c.h:429
#define MemSet(start, val, len)
Definition: c.h:1008
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:155
size_t Size
Definition: c.h:540
int errcode(int sqlerrcode)
Definition: elog.c:693
int errmsg(const char *fmt,...)
Definition: elog.c:904
#define ERROR
Definition: elog.h:33
#define elog(elevel,...)
Definition: elog.h:218
#define ereport(elevel,...)
Definition: elog.h:143
#define HashPageGetOpaque(page)
Definition: hash.h:88
#define LH_BUCKET_PAGE
Definition: hash.h:55
#define HASH_MAX_BITMAPS
Definition: hash.h:230
#define BMPG_MASK(metap)
Definition: hash.h:314
#define HASH_WRITE
Definition: hash.h:340
#define LH_UNUSED_PAGE
Definition: hash.h:53
#define BITS_PER_MAP
Definition: hash.h:329
#define HashPageGetBitmap(page)
Definition: hash.h:316
#define LH_META_PAGE
Definition: hash.h:57
#define HASHO_PAGE_ID
Definition: hash.h:101
#define ISSET(A, N)
Definition: hash.h:334
#define HashPageGetMeta(page)
Definition: hash.h:323
#define HASH_READ
Definition: hash.h:339
#define BMPGSZ_BIT(metap)
Definition: hash.h:312
#define HASH_METAPAGE
Definition: hash.h:198
#define LH_PAGE_TYPE
Definition: hash.h:63
uint32 Bucket
Definition: hash.h:35
#define ALL_SET
Definition: hash.h:302
#define LH_BITMAP_PAGE
Definition: hash.h:56
#define BMPG_SHIFT(metap)
Definition: hash.h:313
#define LH_OVERFLOW_PAGE
Definition: hash.h:54
#define InvalidBucket
Definition: hash.h:37
#define HASH_XLOG_FREE_OVFL_BUFS
Definition: hash_xlog.h:22
#define XLOG_HASH_SQUEEZE_PAGE
Definition: hash_xlog.h:35
#define SizeOfHashAddOvflPage
Definition: hash_xlog.h:80
#define XLOG_HASH_ADD_OVFL_PAGE
Definition: hash_xlog.h:30
#define SizeOfHashSqueezePage
Definition: hash_xlog.h:167
#define SizeOfHashMovePageContents
Definition: hash_xlog.h:138
#define XLOG_HASH_MOVE_PAGE_CONTENTS
Definition: hash_xlog.h:34
void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, OffsetNumber *itup_offsets, uint16 nitups)
Definition: hashinsert.c:300
void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Buffer bucket_buf, BufferAccessStrategy bstrategy)
Definition: hashovfl.c:806
static uint32 _hash_firstfreebit(uint32 map)
Definition: hashovfl.c:448
uint32 _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
Definition: hashovfl.c:62
BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy)
Definition: hashovfl.c:490
void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
Definition: hashovfl.c:741
Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
Definition: hashovfl.c:112
static BlockNumber bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
Definition: hashovfl.c:35
Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno)
Definition: hashpage.c:135
void _hash_relbuf(Relation rel, Buffer buf)
Definition: hashpage.c:266
void _hash_pageinit(Page page, Size size)
Definition: hashpage.c:596
Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
Definition: hashpage.c:70
Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, int access, int flags, BufferAccessStrategy bstrategy)
Definition: hashpage.c:239
Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
Definition: hashpage.c:198
uint32 _hash_get_totalbuckets(uint32 splitpoint_phase)
Definition: hashutil.c:175
void _hash_checkpage(Relation rel, Buffer buf, int flags)
Definition: hashutil.c:211
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:528
int j
Definition: isn.c:74
int i
Definition: isn.c:73
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
IndexTupleData * IndexTuple
Definition: itup.h:53
#define IndexTupleSize(itup)
Definition: itup.h:70
#define MaxIndexTuplesPerPage
Definition: itup.h:144
Assert(fmt[strlen(fmt) - 1] !='\n')
void pfree(void *pointer)
Definition: mcxt.c:1175
#define START_CRIT_SECTION()
Definition: miscadmin.h:148
#define END_CRIT_SECTION()
Definition: miscadmin.h:150
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define FirstOffsetNumber
Definition: off.h:27
#define MaxOffsetNumber
Definition: off.h:28
static char * buf
Definition: pg_test_fsync.c:67
#define RelationGetRelationName(relation)
Definition: rel.h:522
#define RelationNeedsWAL(relation)
Definition: rel.h:612
@ MAIN_FORKNUM
Definition: relpath.h:43
BlockNumber hashm_mapp[HASH_MAX_BITMAPS]
Definition: hash.h:264
uint32 hashm_spares[HASH_MAX_SPLITPOINTS]
Definition: hash.h:262
uint32 hashm_firstfree
Definition: hash.h:259
uint16 hashm_bmsize
Definition: hash.h:251
uint32 hashm_ovflpoint
Definition: hash.h:257
uint32 hashm_nmaps
Definition: hash.h:260
BlockNumber hasho_nextblkno
Definition: hash.h:80
uint16 hasho_flag
Definition: hash.h:82
BlockNumber hasho_prevblkno
Definition: hash.h:79
uint16 hasho_page_id
Definition: hash.h:83
Bucket hasho_bucket
Definition: hash.h:81
BlockNumber prevblkno
Definition: hash_xlog.h:155
bool is_prim_bucket_same_wrt
Definition: hash_xlog.h:158
bool is_prev_bucket_same_wrt
Definition: hash_xlog.h:161
BlockNumber nextblkno
Definition: hash_xlog.h:156
Datum bit(PG_FUNCTION_ARGS)
Definition: varbit.c:391
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:443
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:389
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:243
void XLogBeginInsert(void)
Definition: xloginsert.c:150
void XLogEnsureRecordSpace(int max_block_id, int ndatas)
Definition: xloginsert.c:176
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:351
#define REGBUF_STANDARD
Definition: xloginsert.h:34
#define REGBUF_NO_IMAGE
Definition: xloginsert.h:32
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33