1 /*-------------------------------------------------------------------------
2  *
3  * hashovfl.c
4  * Overflow page management code for the Postgres hash access method
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/hash/hashovfl.c
12  *
13  * NOTES
14  * Overflow pages look like ordinary relation pages.
15  *
16  *-------------------------------------------------------------------------
17  */
18 #include "postgres.h"
19 
20 #include "access/hash.h"
21 #include "access/hash_xlog.h"
22 #include "access/xloginsert.h"
23 #include "miscadmin.h"
24 #include "utils/rel.h"
25 
26 
27 static uint32 _hash_firstfreebit(uint32 map);
28 
29 
30 /*
31  * Convert overflow page bit number (its index in the free-page bitmaps)
32  * to block number within the index.
33  */
34 static BlockNumber
35 bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
36 {
37  uint32 splitnum = metap->hashm_ovflpoint;
38  uint32 i;
39 
40  /* Convert zero-based bitnumber to 1-based page number */
41  ovflbitnum += 1;
42 
43  /* Determine the split number for this page (must be >= 1) */
44  for (i = 1;
45  i < splitnum && ovflbitnum > metap->hashm_spares[i];
46  i++)
47  /* loop */ ;
48 
49  /*
50  * Convert to absolute page number by adding the number of bucket pages
51  * that exist before this split point.
52  */
53  return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
54 }
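/*
 * Worked example (editorial illustration; the metapage values below are
 * hypothetical, not taken from this file): suppose hashm_ovflpoint = 3 and
 * hashm_spares[] = {0, 0, 1, 3}, i.e. one overflow-space page was allocated
 * while split point 2 was active and two more during split point 3.  Then
 * bit 0 falls in split point 2, so the function returns
 * _hash_get_totalbuckets(2) + (0 + 1) = 4 + 1 = block 5, while bit 1 falls
 * in split point 3 and maps to _hash_get_totalbuckets(3) + (1 + 1) =
 * 8 + 2 = block 10.
 */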
55 
56 /*
57  * _hash_ovflblkno_to_bitno
58  *
59  * Convert overflow page block number to bit number for free-page bitmap.
60  */
61 uint32
62 _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
63 {
64  uint32 splitnum = metap->hashm_ovflpoint;
65  uint32 i;
66  uint32 bitnum;
67 
68  /* Determine the split number containing this page */
69  for (i = 1; i <= splitnum; i++)
70  {
71  if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
72  break; /* oops */
73  bitnum = ovflblkno - _hash_get_totalbuckets(i);
74 
75  /*
76  * bitnum has to be greater than the number of overflow pages added in
77  * previous split points. The overflow pages at this splitnum (i), if any,
78  * start from (_hash_get_totalbuckets(i) +
79  * metap->hashm_spares[i - 1] + 1).
80  */
81  if (bitnum > metap->hashm_spares[i - 1] &&
82  bitnum <= metap->hashm_spares[i])
83  return bitnum - 1; /* -1 to convert 1-based to 0-based */
84  }
85 
86  ereport(ERROR,
87  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
88  errmsg("invalid overflow block number %u", ovflblkno)));
89  return 0; /* keep compiler quiet */
90 }
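/*
 * Worked example (editorial illustration; same hypothetical metapage state
 * as above, hashm_spares[] = {0, 0, 1, 3}): this is the inverse of
 * bitno_to_blkno(), so block 5 maps back to bit 0 and block 10 maps back to
 * bit 1, while a block number that cannot be an overflow page (for example a
 * primary bucket page) falls through to the ereport(ERROR) above.
 */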
91 
92 /*
93  * _hash_addovflpage
94  *
95  * Add an overflow page to the bucket whose last page is pointed to by 'buf'.
96  *
97  * On entry, the caller must hold a pin but no lock on 'buf'. The pin is
98  * dropped before exiting (we assume the caller is not interested in 'buf'
99  * anymore) if not asked to retain. The pin will be retained only for the
100  * primary bucket. The returned overflow page will be pinned and
101  * write-locked; it is guaranteed to be empty.
102  *
103  * The caller must hold a pin, but no lock, on the metapage buffer.
104  * That buffer is returned in the same state.
105  *
106  * NB: since this could be executed concurrently by multiple processes,
107  * one should not assume that the returned overflow page will be the
108  * immediate successor of the originally passed 'buf'. Additional overflow
109  * pages might have been added to the bucket chain in between.
110  */
111 Buffer
112 _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
113 {
114  Buffer ovflbuf;
115  Page page;
116  Page ovflpage;
117  HashPageOpaque pageopaque;
118  HashPageOpaque ovflopaque;
119  HashMetaPage metap;
120  Buffer mapbuf = InvalidBuffer;
121  Buffer newmapbuf = InvalidBuffer;
122  BlockNumber blkno;
123  uint32 orig_firstfree;
124  uint32 splitnum;
125  uint32 *freep = NULL;
126  uint32 max_ovflpg;
127  uint32 bit;
128  uint32 bitmap_page_bit;
129  uint32 first_page;
130  uint32 last_bit;
131  uint32 last_page;
132  uint32 i,
133  j;
134  bool page_found = false;
135 
136  /*
137  * Write-lock the tail page. Here we need to maintain a strict locking
138  * order: first acquire the lock on the bucket's tail page, then on the meta
139  * page to find and lock the bitmap page; once found, the meta page lock is
140  * released, and finally the lock on the new overflow buffer is acquired.
141  * We need this locking order to avoid deadlock with backends that are
142  * doing inserts.
143  *
144  * Note: We could have avoided locking many buffers here if we made two
145  * WAL records for acquiring an overflow page (one to allocate an overflow
146  * page and another to add it to overflow bucket chain). However, doing
147  * so can leak an overflow page, if the system crashes after allocation.
148  * Needless to say, it is better to have a single record from a
149  * performance point of view as well.
150  */
151  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
152 
153  /* probably redundant... */
154  _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
155 
156  /* loop to find current tail page, in case someone else inserted too */
157  for (;;)
158  {
159  BlockNumber nextblkno;
160 
161  page = BufferGetPage(buf);
162  pageopaque = HashPageGetOpaque(page);
163  nextblkno = pageopaque->hasho_nextblkno;
164 
165  if (!BlockNumberIsValid(nextblkno))
166  break;
167 
168  /* we assume we do not need to write the unmodified page */
169  if (retain_pin)
170  {
171  /* pin will be retained only for the primary bucket page */
172  Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE);
173  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
174  }
175  else
176  _hash_relbuf(rel, buf);
177 
178  retain_pin = false;
179 
180  buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
181  }
182 
183  /* Get exclusive lock on the meta page */
184  LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
185 
186  _hash_checkpage(rel, metabuf, LH_META_PAGE);
187  metap = HashPageGetMeta(BufferGetPage(metabuf));
188 
189  /* start search at hashm_firstfree */
190  orig_firstfree = metap->hashm_firstfree;
191  first_page = orig_firstfree >> BMPG_SHIFT(metap);
192  bit = orig_firstfree & BMPG_MASK(metap);
193  i = first_page;
194  j = bit / BITS_PER_MAP;
195  bit &= ~(BITS_PER_MAP - 1);
196 
197  /* outer loop iterates once per bitmap page */
198  for (;;)
199  {
200  BlockNumber mapblkno;
201  Page mappage;
202  uint32 last_inpage;
203 
204  /* want to end search with the last existing overflow page */
205  splitnum = metap->hashm_ovflpoint;
206  max_ovflpg = metap->hashm_spares[splitnum] - 1;
207  last_page = max_ovflpg >> BMPG_SHIFT(metap);
208  last_bit = max_ovflpg & BMPG_MASK(metap);
209 
210  if (i > last_page)
211  break;
212 
213  Assert(i < metap->hashm_nmaps);
214  mapblkno = metap->hashm_mapp[i];
215 
216  if (i == last_page)
217  last_inpage = last_bit;
218  else
219  last_inpage = BMPGSZ_BIT(metap) - 1;
220 
221  /* Release exclusive lock on metapage while reading bitmap page */
222  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
223 
224  mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
225  mappage = BufferGetPage(mapbuf);
226  freep = HashPageGetBitmap(mappage);
227 
228  for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
229  {
230  if (freep[j] != ALL_SET)
231  {
232  page_found = true;
233 
234  /* Reacquire exclusive lock on the meta page */
235  LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
236 
237  /* convert bit to bit number within page */
238  bit += _hash_firstfreebit(freep[j]);
239  bitmap_page_bit = bit;
240 
241  /* convert bit to absolute bit number */
242  bit += (i << BMPG_SHIFT(metap));
243  /* Calculate address of the recycled overflow page */
244  blkno = bitno_to_blkno(metap, bit);
245 
246  /* Fetch and init the recycled page */
247  ovflbuf = _hash_getinitbuf(rel, blkno);
248 
249  goto found;
250  }
251  }
252 
253  /* No free space here, try to advance to next map page */
254  _hash_relbuf(rel, mapbuf);
255  mapbuf = InvalidBuffer;
256  i++;
257  j = 0; /* scan from start of next map page */
258  bit = 0;
259 
260  /* Reacquire exclusive lock on the meta page */
261  LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
262  }
263 
264  /*
265  * No free pages --- have to extend the relation to add an overflow page.
266  * First, check to see if we have to add a new bitmap page too.
267  */
268  if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
269  {
270  /*
271  * We create the new bitmap page with all pages marked "in use".
272  * Actually two pages in the new bitmap's range will exist
273  * immediately: the bitmap page itself, and the following page which
274  * is the one we return to the caller. Both of these are correctly
275  * marked "in use". Subsequent pages do not exist yet, but it is
276  * convenient to pre-mark them as "in use" too.
277  */
278  bit = metap->hashm_spares[splitnum];
279 
280  /* metapage already has a write lock */
281  if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
282  ereport(ERROR,
283  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
284  errmsg("out of overflow pages in hash index \"%s\"",
285  RelationGetRelationName(rel))));
286 
287  newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
288  }
289  else
290  {
291  /*
292  * Nothing to do here; since the page will be past the last used page,
293  * we know its bitmap bit was preinitialized to "in use".
294  */
295  }
296 
297  /* Calculate address of the new overflow page */
298  bit = BufferIsValid(newmapbuf) ?
299  metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
300  blkno = bitno_to_blkno(metap, bit);
301 
302  /*
303  * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
304  * relation length stays in sync with ours. XXX It's annoying to do this
305  * with metapage write lock held; would be better to use a lock that
306  * doesn't block incoming searches.
307  *
308  * It is okay to hold two buffer locks here (one on tail page of bucket
309  * and other on new overflow page) since there cannot be anyone else
310  * contending for access to ovflbuf.
311  */
312  ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
313 
314 found:
315 
316  /*
317  * Do the update. No ereport(ERROR) until changes are logged. We want to
318  * log the changes for bitmap page and overflow page together to avoid
319  * loss of pages in case the new page is added.
320  */
321  START_CRIT_SECTION();
322 
323  if (page_found)
324  {
325  Assert(BufferIsValid(mapbuf));
326 
327  /* mark page "in use" in the bitmap */
328  SETBIT(freep, bitmap_page_bit);
329  MarkBufferDirty(mapbuf);
330  }
331  else
332  {
333  /* update the count to indicate new overflow page is added */
334  metap->hashm_spares[splitnum]++;
335 
336  if (BufferIsValid(newmapbuf))
337  {
338  _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
339  MarkBufferDirty(newmapbuf);
340 
341  /* add the new bitmap page to the metapage's list of bitmaps */
342  metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
343  metap->hashm_nmaps++;
344  metap->hashm_spares[splitnum]++;
345  }
346 
347  MarkBufferDirty(metabuf);
348 
349  /*
350  * for new overflow page, we don't need to explicitly set the bit in
351  * bitmap page, as by default that will be set to "in use".
352  */
353  }
354 
355  /*
356  * Adjust hashm_firstfree to avoid redundant searches. But don't risk
357  * changing it if someone moved it while we were searching bitmap pages.
358  */
359  if (metap->hashm_firstfree == orig_firstfree)
360  {
361  metap->hashm_firstfree = bit + 1;
362  MarkBufferDirty(metabuf);
363  }
364 
365  /* initialize new overflow page */
366  ovflpage = BufferGetPage(ovflbuf);
367  ovflopaque = HashPageGetOpaque(ovflpage);
368  ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
369  ovflopaque->hasho_nextblkno = InvalidBlockNumber;
370  ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
371  ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
372  ovflopaque->hasho_page_id = HASHO_PAGE_ID;
373 
374  MarkBufferDirty(ovflbuf);
375 
376  /* logically chain overflow page to previous page */
377  pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
378 
379  MarkBufferDirty(buf);
380 
381  /* XLOG stuff */
382  if (RelationNeedsWAL(rel))
383  {
384  XLogRecPtr recptr;
385  xl_hash_add_ovfl_page xlrec;
386 
387  xlrec.bmpage_found = page_found;
388  xlrec.bmsize = metap->hashm_bmsize;
389 
390  XLogBeginInsert();
391  XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage);
392 
393  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
394  XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket));
395 
396  XLogRegisterBuffer(1, ovflbuf, REGBUF_WILL_INIT);
397 
398  if (BufferIsValid(mapbuf))
399  {
400  XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD);
401  XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32));
402  }
403 
404  if (BufferIsValid(newmapbuf))
405  XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT);
406 
407  XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD);
408  XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32));
409 
410  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE);
411 
412  PageSetLSN(BufferGetPage(ovflbuf), recptr);
413  PageSetLSN(BufferGetPage(buf), recptr);
414 
415  if (BufferIsValid(mapbuf))
416  PageSetLSN(BufferGetPage(mapbuf), recptr);
417 
418  if (BufferIsValid(newmapbuf))
419  PageSetLSN(BufferGetPage(newmapbuf), recptr);
420 
421  PageSetLSN(BufferGetPage(metabuf), recptr);
422  }
423 
424  END_CRIT_SECTION();
425 
426  if (retain_pin)
427  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
428  else
429  _hash_relbuf(rel, buf);
430 
431  if (BufferIsValid(mapbuf))
432  _hash_relbuf(rel, mapbuf);
433 
434  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
435 
436  if (BufferIsValid(newmapbuf))
437  _hash_relbuf(rel, newmapbuf);
438 
439  return ovflbuf;
440 }
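/*
 * Usage sketch (editorial illustration, not part of the original file): an
 * insertion path that finds the tail page of a bucket full could chain on a
 * new overflow page roughly as follows, retaining the pin only when 'buf'
 * is the primary bucket page:
 *
 *    buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf));
 *
 * Afterwards 'buf' is the new, empty, write-locked overflow page and the
 * caller can add its tuple there; rel, metabuf and bucket_buf are assumed
 * to come from the surrounding insert code.
 */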
441 
442 /*
443  * _hash_firstfreebit()
444  *
445  * Return the number of the first bit that is not set in the word 'map'.
446  */
447 static uint32
448 _hash_firstfreebit(uint32 map)
449 {
450  uint32 i,
451  mask;
452 
453  mask = 0x1;
454  for (i = 0; i < BITS_PER_MAP; i++)
455  {
456  if (!(mask & map))
457  return i;
458  mask <<= 1;
459  }
460 
461  elog(ERROR, "firstfreebit found no free bit");
462 
463  return 0; /* keep compiler quiet */
464 }
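/*
 * Worked example (editorial illustration): for map = 0x0000000F the low four
 * bits are set, so _hash_firstfreebit() returns 4.  A word equal to ALL_SET
 * has no free bit and would hit the elog(ERROR) above, which the caller in
 * _hash_addovflpage avoids by only passing words it has already seen to
 * differ from ALL_SET.
 */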
465 
466 /*
467  * _hash_freeovflpage() -
468  *
469  * Remove this overflow page from its bucket's chain, and mark the page as
470  * free. On entry, ovflbuf is write-locked; it is released before exiting.
471  *
472  * Add the tuples (itups) to wbuf in this function. We could do that in the
473  * caller as well, but the advantage of doing it here is we can easily write
474  * the WAL for XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and
475  * removal of overflow page has to done as an atomic operation, otherwise
476  * during replay on standby users might find duplicate records.
477  *
478  * Since this function is invoked in VACUUM, we provide an access strategy
479  * parameter that controls fetches of the bucket pages.
480  *
481  * Returns the block number of the page that followed the given page
482  * in the bucket, or InvalidBlockNumber if no following page.
483  *
484  * NB: caller must not hold lock on metapage, nor on page, that's next to
485  * ovflbuf in the bucket chain. We don't acquire the lock on page that's
486  * prior to ovflbuf in chain if it is same as wbuf because the caller already
487  * has a lock on same.
488  */
489 BlockNumber
490 _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
491  Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
492  Size *tups_size, uint16 nitups,
493  BufferAccessStrategy bstrategy)
494 {
495  HashMetaPage metap;
496  Buffer metabuf;
497  Buffer mapbuf;
498  BlockNumber ovflblkno;
499  BlockNumber prevblkno;
500  BlockNumber blkno;
501  BlockNumber nextblkno;
502  BlockNumber writeblkno;
503  HashPageOpaque ovflopaque;
504  Page ovflpage;
505  Page mappage;
506  uint32 *freep;
507  uint32 ovflbitno;
508  int32 bitmappage,
509  bitmapbit;
510  Bucket bucket PG_USED_FOR_ASSERTS_ONLY;
511  Buffer prevbuf = InvalidBuffer;
512  Buffer nextbuf = InvalidBuffer;
513  bool update_metap = false;
514 
515  /* Get information from the doomed page */
516  _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
517  ovflblkno = BufferGetBlockNumber(ovflbuf);
518  ovflpage = BufferGetPage(ovflbuf);
519  ovflopaque = HashPageGetOpaque(ovflpage);
520  nextblkno = ovflopaque->hasho_nextblkno;
521  prevblkno = ovflopaque->hasho_prevblkno;
522  writeblkno = BufferGetBlockNumber(wbuf);
523  bucket = ovflopaque->hasho_bucket;
524 
525  /*
526  * Fix up the bucket chain. this is a doubly-linked list, so we must fix
527  * up the bucket chain members behind and ahead of the overflow page being
528  * deleted. Concurrency issues are avoided by using lock chaining as
529  * described atop hashbucketcleanup.
530  */
531  if (BlockNumberIsValid(prevblkno))
532  {
533  if (prevblkno == writeblkno)
534  prevbuf = wbuf;
535  else
536  prevbuf = _hash_getbuf_with_strategy(rel,
537  prevblkno,
538  HASH_WRITE,
539  LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
540  bstrategy);
541  }
542  if (BlockNumberIsValid(nextblkno))
543  nextbuf = _hash_getbuf_with_strategy(rel,
544  nextblkno,
545  HASH_WRITE,
546  LH_OVERFLOW_PAGE,
547  bstrategy);
548 
549  /* Note: bstrategy is intentionally not used for metapage and bitmap */
550 
551  /* Read the metapage so we can determine which bitmap page to use */
552  metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
553  metap = HashPageGetMeta(BufferGetPage(metabuf));
554 
555  /* Identify which bit to set */
556  ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);
557 
558  bitmappage = ovflbitno >> BMPG_SHIFT(metap);
559  bitmapbit = ovflbitno & BMPG_MASK(metap);
560 
561  if (bitmappage >= metap->hashm_nmaps)
562  elog(ERROR, "invalid overflow bit number %u", ovflbitno);
563  blkno = metap->hashm_mapp[bitmappage];
564 
565  /* Release metapage lock while we access the bitmap page */
566  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
567 
568  /* read the bitmap page to clear the bitmap bit */
569  mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
570  mappage = BufferGetPage(mapbuf);
571  freep = HashPageGetBitmap(mappage);
572  Assert(ISSET(freep, bitmapbit));
573 
574  /* Get write-lock on metapage to update firstfree */
575  LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
576 
577  /* This operation needs to log multiple tuples, prepare WAL for that */
578  if (RelationNeedsWAL(rel))
579  XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups);
580 
581  START_CRIT_SECTION();
582 
583  /*
584  * we have to insert tuples on the "write" page, being careful to preserve
585  * hashkey ordering. (If we insert many tuples into the same "write" page
586  * it would be worth qsort'ing them).
587  */
588  if (nitups > 0)
589  {
590  _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
591  MarkBufferDirty(wbuf);
592  }
593 
594  /*
595  * Reinitialize the freed overflow page. Just zeroing the page won't
596  * work, because WAL replay routines expect pages to be initialized. See
597  * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. We are
598  * careful to make the special space valid here so that tools like
599  * pageinspect won't get confused.
600  */
601  _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));
602 
603  ovflopaque = HashPageGetOpaque(ovflpage);
604 
605  ovflopaque->hasho_prevblkno = InvalidBlockNumber;
606  ovflopaque->hasho_nextblkno = InvalidBlockNumber;
607  ovflopaque->hasho_bucket = InvalidBucket;
608  ovflopaque->hasho_flag = LH_UNUSED_PAGE;
609  ovflopaque->hasho_page_id = HASHO_PAGE_ID;
610 
611  MarkBufferDirty(ovflbuf);
612 
613  if (BufferIsValid(prevbuf))
614  {
615  Page prevpage = BufferGetPage(prevbuf);
616  HashPageOpaque prevopaque = HashPageGetOpaque(prevpage);
617 
618  Assert(prevopaque->hasho_bucket == bucket);
619  prevopaque->hasho_nextblkno = nextblkno;
620  MarkBufferDirty(prevbuf);
621  }
622  if (BufferIsValid(nextbuf))
623  {
624  Page nextpage = BufferGetPage(nextbuf);
625  HashPageOpaque nextopaque = HashPageGetOpaque(nextpage);
626 
627  Assert(nextopaque->hasho_bucket == bucket);
628  nextopaque->hasho_prevblkno = prevblkno;
629  MarkBufferDirty(nextbuf);
630  }
631 
632  /* Clear the bitmap bit to indicate that this overflow page is free */
633  CLRBIT(freep, bitmapbit);
634  MarkBufferDirty(mapbuf);
635 
636  /* if this is now the first free page, update hashm_firstfree */
637  if (ovflbitno < metap->hashm_firstfree)
638  {
639  metap->hashm_firstfree = ovflbitno;
640  update_metap = true;
641  MarkBufferDirty(metabuf);
642  }
643 
644  /* XLOG stuff */
645  if (RelationNeedsWAL(rel))
646  {
647  xl_hash_squeeze_page xlrec;
648  XLogRecPtr recptr;
649  int i;
650  bool mod_wbuf = false;
651 
652  xlrec.prevblkno = prevblkno;
653  xlrec.nextblkno = nextblkno;
654  xlrec.ntups = nitups;
655  xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf);
656  xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf);
657 
658  XLogBeginInsert();
659  XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage);
660 
661  /*
662  * bucket buffer was not changed, but still needs to be registered to
663  * ensure that we can acquire a cleanup lock on it during replay.
664  */
665  if (!xlrec.is_prim_bucket_same_wrt)
666  {
667  uint8 flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;
668 
669  XLogRegisterBuffer(0, bucketbuf, flags);
670  }
671 
672  if (xlrec.ntups > 0)
673  {
674  XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
675 
676  /* Remember that wbuf is modified. */
677  mod_wbuf = true;
678 
679  XLogRegisterBufData(1, (char *) itup_offsets,
680  nitups * sizeof(OffsetNumber));
681  for (i = 0; i < nitups; i++)
682  XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
683  }
684  else if (xlrec.is_prim_bucket_same_wrt || xlrec.is_prev_bucket_same_wrt)
685  {
686  uint8 wbuf_flags;
687 
688  /*
689  * A write buffer needs to be registered even if no tuples are
690  * added to it to ensure that we can acquire a cleanup lock on it
691  * if it is the same as primary bucket buffer or update the
692  * nextblkno if it is same as the previous bucket buffer.
693  */
694  Assert(xlrec.ntups == 0);
695 
696  wbuf_flags = REGBUF_STANDARD;
697  if (!xlrec.is_prev_bucket_same_wrt)
698  {
699  wbuf_flags |= REGBUF_NO_CHANGE;
700  }
701  else
702  {
703  /* Remember that wbuf is modified. */
704  mod_wbuf = true;
705  }
706  XLogRegisterBuffer(1, wbuf, wbuf_flags);
707  }
708 
709  XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);
710 
711  /*
712  * If prevpage and the writepage (block in which we are moving tuples
713  * from overflow) are same, then no need to separately register
714  * prevpage. During replay, we can directly update the nextblock in
715  * writepage.
716  */
717  if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
718  XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);
719 
720  if (BufferIsValid(nextbuf))
721  XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);
722 
723  XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD);
724  XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32));
725 
726  if (update_metap)
727  {
728  XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
729  XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32));
730  }
731 
732  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);
733 
734  /* Set LSN iff wbuf is modified. */
735  if (mod_wbuf)
736  PageSetLSN(BufferGetPage(wbuf), recptr);
737 
738  PageSetLSN(BufferGetPage(ovflbuf), recptr);
739 
740  if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
741  PageSetLSN(BufferGetPage(prevbuf), recptr);
742  if (BufferIsValid(nextbuf))
743  PageSetLSN(BufferGetPage(nextbuf), recptr);
744 
745  PageSetLSN(BufferGetPage(mapbuf), recptr);
746 
747  if (update_metap)
748  PageSetLSN(BufferGetPage(metabuf), recptr);
749  }
750 
751  END_CRIT_SECTION();
752 
753  /* release previous bucket if it is not same as write bucket */
754  if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
755  _hash_relbuf(rel, prevbuf);
756 
757  if (BufferIsValid(ovflbuf))
758  _hash_relbuf(rel, ovflbuf);
759 
760  if (BufferIsValid(nextbuf))
761  _hash_relbuf(rel, nextbuf);
762 
763  _hash_relbuf(rel, mapbuf);
764  _hash_relbuf(rel, metabuf);
765 
766  return nextblkno;
767 }
768 
769 
770 /*
771  * _hash_initbitmapbuffer()
772  *
773  * Initialize a new bitmap page. All bits in the new bitmap page are set to
774  * "1", indicating "in use".
775  */
776 void
777 _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
778 {
779  Page pg;
780  HashPageOpaque op;
781  uint32 *freep;
782 
783  pg = BufferGetPage(buf);
784 
785  /* initialize the page */
786  if (initpage)
787  _hash_pageinit(pg, BufferGetPageSize(buf));
788 
789  /* initialize the page's special space */
790  op = HashPageGetOpaque(pg);
791  op->hasho_prevblkno = InvalidBlockNumber;
792  op->hasho_nextblkno = InvalidBlockNumber;
793  op->hasho_bucket = InvalidBucket;
794  op->hasho_flag = LH_BITMAP_PAGE;
795  op->hasho_page_id = HASHO_PAGE_ID;
796 
797  /* set all of the bits to 1 */
798  freep = HashPageGetBitmap(pg);
799  memset(freep, 0xFF, bmsize);
800 
801  /*
802  * Set pd_lower just past the end of the bitmap page data. We could even
803  * set pd_lower equal to pd_upper, but this is more precise and makes the
804  * page look compressible to xlog.c.
805  */
806  ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg;
807 }
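/*
 * Worked example (editorial illustration; the sizes are hypothetical): each
 * bitmap page tracks BMPGSZ_BIT(metap) overflow-space pages, and a bit
 * number is split into a bitmap page and an in-page position as
 *
 *    page   = bit >> BMPG_SHIFT(metap);
 *    inpage = bit & BMPG_MASK(metap);
 *
 * If hashm_bmsize were 4096 bytes, a bitmap page would cover 4096 * 8 =
 * 32768 bits, so bit 40000 would live on bitmap page 1 at in-page
 * position 7232.
 */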
808 
809 
810 /*
811  * _hash_squeezebucket(rel, bucket)
812  *
813  * Try to squeeze the tuples onto pages occurring earlier in the
814  * bucket chain in an attempt to free overflow pages. When we start
815  * the "squeezing", the page from which we start taking tuples (the
816  * "read" page) is the last bucket in the bucket chain and the page
817  * onto which we start squeezing tuples (the "write" page) is the
818  * first page in the bucket chain. The read page works backward and
819  * the write page works forward; the procedure terminates when the
820  * read page and write page are the same page.
821  *
822  * At completion of this procedure, it is guaranteed that all pages in
823  * the bucket are nonempty, unless the bucket is totally empty (in
824  * which case all overflow pages will be freed). The original implementation
825  * required that to be true on entry as well, but it's a lot easier for
826  * callers to leave empty overflow pages and let this guy clean it up.
827  *
828  * Caller must acquire cleanup lock on the primary page of the target
829  * bucket to exclude any scans that are in progress, which could easily
830  * be confused into returning the same tuple more than once or some tuples
831  * not at all by the rearrangement we are performing here. To prevent
832  * any concurrent scan from crossing the squeeze scan we use lock chaining
833  * similar to hashbucketcleanup. Refer comments atop hashbucketcleanup.
834  *
835  * We need to retain a pin on the primary bucket to ensure that no concurrent
836  * split can start.
837  *
838  * Since this function is invoked in VACUUM, we provide an access strategy
839  * parameter that controls fetches of the bucket pages.
840  */
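/*
 * Usage sketch (editorial illustration, not part of the original file):
 * bucket cleanup code that holds a cleanup lock on the bucket's primary
 * page could invoke the squeeze step roughly like this; the variables are
 * assumed to come from the surrounding vacuum code.
 *
 *    _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
 *                        bstrategy);
 */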
841 void
842 _hash_squeezebucket(Relation rel,
843  Bucket bucket,
844  BlockNumber bucket_blkno,
845  Buffer bucket_buf,
846  BufferAccessStrategy bstrategy)
847 {
848  BlockNumber wblkno;
849  BlockNumber rblkno;
850  Buffer wbuf;
851  Buffer rbuf;
852  Page wpage;
853  Page rpage;
854  HashPageOpaque wopaque;
855  HashPageOpaque ropaque;
856 
857  /*
858  * start squeezing into the primary bucket page.
859  */
860  wblkno = bucket_blkno;
861  wbuf = bucket_buf;
862  wpage = BufferGetPage(wbuf);
863  wopaque = HashPageGetOpaque(wpage);
864 
865  /*
866  * if there aren't any overflow pages, there's nothing to squeeze. caller
867  * is responsible for releasing the pin on primary bucket page.
868  */
869  if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
870  {
871  LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
872  return;
873  }
874 
875  /*
876  * Find the last page in the bucket chain by starting at the base bucket
877  * page and working forward. Note: we assume that a hash bucket chain is
878  * usually smaller than the buffer ring being used by VACUUM, else using
879  * the access strategy here would be counterproductive.
880  */
881  rbuf = InvalidBuffer;
882  ropaque = wopaque;
883  do
884  {
885  rblkno = ropaque->hasho_nextblkno;
886  if (rbuf != InvalidBuffer)
887  _hash_relbuf(rel, rbuf);
888  rbuf = _hash_getbuf_with_strategy(rel,
889  rblkno,
890  HASH_WRITE,
891  LH_OVERFLOW_PAGE,
892  bstrategy);
893  rpage = BufferGetPage(rbuf);
894  ropaque = HashPageGetOpaque(rpage);
895  Assert(ropaque->hasho_bucket == bucket);
896  } while (BlockNumberIsValid(ropaque->hasho_nextblkno));
897 
898  /*
899  * squeeze the tuples.
900  */
901  for (;;)
902  {
903  OffsetNumber roffnum;
904  OffsetNumber maxroffnum;
905  OffsetNumber deletable[MaxOffsetNumber];
906  IndexTuple itups[MaxIndexTuplesPerPage];
907  Size tups_size[MaxIndexTuplesPerPage];
908  OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
909  uint16 ndeletable = 0;
910  uint16 nitups = 0;
911  Size all_tups_size = 0;
912  int i;
913  bool retain_pin = false;
914 
915 readpage:
916  /* Scan each tuple in "read" page */
917  maxroffnum = PageGetMaxOffsetNumber(rpage);
918  for (roffnum = FirstOffsetNumber;
919  roffnum <= maxroffnum;
920  roffnum = OffsetNumberNext(roffnum))
921  {
922  IndexTuple itup;
923  Size itemsz;
924 
925  /* skip dead tuples */
926  if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
927  continue;
928 
929  itup = (IndexTuple) PageGetItem(rpage,
930  PageGetItemId(rpage, roffnum));
931  itemsz = IndexTupleSize(itup);
932  itemsz = MAXALIGN(itemsz);
933 
934  /*
935  * Walk up the bucket chain, looking for a page big enough for
936  * this item and all other accumulated items. Exit if we reach
937  * the read page.
938  */
939  while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
940  {
941  Buffer next_wbuf = InvalidBuffer;
942  bool tups_moved = false;
943 
944  Assert(!PageIsEmpty(wpage));
945 
946  if (wblkno == bucket_blkno)
947  retain_pin = true;
948 
949  wblkno = wopaque->hasho_nextblkno;
950  Assert(BlockNumberIsValid(wblkno));
951 
952  /* don't need to move to next page if we reached the read page */
953  if (wblkno != rblkno)
954  next_wbuf = _hash_getbuf_with_strategy(rel,
955  wblkno,
956  HASH_WRITE,
957  LH_OVERFLOW_PAGE,
958  bstrategy);
959 
960  if (nitups > 0)
961  {
962  Assert(nitups == ndeletable);
963 
964  /*
965  * This operation needs to log multiple tuples, prepare
966  * WAL for that.
967  */
968  if (RelationNeedsWAL(rel))
969  XLogEnsureRecordSpace(0, 3 + nitups);
970 
971  START_CRIT_SECTION();
972 
973  /*
974  * we have to insert tuples on the "write" page, being
975  * careful to preserve hashkey ordering. (If we insert
976  * many tuples into the same "write" page it would be
977  * worth qsort'ing them).
978  */
979  _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
980  MarkBufferDirty(wbuf);
981 
982  /* Delete tuples we already moved off read page */
983  PageIndexMultiDelete(rpage, deletable, ndeletable);
984  MarkBufferDirty(rbuf);
985 
986  /* XLOG stuff */
987  if (RelationNeedsWAL(rel))
988  {
989  XLogRecPtr recptr;
990  xl_hash_move_page_contents xlrec;
991 
992  xlrec.ntups = nitups;
993  xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf);
994 
995  XLogBeginInsert();
996  XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents);
997 
998  /*
999  * bucket buffer was not changed, but still needs to
1000  * be registered to ensure that we can acquire a
1001  * cleanup lock on it during replay.
1002  */
1003  if (!xlrec.is_prim_bucket_same_wrt)
1004  {
1005  uint8 flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;
1006 
1007  XLogRegisterBuffer(0, bucket_buf, flags);
1008  }
1009 
1010  XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
1011  XLogRegisterBufData(1, (char *) itup_offsets,
1012  nitups * sizeof(OffsetNumber));
1013  for (i = 0; i < nitups; i++)
1014  XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
1015 
1016  XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
1017  XLogRegisterBufData(2, (char *) deletable,
1018  ndeletable * sizeof(OffsetNumber));
1019 
1020  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);
1021 
1022  PageSetLSN(BufferGetPage(wbuf), recptr);
1023  PageSetLSN(BufferGetPage(rbuf), recptr);
1024  }
1025 
1026  END_CRIT_SECTION();
1027 
1028  tups_moved = true;
1029  }
1030 
1031  /*
1032  * release the lock on previous page after acquiring the lock
1033  * on next page
1034  */
1035  if (retain_pin)
1036  LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
1037  else
1038  _hash_relbuf(rel, wbuf);
1039 
1040  /* nothing more to do if we reached the read page */
1041  if (rblkno == wblkno)
1042  {
1043  _hash_relbuf(rel, rbuf);
1044  return;
1045  }
1046 
1047  wbuf = next_wbuf;
1048  wpage = BufferGetPage(wbuf);
1049  wopaque = HashPageGetOpaque(wpage);
1050  Assert(wopaque->hasho_bucket == bucket);
1051  retain_pin = false;
1052 
1053  /* be tidy */
1054  for (i = 0; i < nitups; i++)
1055  pfree(itups[i]);
1056  nitups = 0;
1057  all_tups_size = 0;
1058  ndeletable = 0;
1059 
1060  /*
1061  * after moving the tuples, rpage would have been compacted,
1062  * so we need to rescan it.
1063  */
1064  if (tups_moved)
1065  goto readpage;
1066  }
1067 
1068  /* remember tuple for deletion from "read" page */
1069  deletable[ndeletable++] = roffnum;
1070 
1071  /*
1072  * we need a copy of the index tuples, as they can be freed when the
1073  * overflow page is freed; however, we still need them to write the WAL
1074  * record in _hash_freeovflpage.
1075  */
1076  itups[nitups] = CopyIndexTuple(itup);
1077  tups_size[nitups++] = itemsz;
1078  all_tups_size += itemsz;
1079  }
1080 
1081  /*
1082  * If we reach here, there are no live tuples on the "read" page ---
1083  * it was empty when we got to it, or we moved them all. So we can
1084  * just free the page without bothering with deleting tuples
1085  * individually. Then advance to the previous "read" page.
1086  *
1087  * Tricky point here: if our read and write pages are adjacent in the
1088  * bucket chain, our write lock on wbuf will conflict with
1089  * _hash_freeovflpage's attempt to update the sibling links of the
1090  * removed page. In that case, we don't need to lock it again.
1091  */
1092  rblkno = ropaque->hasho_prevblkno;
1093  Assert(BlockNumberIsValid(rblkno));
1094 
1095  /* free this overflow page (releases rbuf) */
1096  _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
1097  tups_size, nitups, bstrategy);
1098 
1099  /* be tidy */
1100  for (i = 0; i < nitups; i++)
1101  pfree(itups[i]);
1102 
1103  /* are we freeing the page adjacent to wbuf? */
1104  if (rblkno == wblkno)
1105  {
1106  /* retain the pin on primary bucket page till end of bucket scan */
1107  if (wblkno == bucket_blkno)
1108  LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
1109  else
1110  _hash_relbuf(rel, wbuf);
1111  return;
1112  }
1113 
1114  rbuf = _hash_getbuf_with_strategy(rel,
1115  rblkno,
1116  HASH_WRITE,
1117  LH_OVERFLOW_PAGE,
1118  bstrategy);
1119  rpage = BufferGetPage(rbuf);
1120  ropaque = HashPageGetOpaque(rpage);
1121  Assert(ropaque->hasho_bucket == bucket);
1122  }
1123 
1124  /* NOTREACHED */
1125 }