PostgreSQL Source Code  git master
hashinsert.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * hashinsert.c
4  * Item insertion in hash tables for Postgres.
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/hash/hashinsert.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/hash.h"
19 #include "access/hash_xlog.h"
20 #include "access/xloginsert.h"
21 #include "miscadmin.h"
22 #include "storage/predicate.h"
23 #include "utils/rel.h"
24 
25 static void _hash_vacuum_one_page(Relation rel, Relation hrel,
26  Buffer metabuf, Buffer buf);
27 
28 /*
29  * _hash_doinsert() -- Handle insertion of a single index tuple.
30  *
31  * This routine is called by the public interface routines, hashbuild
32  * and hashinsert. By here, itup is completely filled in.
33  *
34  * 'sorted' must only be passed as 'true' when inserts are done in hashkey
35  * order.
36  */
37 void
38 _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel, bool sorted)
39 {
41  Buffer bucket_buf;
42  Buffer metabuf;
43  HashMetaPage metap;
44  HashMetaPage usedmetap = NULL;
45  Page metapage;
46  Page page;
47  HashPageOpaque pageopaque;
48  Size itemsz;
49  bool do_expand;
50  uint32 hashkey;
51  Bucket bucket;
52  OffsetNumber itup_off;
53 
54  /*
55  * Get the hash key for the item (it's stored in the index tuple itself).
56  */
57  hashkey = _hash_get_indextuple_hashkey(itup);
58 
59  /* compute item size too */
60  itemsz = IndexTupleSize(itup);
61  itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
62  * need to be consistent */
63 
64 restart_insert:
65 
66  /*
67  * Read the metapage. We don't lock it yet; HashMaxItemSize() will
68  * examine pd_pagesize_version, but that can't change so we can examine it
69  * without a lock.
70  */
72  metapage = BufferGetPage(metabuf);
73 
74  /*
75  * Check whether the item can fit on a hash page at all. (Eventually, we
76  * ought to try to apply TOAST methods if not.) Note that at this point,
77  * itemsz doesn't include the ItemId.
78  *
79  * XXX this is useless code if we are only storing hash keys.
80  */
81  if (itemsz > HashMaxItemSize(metapage))
82  ereport(ERROR,
83  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
84  errmsg("index row size %zu exceeds hash maximum %zu",
85  itemsz, HashMaxItemSize(metapage)),
86  errhint("Values larger than a buffer page cannot be indexed.")));
87 
88  /* Lock the primary bucket page for the target bucket. */
90  &usedmetap);
91  Assert(usedmetap != NULL);
92 
94 
95  /* remember the primary bucket buffer to release the pin on it at end. */
96  bucket_buf = buf;
97 
98  page = BufferGetPage(buf);
99  pageopaque = HashPageGetOpaque(page);
100  bucket = pageopaque->hasho_bucket;
101 
102  /*
103  * If this bucket is in the process of being split, try to finish the
104  * split before inserting, because that might create room for the
105  * insertion to proceed without allocating an additional overflow page.
106  * It's only interesting to finish the split if we're trying to insert
107  * into the bucket from which we're removing tuples (the "old" bucket),
108  * not if we're trying to insert into the bucket into which tuples are
109  * being moved (the "new" bucket).
110  */
111  if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
112  {
113  /* release the lock on bucket buffer, before completing the split. */
115 
116  _hash_finish_split(rel, metabuf, buf, bucket,
117  usedmetap->hashm_maxbucket,
118  usedmetap->hashm_highmask,
119  usedmetap->hashm_lowmask);
120 
121  /* release the pin on old and meta buffer. retry for insert. */
122  _hash_dropbuf(rel, buf);
123  _hash_dropbuf(rel, metabuf);
124  goto restart_insert;
125  }
126 
127  /* Do the insertion */
128  while (PageGetFreeSpace(page) < itemsz)
129  {
130  BlockNumber nextblkno;
131 
132  /*
133  * Check if current page has any DEAD tuples. If yes, delete these
134  * tuples and see if we can get a space for the new item to be
135  * inserted before moving to the next page in the bucket chain.
136  */
137  if (H_HAS_DEAD_TUPLES(pageopaque))
138  {
139 
140  if (IsBufferCleanupOK(buf))
141  {
142  _hash_vacuum_one_page(rel, heapRel, metabuf, buf);
143 
144  if (PageGetFreeSpace(page) >= itemsz)
145  break; /* OK, now we have enough space */
146  }
147  }
148 
149  /*
150  * no space on this page; check for an overflow page
151  */
152  nextblkno = pageopaque->hasho_nextblkno;
153 
154  if (BlockNumberIsValid(nextblkno))
155  {
156  /*
157  * ovfl page exists; go get it. if it doesn't have room, we'll
158  * find out next pass through the loop test above. we always
159  * release both the lock and pin if this is an overflow page, but
160  * only the lock if this is the primary bucket page, since the pin
161  * on the primary bucket must be retained throughout the scan.
162  */
163  if (buf != bucket_buf)
164  _hash_relbuf(rel, buf);
165  else
167  buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
168  page = BufferGetPage(buf);
169  }
170  else
171  {
172  /*
173  * we're at the end of the bucket chain and we haven't found a
174  * page with enough room. allocate a new overflow page.
175  */
176 
177  /* release our write lock without modifying buffer */
179 
180  /* chain to a new overflow page */
181  buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf));
182  page = BufferGetPage(buf);
183 
184  /* should fit now, given test above */
185  Assert(PageGetFreeSpace(page) >= itemsz);
186  }
187  pageopaque = HashPageGetOpaque(page);
188  Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
189  Assert(pageopaque->hasho_bucket == bucket);
190  }
191 
192  /*
193  * Write-lock the metapage so we can increment the tuple count. After
194  * incrementing it, check to see if it's time for a split.
195  */
197 
198  /* Do the update. No ereport(ERROR) until changes are logged */
200 
201  /* found page with enough space, so add the item here */
202  itup_off = _hash_pgaddtup(rel, buf, itemsz, itup, sorted);
204 
205  /* metapage operations */
206  metap = HashPageGetMeta(metapage);
207  metap->hashm_ntuples += 1;
208 
209  /* Make sure this stays in sync with _hash_expandtable() */
210  do_expand = metap->hashm_ntuples >
211  (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);
212 
213  MarkBufferDirty(metabuf);
214 
215  /* XLOG stuff */
216  if (RelationNeedsWAL(rel))
217  {
218  xl_hash_insert xlrec;
219  XLogRecPtr recptr;
220 
221  xlrec.offnum = itup_off;
222 
223  XLogBeginInsert();
224  XLogRegisterData((char *) &xlrec, SizeOfHashInsert);
225 
226  XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
227 
229  XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
230 
231  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);
232 
233  PageSetLSN(BufferGetPage(buf), recptr);
234  PageSetLSN(BufferGetPage(metabuf), recptr);
235  }
236 
238 
239  /* drop lock on metapage, but keep pin */
240  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
241 
242  /*
243  * Release the modified page and ensure to release the pin on primary
244  * page.
245  */
246  _hash_relbuf(rel, buf);
247  if (buf != bucket_buf)
248  _hash_dropbuf(rel, bucket_buf);
249 
250  /* Attempt to split if a split is needed */
251  if (do_expand)
252  _hash_expandtable(rel, metabuf);
253 
254  /* Finally drop our pin on the metapage */
255  _hash_dropbuf(rel, metabuf);
256 }
257 
258 /*
259  * _hash_pgaddtup() -- add a tuple to a particular page in the index.
260  *
261  * This routine adds the tuple to the page as requested; it does not write out
262  * the page. It is an error to call this function without pin and write lock
263  * on the target buffer.
264  *
265  * Returns the offset number at which the tuple was inserted. This function
266  * is responsible for preserving the condition that tuples in a hash index
267  * page are sorted by hashkey value, however, if the caller is certain that
268  * the hashkey for the tuple being added is >= the hashkeys of all existing
269  * tuples on the page, then the 'appendtup' flag may be passed as true. This
270  * saves from having to binary search for the correct location to insert the
271  * tuple.
272  */
275  bool appendtup)
276 {
277  OffsetNumber itup_off;
278  Page page;
279 
281  page = BufferGetPage(buf);
282 
283  /*
284  * Find where to insert the tuple (preserving page's hashkey ordering). If
285  * 'appendtup' is true then we just insert it at the end.
286  */
287  if (appendtup)
288  {
289  itup_off = PageGetMaxOffsetNumber(page) + 1;
290 
291 #ifdef USE_ASSERT_CHECKING
292  /* ensure this tuple's hashkey is >= the final existing tuple */
293  if (PageGetMaxOffsetNumber(page) > 0)
294  {
295  IndexTuple lasttup;
296  ItemId itemid;
297 
298  itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
299  lasttup = (IndexTuple) PageGetItem(page, itemid);
300 
303  }
304 #endif
305  }
306  else
307  {
308  uint32 hashkey = _hash_get_indextuple_hashkey(itup);
309 
310  itup_off = _hash_binsearch(page, hashkey);
311  }
312 
313  if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
315  elog(ERROR, "failed to add index item to \"%s\"",
317 
318  return itup_off;
319 }
320 
321 /*
322  * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the
323  * index.
324  *
325  * This routine has same requirements for locking and tuple ordering as
326  * _hash_pgaddtup().
327  *
328  * Returns the offset number array at which the tuples were inserted.
329  */
330 void
332  OffsetNumber *itup_offsets, uint16 nitups)
333 {
334  OffsetNumber itup_off;
335  Page page;
336  uint32 hashkey;
337  int i;
338 
340  page = BufferGetPage(buf);
341 
342  for (i = 0; i < nitups; i++)
343  {
344  Size itemsize;
345 
346  itemsize = IndexTupleSize(itups[i]);
347  itemsize = MAXALIGN(itemsize);
348 
349  /* Find where to insert the tuple (preserving page's hashkey ordering) */
350  hashkey = _hash_get_indextuple_hashkey(itups[i]);
351  itup_off = _hash_binsearch(page, hashkey);
352 
353  itup_offsets[i] = itup_off;
354 
355  if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false)
357  elog(ERROR, "failed to add index item to \"%s\"",
359  }
360 }
361 
362 /*
363  * _hash_vacuum_one_page - vacuum just one index page.
364  *
365  * Try to remove LP_DEAD items from the given page. We must acquire cleanup
366  * lock on the page being modified before calling this function.
367  */
368 
369 static void
371 {
372  OffsetNumber deletable[MaxOffsetNumber];
373  int ndeletable = 0;
374  OffsetNumber offnum,
375  maxoff;
376  Page page = BufferGetPage(buf);
377  HashPageOpaque pageopaque;
378  HashMetaPage metap;
379 
380  /* Scan each tuple in page to see if it is marked as LP_DEAD */
381  maxoff = PageGetMaxOffsetNumber(page);
382  for (offnum = FirstOffsetNumber;
383  offnum <= maxoff;
384  offnum = OffsetNumberNext(offnum))
385  {
386  ItemId itemId = PageGetItemId(page, offnum);
387 
388  if (ItemIdIsDead(itemId))
389  deletable[ndeletable++] = offnum;
390  }
391 
392  if (ndeletable > 0)
393  {
394  TransactionId snapshotConflictHorizon;
395 
396  snapshotConflictHorizon =
398  deletable, ndeletable);
399 
400  /*
401  * Write-lock the meta page so that we can decrement tuple count.
402  */
404 
405  /* No ereport(ERROR) until changes are logged */
407 
408  PageIndexMultiDelete(page, deletable, ndeletable);
409 
410  /*
411  * Mark the page as not containing any LP_DEAD items. This is not
412  * certainly true (there might be some that have recently been marked,
413  * but weren't included in our target-item list), but it will almost
414  * always be true and it doesn't seem worth an additional page scan to
415  * check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint
416  * anyway.
417  */
418  pageopaque = HashPageGetOpaque(page);
419  pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
420 
421  metap = HashPageGetMeta(BufferGetPage(metabuf));
422  metap->hashm_ntuples -= ndeletable;
423 
425  MarkBufferDirty(metabuf);
426 
427  /* XLOG stuff */
428  if (RelationNeedsWAL(rel))
429  {
431  XLogRecPtr recptr;
432 
434  xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
435  xlrec.ntuples = ndeletable;
436 
437  XLogBeginInsert();
439  XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);
440 
441  /*
442  * We need the target-offsets array whether or not we store the
443  * whole buffer, to allow us to find the snapshotConflictHorizon
444  * on a standby server.
445  */
446  XLogRegisterData((char *) deletable,
447  ndeletable * sizeof(OffsetNumber));
448 
449  XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
450 
451  recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);
452 
453  PageSetLSN(BufferGetPage(buf), recptr);
454  PageSetLSN(BufferGetPage(metabuf), recptr);
455  }
456 
458 
459  /*
460  * Releasing write lock on meta page as we have updated the tuple
461  * count.
462  */
463  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
464  }
465 }
uint32 BlockNumber
Definition: block.h:31
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3667
bool IsBufferCleanupOK(Buffer buffer)
Definition: bufmgr.c:5382
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2474
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5085
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:197
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:408
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:199
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1161
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:907
Pointer Page
Definition: bufpage.h:78
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:351
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:240
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:388
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:369
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:468
unsigned short uint16
Definition: c.h:505
unsigned int uint32
Definition: c.h:506
#define MAXALIGN(LEN)
Definition: c.h:811
#define Assert(condition)
Definition: c.h:858
uint32 TransactionId
Definition: c.h:652
size_t Size
Definition: c.h:605
int errhint(const char *fmt,...)
Definition: elog.c:1319
int errcode(int sqlerrcode)
Definition: elog.c:859
int errmsg(const char *fmt,...)
Definition: elog.c:1072
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
#define ereport(elevel,...)
Definition: elog.h:149
TransactionId index_compute_xid_horizon_for_tuples(Relation irel, Relation hrel, Buffer ibuf, OffsetNumber *itemnos, int nitems)
Definition: genam.c:291
#define HASH_NOLOCK
Definition: hash.h:341
#define HashPageGetOpaque(page)
Definition: hash.h:88
#define LH_BUCKET_PAGE
Definition: hash.h:55
#define HASH_WRITE
Definition: hash.h:340
#define H_BUCKET_BEING_SPLIT(opaque)
Definition: hash.h:91
#define LH_META_PAGE
Definition: hash.h:57
#define HashPageGetMeta(page)
Definition: hash.h:323
#define HASH_METAPAGE
Definition: hash.h:198
#define H_HAS_DEAD_TUPLES(opaque)
Definition: hash.h:93
#define LH_PAGE_TYPE
Definition: hash.h:63
uint32 Bucket
Definition: hash.h:35
#define HashMaxItemSize(page)
Definition: hash.h:287
#define LH_PAGE_HAS_DEAD_TUPLES
Definition: hash.h:61
#define LH_OVERFLOW_PAGE
Definition: hash.h:54
#define SizeOfHashInsert
Definition: hash_xlog.h:61
#define XLOG_HASH_INSERT
Definition: hash_xlog.h:29
#define XLOG_HASH_VACUUM_ONE_PAGE
Definition: hash_xlog.h:40
#define SizeOfHashVacuumOnePage
Definition: hash_xlog.h:256
void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel, bool sorted)
Definition: hashinsert.c:38
OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup, bool appendtup)
Definition: hashinsert.c:274
void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, OffsetNumber *itup_offsets, uint16 nitups)
Definition: hashinsert.c:331
static void _hash_vacuum_one_page(Relation rel, Relation hrel, Buffer metabuf, Buffer buf)
Definition: hashinsert.c:370
Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
Definition: hashovfl.c:112
void _hash_relbuf(Relation rel, Buffer buf)
Definition: hashpage.c:266
void _hash_dropbuf(Relation rel, Buffer buf)
Definition: hashpage.c:277
Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
Definition: hashpage.c:70
void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, uint32 maxbucket, uint32 highmask, uint32 lowmask)
Definition: hashpage.c:1356
Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access, HashMetaPage *cachedmetap)
Definition: hashpage.c:1559
void _hash_expandtable(Relation rel, Buffer metabuf)
Definition: hashpage.c:614
uint32 _hash_get_indextuple_hashkey(IndexTuple itup)
Definition: hashutil.c:291
OffsetNumber _hash_binsearch(Page page, uint32 hash_value)
Definition: hashutil.c:350
void _hash_checkpage(Relation rel, Buffer buf, int flags)
Definition: hashutil.c:210
int i
Definition: isn.c:73
Pointer Item
Definition: item.h:17
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
IndexTupleData * IndexTuple
Definition: itup.h:53
#define IndexTupleSize(itup)
Definition: itup.h:70
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define FirstOffsetNumber
Definition: off.h:27
#define MaxOffsetNumber
Definition: off.h:28
static char * buf
Definition: pg_test_fsync.c:73
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4321
#define RelationGetRelationName(relation)
Definition: rel.h:539
#define RelationIsAccessibleInLogicalDecoding(relation)
Definition: rel.h:684
#define RelationNeedsWAL(relation)
Definition: rel.h:628
uint32 hashm_lowmask
Definition: hash.h:256
uint32 hashm_maxbucket
Definition: hash.h:254
double hashm_ntuples
Definition: hash.h:248
uint32 hashm_highmask
Definition: hash.h:255
uint16 hashm_ffactor
Definition: hash.h:249
BlockNumber hasho_nextblkno
Definition: hash.h:80
uint16 hasho_flag
Definition: hash.h:82
Bucket hasho_bucket
Definition: hash.h:81
OffsetNumber offnum
Definition: hash_xlog.h:58
TransactionId snapshotConflictHorizon
Definition: hash_xlog.h:247
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void XLogRegisterData(char *data, uint32 len)
Definition: xloginsert.c:364
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:474
void XLogRegisterBufData(uint8 block_id, char *data, uint32 len)
Definition: xloginsert.c:405
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:242
void XLogBeginInsert(void)
Definition: xloginsert.c:149
#define REGBUF_STANDARD
Definition: xloginsert.h:34