PostgreSQL Source Code git master
ginpostinglist.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * ginpostinglist.c
4 * routines for dealing with posting lists.
5 *
6 *
7 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/access/gin/ginpostinglist.c
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include "access/gin_private.h"
18
19#ifdef USE_ASSERT_CHECKING
20#define CHECK_ENCODING_ROUNDTRIP
21#endif
22
23/*
24 * For encoding purposes, item pointers are represented as 64-bit unsigned
25 * integers. The lowest 11 bits represent the offset number, and the next
26 * lowest 32 bits are the block number. That leaves 21 bits unused, i.e.
27 * only 43 low bits are used.
28 *
29 * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage <
30 * 2^11 on all supported block sizes. We are frugal with the bits, because
31 * smaller integers use fewer bytes in the varbyte encoding, saving disk
32 * space. (If we get a new table AM in the future that wants to use the full
33 * range of possible offset numbers, we'll need to change this.)
34 *
35 * These 43-bit integers are encoded using varbyte encoding. In each byte,
36 * the 7 low bits contain data, while the highest bit is a continuation bit.
37 * When the continuation bit is set, the next byte is part of the same
38 * integer, otherwise this is the last byte of this integer. 43 bits need
39 * at most 7 bytes in this encoding:
40 *
41 * 0XXXXXXX
42 * 1XXXXXXX 0XXXXYYY
43 * 1XXXXXXX 1XXXXYYY 0YYYYYYY
44 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
45 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
46 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
47 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY
48 *
49 * X = bits used for offset number
50 * Y = bits used for block number
51 * u = unused bit
52 *
53 * The bytes are in stored in little-endian order.
54 *
55 * An important property of this encoding is that removing an item from list
56 * never increases the size of the resulting compressed posting list. Proof:
57 *
58 * Removing number is actually replacement of two numbers with their sum. We
59 * have to prove that varbyte encoding of a sum can't be longer than varbyte
60 * encoding of its summands. Sum of two numbers is at most one bit wider than
61 * the larger of the summands. Widening a number by one bit enlarges its length
62 * in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum
63 * is at most one byte longer than varbyte encoding of larger summand. Lesser
64 * summand is at least one byte, so the sum cannot take more space than the
65 * summands, Q.E.D.
66 *
67 * This property greatly simplifies VACUUM, which can assume that posting
68 * lists always fit on the same page after vacuuming. Note that even though
69 * that holds for removing items from a posting list, you must also be
70 * careful to not cause expansion e.g. when merging uncompressed items on the
71 * page into the compressed lists, when vacuuming.
72 */
73
74/*
75 * How many bits do you need to encode offset number? OffsetNumber is a 16-bit
76 * integer, but you can't fit that many items on a page. 11 ought to be more
77 * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and
78 * use the minimum number of bits, but that would require changing the on-disk
79 * format if MaxHeapTuplesPerPage changes. Better to leave some slack.
80 */
81#define MaxHeapTuplesPerPageBits 11
82
83/* Max. number of bytes needed to encode the largest supported integer. */
84#define MaxBytesPerInteger 7
85
86static inline uint64
88{
89 uint64 val;
90
93
97
98 return val;
99}
100
101static inline void
103{
107
109}
110
111/*
112 * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer.
113 */
114static void
115encode_varbyte(uint64 val, unsigned char **ptr)
116{
117 unsigned char *p = *ptr;
118
119 while (val > 0x7F)
120 {
121 *(p++) = 0x80 | (val & 0x7F);
122 val >>= 7;
123 }
124 *(p++) = (unsigned char) val;
125
126 *ptr = p;
127}
128
129/*
130 * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer.
131 */
132static uint64
133decode_varbyte(unsigned char **ptr)
134{
135 uint64 val;
136 unsigned char *p = *ptr;
137 uint64 c;
138
139 /* 1st byte */
140 c = *(p++);
141 val = c & 0x7F;
142 if (c & 0x80)
143 {
144 /* 2nd byte */
145 c = *(p++);
146 val |= (c & 0x7F) << 7;
147 if (c & 0x80)
148 {
149 /* 3rd byte */
150 c = *(p++);
151 val |= (c & 0x7F) << 14;
152 if (c & 0x80)
153 {
154 /* 4th byte */
155 c = *(p++);
156 val |= (c & 0x7F) << 21;
157 if (c & 0x80)
158 {
159 /* 5th byte */
160 c = *(p++);
161 val |= (c & 0x7F) << 28;
162 if (c & 0x80)
163 {
164 /* 6th byte */
165 c = *(p++);
166 val |= (c & 0x7F) << 35;
167 if (c & 0x80)
168 {
169 /* 7th byte, should not have continuation bit */
170 c = *(p++);
171 val |= c << 42;
172 Assert((c & 0x80) == 0);
173 }
174 }
175 }
176 }
177 }
178 }
179
180 *ptr = p;
181
182 return val;
183}
184
185/*
186 * Encode a posting list.
187 *
188 * The encoded list is returned in a palloc'd struct, which will be at most
189 * 'maxsize' bytes in size. The number items in the returned segment is
190 * returned in *nwritten. If it's not equal to nipd, not all the items fit
191 * in 'maxsize', and only the first *nwritten were encoded.
192 *
193 * The allocated size of the returned struct is short-aligned, and the padding
194 * byte at the end, if any, is zero.
195 */
197ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize,
198 int *nwritten)
199{
200 uint64 prev;
201 int totalpacked = 0;
202 int maxbytes;
203 GinPostingList *result;
204 unsigned char *ptr;
205 unsigned char *endptr;
206
207 maxsize = SHORTALIGN_DOWN(maxsize);
208
209 result = palloc(maxsize);
210
211 maxbytes = maxsize - offsetof(GinPostingList, bytes);
212 Assert(maxbytes > 0);
213
214 /* Store the first special item */
215 result->first = ipd[0];
216
217 prev = itemptr_to_uint64(&result->first);
218
219 ptr = result->bytes;
220 endptr = result->bytes + maxbytes;
221 for (totalpacked = 1; totalpacked < nipd; totalpacked++)
222 {
223 uint64 val = itemptr_to_uint64(&ipd[totalpacked]);
224 uint64 delta = val - prev;
225
226 Assert(val > prev);
227
228 if (endptr - ptr >= MaxBytesPerInteger)
229 encode_varbyte(delta, &ptr);
230 else
231 {
232 /*
233 * There are less than 7 bytes left. Have to check if the next
234 * item fits in that space before writing it out.
235 */
236 unsigned char buf[MaxBytesPerInteger];
237 unsigned char *p = buf;
238
239 encode_varbyte(delta, &p);
240 if (p - buf > (endptr - ptr))
241 break; /* output is full */
242
243 memcpy(ptr, buf, p - buf);
244 ptr += (p - buf);
245 }
246 prev = val;
247 }
248 result->nbytes = ptr - result->bytes;
249
250 /*
251 * If we wrote an odd number of bytes, zero out the padding byte at the
252 * end.
253 */
254 if (result->nbytes != SHORTALIGN(result->nbytes))
255 result->bytes[result->nbytes] = 0;
256
257 if (nwritten)
258 *nwritten = totalpacked;
259
260 Assert(SizeOfGinPostingList(result) <= maxsize);
261
262 /*
263 * Check that the encoded segment decodes back to the original items.
264 */
265#if defined (CHECK_ENCODING_ROUNDTRIP)
266 {
267 int ndecoded;
268 ItemPointer tmp = ginPostingListDecode(result, &ndecoded);
269
270 Assert(ndecoded == totalpacked);
271 Assert(memcmp(tmp, ipd, ndecoded * sizeof(ItemPointerData)) == 0);
272 pfree(tmp);
273 }
274#endif
275
276 return result;
277}
278
279/*
280 * Decode a compressed posting list into an array of item pointers.
281 * The number of items is returned in *ndecoded.
282 */
284ginPostingListDecode(GinPostingList *plist, int *ndecoded_out)
285{
288 ndecoded_out);
289}
290
291/*
292 * Decode multiple posting list segments into an array of item pointers.
293 * The number of items is returned in *ndecoded_out. The segments are stored
294 * one after each other, with total size 'len' bytes.
295 */
297ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
298{
299 ItemPointer result;
300 int nallocated;
301 uint64 val;
302 char *endseg = ((char *) segment) + len;
303 int ndecoded;
304 unsigned char *ptr;
305 unsigned char *endptr;
306
307 /*
308 * Guess an initial size of the array.
309 */
310 nallocated = segment->nbytes * 2 + 1;
311 result = palloc(nallocated * sizeof(ItemPointerData));
312
313 ndecoded = 0;
314 while ((char *) segment < endseg)
315 {
316 /* enlarge output array if needed */
317 if (ndecoded >= nallocated)
318 {
319 nallocated *= 2;
320 result = repalloc(result, nallocated * sizeof(ItemPointerData));
321 }
322
323 /* copy the first item */
325 Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0);
326 result[ndecoded] = segment->first;
327 ndecoded++;
328
329 val = itemptr_to_uint64(&segment->first);
330 ptr = segment->bytes;
331 endptr = segment->bytes + segment->nbytes;
332 while (ptr < endptr)
333 {
334 /* enlarge output array if needed */
335 if (ndecoded >= nallocated)
336 {
337 nallocated *= 2;
338 result = repalloc(result, nallocated * sizeof(ItemPointerData));
339 }
340
341 val += decode_varbyte(&ptr);
342
343 uint64_to_itemptr(val, &result[ndecoded]);
344 ndecoded++;
345 }
346 segment = GinNextPostingListSegment(segment);
347 }
348
349 if (ndecoded_out)
350 *ndecoded_out = ndecoded;
351 return result;
352}
353
354/*
355 * Add all item pointers from a bunch of posting lists to a TIDBitmap.
356 */
357int
359 TIDBitmap *tbm)
360{
361 int ndecoded;
363
364 items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
365 tbm_add_tuples(tbm, items, ndecoded, false);
366 pfree(items);
367
368 return ndecoded;
369}
370
371/*
372 * Merge two ordered arrays of itempointers, eliminating any duplicates.
373 *
374 * Returns a palloc'd array, and *nmerged is set to the number of items in
375 * the result, after eliminating duplicates.
376 */
380 int *nmerged)
381{
382 ItemPointerData *dst;
383
384 dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData));
385
386 /*
387 * If the argument arrays don't overlap, we can just append them to each
388 * other.
389 */
390 if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0)
391 {
392 memcpy(dst, a, na * sizeof(ItemPointerData));
393 memcpy(&dst[na], b, nb * sizeof(ItemPointerData));
394 *nmerged = na + nb;
395 }
396 else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0)
397 {
398 memcpy(dst, b, nb * sizeof(ItemPointerData));
399 memcpy(&dst[nb], a, na * sizeof(ItemPointerData));
400 *nmerged = na + nb;
401 }
402 else
403 {
404 ItemPointerData *dptr = dst;
405 ItemPointerData *aptr = a;
406 ItemPointerData *bptr = b;
407
408 while (aptr - a < na && bptr - b < nb)
409 {
410 int cmp = ginCompareItemPointers(aptr, bptr);
411
412 if (cmp > 0)
413 *dptr++ = *bptr++;
414 else if (cmp == 0)
415 {
416 /* only keep one copy of the identical items */
417 *dptr++ = *bptr++;
418 aptr++;
419 }
420 else
421 *dptr++ = *aptr++;
422 }
423
424 while (aptr - a < na)
425 *dptr++ = *aptr++;
426
427 while (bptr - b < nb)
428 *dptr++ = *bptr++;
429
430 *nmerged = dptr - dst;
431 }
432
433 return dst;
434}
#define SHORTALIGN_DOWN(LEN)
Definition: c.h:776
#define Assert(condition)
Definition: c.h:815
#define SHORTALIGN(LEN)
Definition: c.h:764
uint64_t uint64
Definition: c.h:489
uint32_t uint32
Definition: c.h:488
static int ginCompareItemPointers(ItemPointer a, ItemPointer b)
Definition: gin_private.h:488
#define GinItemPointerGetOffsetNumber(pointer)
Definition: ginblock.h:146
#define SizeOfGinPostingList(plist)
Definition: ginblock.h:342
#define GinItemPointerSetBlockNumber(pointer, blkno)
Definition: ginblock.h:149
#define GinNextPostingListSegment(cur)
Definition: ginblock.h:343
#define GinItemPointerSetOffsetNumber(pointer, offnum)
Definition: ginblock.h:152
#define GinItemPointerGetBlockNumber(pointer)
Definition: ginblock.h:143
ItemPointer ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
static uint64 itemptr_to_uint64(const ItemPointer iptr)
int ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, TIDBitmap *tbm)
static void encode_varbyte(uint64 val, unsigned char **ptr)
ItemPointer ginPostingListDecode(GinPostingList *plist, int *ndecoded_out)
GinPostingList * ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize, int *nwritten)
static uint64 decode_varbyte(unsigned char **ptr)
#define MaxBytesPerInteger
#define MaxHeapTuplesPerPageBits
static void uint64_to_itemptr(uint64 val, ItemPointer iptr)
ItemPointer ginMergeItemPointers(ItemPointerData *a, uint32 na, ItemPointerData *b, uint32 nb, int *nmerged)
long val
Definition: informix.c:689
int b
Definition: isn.c:69
int a
Definition: isn.c:68
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:76
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
ItemPointerData * ItemPointer
Definition: itemptr.h:49
struct ItemPointerData ItemPointerData
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition: itemptr.h:83
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc(Size size)
Definition: mcxt.c:1317
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
const void size_t len
static char * buf
Definition: pg_test_fsync.c:72
char * c
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:743
ItemPointerData first
Definition: ginblock.h:337
unsigned char bytes[FLEXIBLE_ARRAY_MEMBER]
Definition: ginblock.h:339
uint16 nbytes
Definition: ginblock.h:338
static ItemArray items
Definition: test_tidstore.c:48
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:377