PostgreSQL Source Code  git master
ginpostinglist.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * ginpostinglist.c
4  * routines for dealing with posting lists.
5  *
6  *
7  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/backend/access/gin/ginpostinglist.c
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "access/gin_private.h"
18 
19 #ifdef USE_ASSERT_CHECKING
20 #define CHECK_ENCODING_ROUNDTRIP
21 #endif
22 
23 /*
24  * For encoding purposes, item pointers are represented as 64-bit unsigned
25  * integers. The lowest 11 bits represent the offset number, and the next
26  * lowest 32 bits are the block number. That leaves 21 bits unused, i.e.
27  * only 43 low bits are used.
28  *
29  * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage <
30  * 2^11 on all supported block sizes. We are frugal with the bits, because
31  * smaller integers use fewer bytes in the varbyte encoding, saving disk
32  * space. (If we get a new table AM in the future that wants to use the full
33  * range of possible offset numbers, we'll need to change this.)
34  *
35  * These 43-bit integers are encoded using varbyte encoding. In each byte,
36  * the 7 low bits contain data, while the highest bit is a continuation bit.
37  * When the continuation bit is set, the next byte is part of the same
38  * integer, otherwise this is the last byte of this integer. 43 bits need
39  * at most 7 bytes in this encoding:
40  *
41  * 0XXXXXXX
42  * 1XXXXXXX 0XXXXYYY
43  * 1XXXXXXX 1XXXXYYY 0YYYYYYY
44  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
45  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
46  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
47  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY
48  *
49  * X = bits used for offset number
50  * Y = bits used for block number
51  * u = unused bit
52  *
53  * The bytes are in stored in little-endian order.
54  *
55  * An important property of this encoding is that removing an item from list
56  * never increases the size of the resulting compressed posting list. Proof:
57  *
58  * Removing number is actually replacement of two numbers with their sum. We
59  * have to prove that varbyte encoding of a sum can't be longer than varbyte
60  * encoding of its summands. Sum of two numbers is at most one bit wider than
61  * the larger of the summands. Widening a number by one bit enlarges its length
62  * in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum
63  * is at most one byte longer than varbyte encoding of larger summand. Lesser
64  * summand is at least one byte, so the sum cannot take more space than the
65  * summands, Q.E.D.
66  *
67  * This property greatly simplifies VACUUM, which can assume that posting
68  * lists always fit on the same page after vacuuming. Note that even though
69  * that holds for removing items from a posting list, you must also be
70  * careful to not cause expansion e.g. when merging uncompressed items on the
71  * page into the compressed lists, when vacuuming.
72  */
73 
74 /*
75  * How many bits do you need to encode offset number? OffsetNumber is a 16-bit
76  * integer, but you can't fit that many items on a page. 11 ought to be more
77  * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and
78  * use the minimum number of bits, but that would require changing the on-disk
79  * format if MaxHeapTuplesPerPage changes. Better to leave some slack.
80  */
81 #define MaxHeapTuplesPerPageBits 11
82 
83 /* Max. number of bytes needed to encode the largest supported integer. */
84 #define MaxBytesPerInteger 7
85 
86 static inline uint64
88 {
89  uint64 val;
90 
93 
94  val = GinItemPointerGetBlockNumber(iptr);
96  val |= GinItemPointerGetOffsetNumber(iptr);
97 
98  return val;
99 }
100 
101 static inline void
103 {
105  val = val >> MaxHeapTuplesPerPageBits;
106  GinItemPointerSetBlockNumber(iptr, val);
107 
108  Assert(ItemPointerIsValid(iptr));
109 }
110 
111 /*
112  * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer.
113  */
114 static void
115 encode_varbyte(uint64 val, unsigned char **ptr)
116 {
117  unsigned char *p = *ptr;
118 
119  while (val > 0x7F)
120  {
121  *(p++) = 0x80 | (val & 0x7F);
122  val >>= 7;
123  }
124  *(p++) = (unsigned char) val;
125 
126  *ptr = p;
127 }
128 
129 /*
130  * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer.
131  */
132 static uint64
133 decode_varbyte(unsigned char **ptr)
134 {
135  uint64 val;
136  unsigned char *p = *ptr;
137  uint64 c;
138 
139  /* 1st byte */
140  c = *(p++);
141  val = c & 0x7F;
142  if (c & 0x80)
143  {
144  /* 2nd byte */
145  c = *(p++);
146  val |= (c & 0x7F) << 7;
147  if (c & 0x80)
148  {
149  /* 3rd byte */
150  c = *(p++);
151  val |= (c & 0x7F) << 14;
152  if (c & 0x80)
153  {
154  /* 4th byte */
155  c = *(p++);
156  val |= (c & 0x7F) << 21;
157  if (c & 0x80)
158  {
159  /* 5th byte */
160  c = *(p++);
161  val |= (c & 0x7F) << 28;
162  if (c & 0x80)
163  {
164  /* 6th byte */
165  c = *(p++);
166  val |= (c & 0x7F) << 35;
167  if (c & 0x80)
168  {
169  /* 7th byte, should not have continuation bit */
170  c = *(p++);
171  val |= c << 42;
172  Assert((c & 0x80) == 0);
173  }
174  }
175  }
176  }
177  }
178  }
179 
180  *ptr = p;
181 
182  return val;
183 }
184 
185 /*
186  * Encode a posting list.
187  *
188  * The encoded list is returned in a palloc'd struct, which will be at most
189  * 'maxsize' bytes in size. The number items in the returned segment is
190  * returned in *nwritten. If it's not equal to nipd, not all the items fit
191  * in 'maxsize', and only the first *nwritten were encoded.
192  *
193  * The allocated size of the returned struct is short-aligned, and the padding
194  * byte at the end, if any, is zero.
195  */
197 ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize,
198  int *nwritten)
199 {
200  uint64 prev;
201  int totalpacked = 0;
202  int maxbytes;
203  GinPostingList *result;
204  unsigned char *ptr;
205  unsigned char *endptr;
206 
207  maxsize = SHORTALIGN_DOWN(maxsize);
208 
209  result = palloc(maxsize);
210 
211  maxbytes = maxsize - offsetof(GinPostingList, bytes);
212  Assert(maxbytes > 0);
213 
214  /* Store the first special item */
215  result->first = ipd[0];
216 
217  prev = itemptr_to_uint64(&result->first);
218 
219  ptr = result->bytes;
220  endptr = result->bytes + maxbytes;
221  for (totalpacked = 1; totalpacked < nipd; totalpacked++)
222  {
223  uint64 val = itemptr_to_uint64(&ipd[totalpacked]);
224  uint64 delta = val - prev;
225 
226  Assert(val > prev);
227 
228  if (endptr - ptr >= MaxBytesPerInteger)
229  encode_varbyte(delta, &ptr);
230  else
231  {
232  /*
233  * There are less than 7 bytes left. Have to check if the next
234  * item fits in that space before writing it out.
235  */
236  unsigned char buf[MaxBytesPerInteger];
237  unsigned char *p = buf;
238 
239  encode_varbyte(delta, &p);
240  if (p - buf > (endptr - ptr))
241  break; /* output is full */
242 
243  memcpy(ptr, buf, p - buf);
244  ptr += (p - buf);
245  }
246  prev = val;
247  }
248  result->nbytes = ptr - result->bytes;
249 
250  /*
251  * If we wrote an odd number of bytes, zero out the padding byte at the
252  * end.
253  */
254  if (result->nbytes != SHORTALIGN(result->nbytes))
255  result->bytes[result->nbytes] = 0;
256 
257  if (nwritten)
258  *nwritten = totalpacked;
259 
260  Assert(SizeOfGinPostingList(result) <= maxsize);
261 
262  /*
263  * Check that the encoded segment decodes back to the original items.
264  */
265 #if defined (CHECK_ENCODING_ROUNDTRIP)
266  {
267  int ndecoded;
268  ItemPointer tmp = ginPostingListDecode(result, &ndecoded);
269 
270  Assert(ndecoded == totalpacked);
271  Assert(memcmp(tmp, ipd, ndecoded * sizeof(ItemPointerData)) == 0);
272  pfree(tmp);
273  }
274 #endif
275 
276  return result;
277 }
278 
279 /*
280  * Decode a compressed posting list into an array of item pointers.
281  * The number of items is returned in *ndecoded.
282  */
284 ginPostingListDecode(GinPostingList *plist, int *ndecoded)
285 {
286  return ginPostingListDecodeAllSegments(plist,
287  SizeOfGinPostingList(plist),
288  ndecoded);
289 }
290 
291 /*
292  * Decode multiple posting list segments into an array of item pointers.
293  * The number of items is returned in *ndecoded_out. The segments are stored
294  * one after each other, with total size 'len' bytes.
295  */
297 ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
298 {
299  ItemPointer result;
300  int nallocated;
301  uint64 val;
302  char *endseg = ((char *) segment) + len;
303  int ndecoded;
304  unsigned char *ptr;
305  unsigned char *endptr;
306 
307  /*
308  * Guess an initial size of the array.
309  */
310  nallocated = segment->nbytes * 2 + 1;
311  result = palloc(nallocated * sizeof(ItemPointerData));
312 
313  ndecoded = 0;
314  while ((char *) segment < endseg)
315  {
316  /* enlarge output array if needed */
317  if (ndecoded >= nallocated)
318  {
319  nallocated *= 2;
320  result = repalloc(result, nallocated * sizeof(ItemPointerData));
321  }
322 
323  /* copy the first item */
325  Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0);
326  result[ndecoded] = segment->first;
327  ndecoded++;
328 
329  val = itemptr_to_uint64(&segment->first);
330  ptr = segment->bytes;
331  endptr = segment->bytes + segment->nbytes;
332  while (ptr < endptr)
333  {
334  /* enlarge output array if needed */
335  if (ndecoded >= nallocated)
336  {
337  nallocated *= 2;
338  result = repalloc(result, nallocated * sizeof(ItemPointerData));
339  }
340 
341  val += decode_varbyte(&ptr);
342 
343  uint64_to_itemptr(val, &result[ndecoded]);
344  ndecoded++;
345  }
346  segment = GinNextPostingListSegment(segment);
347  }
348 
349  if (ndecoded_out)
350  *ndecoded_out = ndecoded;
351  return result;
352 }
353 
354 /*
355  * Add all item pointers from a bunch of posting lists to a TIDBitmap.
356  */
357 int
359  TIDBitmap *tbm)
360 {
361  int ndecoded;
362  ItemPointer items;
363 
364  items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
365  tbm_add_tuples(tbm, items, ndecoded, false);
366  pfree(items);
367 
368  return ndecoded;
369 }
370 
371 /*
372  * Merge two ordered arrays of itempointers, eliminating any duplicates.
373  *
374  * Returns a palloc'd array, and *nmerged is set to the number of items in
375  * the result, after eliminating duplicates.
376  */
379  ItemPointerData *b, uint32 nb,
380  int *nmerged)
381 {
382  ItemPointerData *dst;
383 
384  dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData));
385 
386  /*
387  * If the argument arrays don't overlap, we can just append them to each
388  * other.
389  */
390  if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0)
391  {
392  memcpy(dst, a, na * sizeof(ItemPointerData));
393  memcpy(&dst[na], b, nb * sizeof(ItemPointerData));
394  *nmerged = na + nb;
395  }
396  else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0)
397  {
398  memcpy(dst, b, nb * sizeof(ItemPointerData));
399  memcpy(&dst[nb], a, na * sizeof(ItemPointerData));
400  *nmerged = na + nb;
401  }
402  else
403  {
404  ItemPointerData *dptr = dst;
405  ItemPointerData *aptr = a;
406  ItemPointerData *bptr = b;
407 
408  while (aptr - a < na && bptr - b < nb)
409  {
410  int cmp = ginCompareItemPointers(aptr, bptr);
411 
412  if (cmp > 0)
413  *dptr++ = *bptr++;
414  else if (cmp == 0)
415  {
416  /* only keep one copy of the identical items */
417  *dptr++ = *bptr++;
418  aptr++;
419  }
420  else
421  *dptr++ = *aptr++;
422  }
423 
424  while (aptr - a < na)
425  *dptr++ = *aptr++;
426 
427  while (bptr - b < nb)
428  *dptr++ = *bptr++;
429 
430  *nmerged = dptr - dst;
431  }
432 
433  return dst;
434 }
#define ItemPointerIsValid(pointer)
Definition: itemptr.h:82
GinPostingList * ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize, int *nwritten)
#define GinItemPointerGetOffsetNumber(pointer)
Definition: ginblock.h:147
ItemPointer ginPostingListDecode(GinPostingList *plist, int *ndecoded)
int ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, TIDBitmap *tbm)
def bytes(source, encoding='ascii', errors='strict')
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:376
static uint64 decode_varbyte(unsigned char **ptr)
#define SHORTALIGN_DOWN(LEN)
Definition: c.h:706
ItemPointer ginMergeItemPointers(ItemPointerData *a, uint32 na, ItemPointerData *b, uint32 nb, int *nmerged)
ItemPointerData * ItemPointer
Definition: itemptr.h:49
ItemPointerData first
Definition: ginblock.h:338
void pfree(void *pointer)
Definition: mcxt.c:1057
#define GinNextPostingListSegment(cur)
Definition: ginblock.h:344
static void uint64_to_itemptr(uint64 val, ItemPointer iptr)
unsigned char bytes[FLEXIBLE_ARRAY_MEMBER]
Definition: ginblock.h:340
#define MaxBytesPerInteger
char * c
static char * buf
Definition: pg_test_fsync.c:67
unsigned int uint32
Definition: c.h:374
#define MaxHeapTuplesPerPageBits
uint16 nbytes
Definition: ginblock.h:339
#define Assert(condition)
Definition: c.h:745
#define GinItemPointerSetOffsetNumber(pointer, offnum)
Definition: ginblock.h:153
struct ItemPointerData ItemPointerData
#define GinItemPointerSetBlockNumber(pointer, blkno)
Definition: ginblock.h:150
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
static uint64 itemptr_to_uint64(const ItemPointer iptr)
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1070
static int ginCompareItemPointers(ItemPointer a, ItemPointer b)
Definition: gin_private.h:484
static void encode_varbyte(uint64 val, unsigned char **ptr)
void * palloc(Size size)
Definition: mcxt.c:950
#define SizeOfGinPostingList(plist)
Definition: ginblock.h:343
ItemPointer ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define SHORTALIGN(LEN)
Definition: c.h:694
#define GinItemPointerGetBlockNumber(pointer)
Definition: ginblock.h:144
long val
Definition: informix.c:664
#define offsetof(type, field)
Definition: c.h:668
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:742