PostgreSQL Source Code  git master
ginpostinglist.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * ginpostinglist.c
4  * routines for dealing with posting lists.
5  *
6  *
7  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/backend/access/gin/ginpostinglist.c
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "access/gin_private.h"
18 
19 #ifdef USE_ASSERT_CHECKING
20 #define CHECK_ENCODING_ROUNDTRIP
21 #endif
22 
23 /*
24  * For encoding purposes, item pointers are represented as 64-bit unsigned
25  * integers. The lowest 11 bits represent the offset number, and the next
26  * lowest 32 bits are the block number. That leaves 17 bits unused, i.e.
27  * only 43 low bits are used.
28  *
29  * These 43-bit integers are encoded using varbyte encoding. In each byte,
30  * the 7 low bits contain data, while the highest bit is a continuation bit.
31  * When the continuation bit is set, the next byte is part of the same
32  * integer, otherwise this is the last byte of this integer. 43 bits fit
33  * conveniently in at most 6 bytes when varbyte encoded (the 6th byte does
34  * not need a continuation bit, because we know the max size to be 43 bits):
35  *
36  * 0XXXXXXX
37  * 1XXXXXXX 0XXXXYYY
38  * 1XXXXXXX 1XXXXYYY 0YYYYYYY
39  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
40  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
41  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY YYYYYYYY
42  *
43  * X = bits used for offset number
44  * Y = bits used for block number
45  *
46  * The bytes are in stored in little-endian order.
47  *
48  * An important property of this encoding is that removing an item from list
49  * never increases the size of the resulting compressed posting list. Proof:
50  *
51  * Removing number is actually replacement of two numbers with their sum. We
52  * have to prove that varbyte encoding of a sum can't be longer than varbyte
53  * encoding of its summands. Sum of two numbers is at most one bit wider than
54  * the larger of the summands. Widening a number by one bit enlarges its length
55  * in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum
56  * is at most one byte longer than varbyte encoding of larger summand. Lesser
57  * summand is at least one byte, so the sum cannot take more space than the
58  * summands, Q.E.D.
59  *
60  * This property greatly simplifies VACUUM, which can assume that posting
61  * lists always fit on the same page after vacuuming. Note that even though
62  * that holds for removing items from a posting list, you must also be
63  * careful to not cause expansion e.g. when merging uncompressed items on the
64  * page into the compressed lists, when vacuuming.
65  */
66 
67 /*
68  * How many bits do you need to encode offset number? OffsetNumber is a 16-bit
69  * integer, but you can't fit that many items on a page. 11 ought to be more
70  * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and
71  * use the minimum number of bits, but that would require changing the on-disk
72  * format if MaxHeapTuplesPerPage changes. Better to leave some slack.
73  */
74 #define MaxHeapTuplesPerPageBits 11
75 
76 static inline uint64
78 {
79  uint64 val;
80 
83 
84  val = GinItemPointerGetBlockNumber(iptr);
86  val |= GinItemPointerGetOffsetNumber(iptr);
87 
88  return val;
89 }
90 
91 static inline void
93 {
95  val = val >> MaxHeapTuplesPerPageBits;
97 
99 }
100 
101 /*
102  * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer.
103  */
104 static void
105 encode_varbyte(uint64 val, unsigned char **ptr)
106 {
107  unsigned char *p = *ptr;
108 
109  while (val > 0x7F)
110  {
111  *(p++) = 0x80 | (val & 0x7F);
112  val >>= 7;
113  }
114  *(p++) = (unsigned char) val;
115 
116  *ptr = p;
117 }
118 
119 /*
120  * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer.
121  */
122 static uint64
123 decode_varbyte(unsigned char **ptr)
124 {
125  uint64 val;
126  unsigned char *p = *ptr;
127  uint64 c;
128 
129  c = *(p++);
130  val = c & 0x7F;
131  if (c & 0x80)
132  {
133  c = *(p++);
134  val |= (c & 0x7F) << 7;
135  if (c & 0x80)
136  {
137  c = *(p++);
138  val |= (c & 0x7F) << 14;
139  if (c & 0x80)
140  {
141  c = *(p++);
142  val |= (c & 0x7F) << 21;
143  if (c & 0x80)
144  {
145  c = *(p++);
146  val |= (c & 0x7F) << 28;
147  if (c & 0x80)
148  {
149  c = *(p++);
150  val |= (c & 0x7F) << 35;
151  if (c & 0x80)
152  {
153  /* last byte, no continuation bit */
154  c = *(p++);
155  val |= c << 42;
156  }
157  }
158  }
159  }
160  }
161  }
162 
163  *ptr = p;
164 
165  return val;
166 }
167 
168 /*
169  * Encode a posting list.
170  *
171  * The encoded list is returned in a palloc'd struct, which will be at most
172  * 'maxsize' bytes in size. The number items in the returned segment is
173  * returned in *nwritten. If it's not equal to nipd, not all the items fit
174  * in 'maxsize', and only the first *nwritten were encoded.
175  *
176  * The allocated size of the returned struct is short-aligned, and the padding
177  * byte at the end, if any, is zero.
178  */
180 ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize,
181  int *nwritten)
182 {
183  uint64 prev;
184  int totalpacked = 0;
185  int maxbytes;
186  GinPostingList *result;
187  unsigned char *ptr;
188  unsigned char *endptr;
189 
190  maxsize = SHORTALIGN_DOWN(maxsize);
191 
192  result = palloc(maxsize);
193 
194  maxbytes = maxsize - offsetof(GinPostingList, bytes);
195  Assert(maxbytes > 0);
196 
197  /* Store the first special item */
198  result->first = ipd[0];
199 
200  prev = itemptr_to_uint64(&result->first);
201 
202  ptr = result->bytes;
203  endptr = result->bytes + maxbytes;
204  for (totalpacked = 1; totalpacked < nipd; totalpacked++)
205  {
206  uint64 val = itemptr_to_uint64(&ipd[totalpacked]);
207  uint64 delta = val - prev;
208 
209  Assert(val > prev);
210 
211  if (endptr - ptr >= 6)
212  encode_varbyte(delta, &ptr);
213  else
214  {
215  /*
216  * There are less than 6 bytes left. Have to check if the next
217  * item fits in that space before writing it out.
218  */
219  unsigned char buf[6];
220  unsigned char *p = buf;
221 
222  encode_varbyte(delta, &p);
223  if (p - buf > (endptr - ptr))
224  break; /* output is full */
225 
226  memcpy(ptr, buf, p - buf);
227  ptr += (p - buf);
228  }
229  prev = val;
230  }
231  result->nbytes = ptr - result->bytes;
232 
233  /*
234  * If we wrote an odd number of bytes, zero out the padding byte at the
235  * end.
236  */
237  if (result->nbytes != SHORTALIGN(result->nbytes))
238  result->bytes[result->nbytes] = 0;
239 
240  if (nwritten)
241  *nwritten = totalpacked;
242 
243  Assert(SizeOfGinPostingList(result) <= maxsize);
244 
245  /*
246  * Check that the encoded segment decodes back to the original items.
247  */
248 #if defined (CHECK_ENCODING_ROUNDTRIP)
249  {
250  int ndecoded;
251  ItemPointer tmp = ginPostingListDecode(result, &ndecoded);
252  int i;
253 
254  Assert(ndecoded == totalpacked);
255  for (i = 0; i < ndecoded; i++)
256  Assert(memcmp(&tmp[i], &ipd[i], sizeof(ItemPointerData)) == 0);
257  pfree(tmp);
258  }
259 #endif
260 
261  return result;
262 }
263 
264 /*
265  * Decode a compressed posting list into an array of item pointers.
266  * The number of items is returned in *ndecoded.
267  */
269 ginPostingListDecode(GinPostingList *plist, int *ndecoded)
270 {
271  return ginPostingListDecodeAllSegments(plist,
272  SizeOfGinPostingList(plist),
273  ndecoded);
274 }
275 
276 /*
277  * Decode multiple posting list segments into an array of item pointers.
278  * The number of items is returned in *ndecoded_out. The segments are stored
279  * one after each other, with total size 'len' bytes.
280  */
282 ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
283 {
284  ItemPointer result;
285  int nallocated;
286  uint64 val;
287  char *endseg = ((char *) segment) + len;
288  int ndecoded;
289  unsigned char *ptr;
290  unsigned char *endptr;
291 
292  /*
293  * Guess an initial size of the array.
294  */
295  nallocated = segment->nbytes * 2 + 1;
296  result = palloc(nallocated * sizeof(ItemPointerData));
297 
298  ndecoded = 0;
299  while ((char *) segment < endseg)
300  {
301  /* enlarge output array if needed */
302  if (ndecoded >= nallocated)
303  {
304  nallocated *= 2;
305  result = repalloc(result, nallocated * sizeof(ItemPointerData));
306  }
307 
308  /* copy the first item */
310  Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0);
311  result[ndecoded] = segment->first;
312  ndecoded++;
313 
314  val = itemptr_to_uint64(&segment->first);
315  ptr = segment->bytes;
316  endptr = segment->bytes + segment->nbytes;
317  while (ptr < endptr)
318  {
319  /* enlarge output array if needed */
320  if (ndecoded >= nallocated)
321  {
322  nallocated *= 2;
323  result = repalloc(result, nallocated * sizeof(ItemPointerData));
324  }
325 
326  val += decode_varbyte(&ptr);
327 
328  uint64_to_itemptr(val, &result[ndecoded]);
329  ndecoded++;
330  }
331  segment = GinNextPostingListSegment(segment);
332  }
333 
334  if (ndecoded_out)
335  *ndecoded_out = ndecoded;
336  return result;
337 }
338 
339 /*
340  * Add all item pointers from a bunch of posting lists to a TIDBitmap.
341  */
342 int
344  TIDBitmap *tbm)
345 {
346  int ndecoded;
347  ItemPointer items;
348 
349  items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
350  tbm_add_tuples(tbm, items, ndecoded, false);
351  pfree(items);
352 
353  return ndecoded;
354 }
355 
356 /*
357  * Merge two ordered arrays of itempointers, eliminating any duplicates.
358  *
359  * Returns a palloc'd array, and *nmerged is set to the number of items in
360  * the result, after eliminating duplicates.
361  */
364  ItemPointerData *b, uint32 nb,
365  int *nmerged)
366 {
367  ItemPointerData *dst;
368 
369  dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData));
370 
371  /*
372  * If the argument arrays don't overlap, we can just append them to each
373  * other.
374  */
375  if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0)
376  {
377  memcpy(dst, a, na * sizeof(ItemPointerData));
378  memcpy(&dst[na], b, nb * sizeof(ItemPointerData));
379  *nmerged = na + nb;
380  }
381  else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0)
382  {
383  memcpy(dst, b, nb * sizeof(ItemPointerData));
384  memcpy(&dst[nb], a, na * sizeof(ItemPointerData));
385  *nmerged = na + nb;
386  }
387  else
388  {
389  ItemPointerData *dptr = dst;
390  ItemPointerData *aptr = a;
391  ItemPointerData *bptr = b;
392 
393  while (aptr - a < na && bptr - b < nb)
394  {
395  int cmp = ginCompareItemPointers(aptr, bptr);
396 
397  if (cmp > 0)
398  *dptr++ = *bptr++;
399  else if (cmp == 0)
400  {
401  /* only keep one copy of the identical items */
402  *dptr++ = *bptr++;
403  aptr++;
404  }
405  else
406  *dptr++ = *aptr++;
407  }
408 
409  while (aptr - a < na)
410  *dptr++ = *aptr++;
411 
412  while (bptr - b < nb)
413  *dptr++ = *bptr++;
414 
415  *nmerged = dptr - dst;
416  }
417 
418  return dst;
419 }
#define ItemPointerIsValid(pointer)
Definition: itemptr.h:60
GinPostingList * ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize, int *nwritten)
#define GinItemPointerGetOffsetNumber(pointer)
Definition: ginblock.h:137
ItemPointer ginPostingListDecode(GinPostingList *plist, int *ndecoded)
int ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, TIDBitmap *tbm)
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:376
static uint64 decode_varbyte(unsigned char **ptr)
#define SHORTALIGN_DOWN(LEN)
Definition: c.h:631
ItemPointer ginMergeItemPointers(ItemPointerData *a, uint32 na, ItemPointerData *b, uint32 nb, int *nmerged)
ItemPointerData * ItemPointer
Definition: itemptr.h:49
ItemPointerData first
Definition: ginblock.h:328
void pfree(void *pointer)
Definition: mcxt.c:949
#define GinNextPostingListSegment(cur)
Definition: ginblock.h:334
static void uint64_to_itemptr(uint64 val, ItemPointer iptr)
unsigned char bytes[FLEXIBLE_ARRAY_MEMBER]
Definition: ginblock.h:330
char * c
static char * buf
Definition: pg_test_fsync.c:67
unsigned int uint32
Definition: c.h:296
#define MaxHeapTuplesPerPageBits
uint16 nbytes
Definition: ginblock.h:329
#define Assert(condition)
Definition: c.h:670
#define GinItemPointerSetOffsetNumber(pointer, offnum)
Definition: ginblock.h:143
struct ItemPointerData ItemPointerData
#define GinItemPointerSetBlockNumber(pointer, blkno)
Definition: ginblock.h:140
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:95
static uint64 itemptr_to_uint64(const ItemPointer iptr)
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:962
static int ginCompareItemPointers(ItemPointer a, ItemPointer b)
Definition: gin_private.h:461
static void encode_varbyte(uint64 val, unsigned char **ptr)
void * palloc(Size size)
Definition: mcxt.c:848
#define SizeOfGinPostingList(plist)
Definition: ginblock.h:333
ItemPointer ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
int i
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:40
#define SHORTALIGN(LEN)
Definition: c.h:619
#define GinItemPointerGetBlockNumber(pointer)
Definition: ginblock.h:134
long val
Definition: informix.c:689
#define offsetof(type, field)
Definition: c.h:593
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:742