PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
ginpostinglist.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * ginpostinglist.c
4  * routines for dealing with posting lists.
5  *
6  *
7  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/backend/access/gin/ginpostinglist.c
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "access/gin_private.h"
18 
19 #ifdef USE_ASSERT_CHECKING
20 #define CHECK_ENCODING_ROUNDTRIP
21 #endif
22 
23 /*
24  * For encoding purposes, item pointers are represented as 64-bit unsigned
25  * integers. The lowest 11 bits represent the offset number, and the next
26  * lowest 32 bits are the block number. That leaves 17 bits unused, i.e.
27  * only 43 low bits are used.
28  *
29  * These 43-bit integers are encoded using varbyte encoding. In each byte,
30  * the 7 low bits contain data, while the highest bit is a continuation bit.
31  * When the continuation bit is set, the next byte is part of the same
32  * integer, otherwise this is the last byte of this integer. 43 bits need
33  * at most 7 bytes in this encoding:
34  *
35  * 0XXXXXXX
36  * 1XXXXXXX 0XXXXYYY
37  * 1XXXXXXX 1XXXXYYY 0YYYYYYY
38  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
39  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
40  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
41  * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0000000Y
42  *
43  * X = bits used for offset number
44  * Y = bits used for block number
45  *
46  * The bytes are stored in little-endian order.
47  *
48  * An important property of this encoding is that removing an item from list
49  * never increases the size of the resulting compressed posting list. Proof:
50  *
51  * Removing number is actually replacement of two numbers with their sum. We
52  * have to prove that varbyte encoding of a sum can't be longer than varbyte
53  * encoding of its summands. Sum of two numbers is at most one bit wider than
54  * the larger of the summands. Widening a number by one bit enlarges its length
55  * in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum
56  * is at most one byte longer than varbyte encoding of larger summand. Lesser
57  * summand is at least one byte, so the sum cannot take more space than the
58  * summands, Q.E.D.
59  *
60  * This property greatly simplifies VACUUM, which can assume that posting
61  * lists always fit on the same page after vacuuming. Note that even though
62  * that holds for removing items from a posting list, you must also be
63  * careful to not cause expansion e.g. when merging uncompressed items on the
64  * page into the compressed lists, when vacuuming.
65  */
66 
67 /*
68  * How many bits do you need to encode offset number? OffsetNumber is a 16-bit
69  * integer, but you can't fit that many items on a page. 11 ought to be more
70  * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and
71  * use the minimum number of bits, but that would require changing the on-disk
72  * format if MaxHeapTuplesPerPage changes. Better to leave some slack.
73  */
74 #define MaxHeapTuplesPerPageBits 11
75 
76 static inline uint64
78 {
79  uint64 val;
80 
83 
84  val = iptr->ip_blkid.bi_hi;
85  val <<= 16;
86  val |= iptr->ip_blkid.bi_lo;
88  val |= iptr->ip_posid;
89 
90  return val;
91 }
92 
93 static inline void
95 {
96  iptr->ip_posid = val & ((1 << MaxHeapTuplesPerPageBits) - 1);
97  val = val >> MaxHeapTuplesPerPageBits;
98  iptr->ip_blkid.bi_lo = val & 0xFFFF;
99  val = val >> 16;
100  iptr->ip_blkid.bi_hi = val & 0xFFFF;
101 
102  Assert(ItemPointerIsValid(iptr));
103 }
104 
105 /*
106  * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer.
107  */
108 static void
109 encode_varbyte(uint64 val, unsigned char **ptr)
110 {
111  unsigned char *p = *ptr;
112 
113  while (val > 0x7F)
114  {
115  *(p++) = 0x80 | (val & 0x7F);
116  val >>= 7;
117  }
118  *(p++) = (unsigned char) val;
119 
120  *ptr = p;
121 }
122 
123 /*
124  * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer.
125  */
126 static uint64
127 decode_varbyte(unsigned char **ptr)
128 {
129  uint64 val;
130  unsigned char *p = *ptr;
131  uint64 c;
132 
133  c = *(p++);
134  val = c & 0x7F;
135  if (c & 0x80)
136  {
137  c = *(p++);
138  val |= (c & 0x7F) << 7;
139  if (c & 0x80)
140  {
141  c = *(p++);
142  val |= (c & 0x7F) << 14;
143  if (c & 0x80)
144  {
145  c = *(p++);
146  val |= (c & 0x7F) << 21;
147  if (c & 0x80)
148  {
149  c = *(p++);
150  val |= (c & 0x7F) << 28;
151  if (c & 0x80)
152  {
153  c = *(p++);
154  val |= (c & 0x7F) << 35;
155  if (c & 0x80)
156  {
157  /* last byte, no continuation bit */
158  c = *(p++);
159  val |= c << 42;
160  }
161  }
162  }
163  }
164  }
165  }
166 
167  *ptr = p;
168 
169  return val;
170 }
171 
172 /*
173  * Encode a posting list.
174  *
175  * The encoded list is returned in a palloc'd struct, which will be at most
176  * 'maxsize' bytes in size. The number items in the returned segment is
177  * returned in *nwritten. If it's not equal to nipd, not all the items fit
178  * in 'maxsize', and only the first *nwritten were encoded.
179  *
180  * The allocated size of the returned struct is short-aligned, and the padding
181  * byte at the end, if any, is zero.
182  */
184 ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize,
185  int *nwritten)
186 {
187  uint64 prev;
188  int totalpacked = 0;
189  int maxbytes;
190  GinPostingList *result;
191  unsigned char *ptr;
192  unsigned char *endptr;
193 
194  maxsize = SHORTALIGN_DOWN(maxsize);
195 
196  result = palloc(maxsize);
197 
198  maxbytes = maxsize - offsetof(GinPostingList, bytes);
199  Assert(maxbytes > 0);
200 
201  /* Store the first special item */
202  result->first = ipd[0];
203 
204  prev = itemptr_to_uint64(&result->first);
205 
206  ptr = result->bytes;
207  endptr = result->bytes + maxbytes;
208  for (totalpacked = 1; totalpacked < nipd; totalpacked++)
209  {
210  uint64 val = itemptr_to_uint64(&ipd[totalpacked]);
211  uint64 delta = val - prev;
212 
213  Assert(val > prev);
214 
215  if (endptr - ptr >= 6)
216  encode_varbyte(delta, &ptr);
217  else
218  {
219  /*
220  * There are less than 6 bytes left. Have to check if the next
221  * item fits in that space before writing it out.
222  */
223  unsigned char buf[6];
224  unsigned char *p = buf;
225 
226  encode_varbyte(delta, &p);
227  if (p - buf > (endptr - ptr))
228  break; /* output is full */
229 
230  memcpy(ptr, buf, p - buf);
231  ptr += (p - buf);
232  }
233  prev = val;
234  }
235  result->nbytes = ptr - result->bytes;
236 
237  /*
238  * If we wrote an odd number of bytes, zero out the padding byte at the
239  * end.
240  */
241  if (result->nbytes != SHORTALIGN(result->nbytes))
242  result->bytes[result->nbytes] = 0;
243 
244  if (nwritten)
245  *nwritten = totalpacked;
246 
247  Assert(SizeOfGinPostingList(result) <= maxsize);
248 
249  /*
250  * Check that the encoded segment decodes back to the original items.
251  */
252 #if defined (CHECK_ENCODING_ROUNDTRIP)
253  {
254  int ndecoded;
255  ItemPointer tmp = ginPostingListDecode(result, &ndecoded);
256  int i;
257 
258  Assert(ndecoded == totalpacked);
259  for (i = 0; i < ndecoded; i++)
260  Assert(memcmp(&tmp[i], &ipd[i], sizeof(ItemPointerData)) == 0);
261  pfree(tmp);
262  }
263 #endif
264 
265  return result;
266 }
267 
268 /*
269  * Decode a compressed posting list into an array of item pointers.
270  * The number of items is returned in *ndecoded.
271  */
273 ginPostingListDecode(GinPostingList *plist, int *ndecoded)
274 {
275  return ginPostingListDecodeAllSegments(plist,
276  SizeOfGinPostingList(plist),
277  ndecoded);
278 }
279 
280 /*
281  * Decode multiple posting list segments into an array of item pointers.
282  * The number of items is returned in *ndecoded_out. The segments are stored
283  * one after each other, with total size 'len' bytes.
284  */
286 ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
287 {
288  ItemPointer result;
289  int nallocated;
290  uint64 val;
291  char *endseg = ((char *) segment) + len;
292  int ndecoded;
293  unsigned char *ptr;
294  unsigned char *endptr;
295 
296  /*
297  * Guess an initial size of the array.
298  */
299  nallocated = segment->nbytes * 2 + 1;
300  result = palloc(nallocated * sizeof(ItemPointerData));
301 
302  ndecoded = 0;
303  while ((char *) segment < endseg)
304  {
305  /* enlarge output array if needed */
306  if (ndecoded >= nallocated)
307  {
308  nallocated *= 2;
309  result = repalloc(result, nallocated * sizeof(ItemPointerData));
310  }
311 
312  /* copy the first item */
314  Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0);
315  result[ndecoded] = segment->first;
316  ndecoded++;
317 
318  val = itemptr_to_uint64(&segment->first);
319  ptr = segment->bytes;
320  endptr = segment->bytes + segment->nbytes;
321  while (ptr < endptr)
322  {
323  /* enlarge output array if needed */
324  if (ndecoded >= nallocated)
325  {
326  nallocated *= 2;
327  result = repalloc(result, nallocated * sizeof(ItemPointerData));
328  }
329 
330  val += decode_varbyte(&ptr);
331 
332  uint64_to_itemptr(val, &result[ndecoded]);
333  ndecoded++;
334  }
335  segment = GinNextPostingListSegment(segment);
336  }
337 
338  if (ndecoded_out)
339  *ndecoded_out = ndecoded;
340  return result;
341 }
342 
343 /*
344  * Add all item pointers from a bunch of posting lists to a TIDBitmap.
345  */
346 int
348  TIDBitmap *tbm)
349 {
350  int ndecoded;
351  ItemPointer items;
352 
353  items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
354  tbm_add_tuples(tbm, items, ndecoded, false);
355  pfree(items);
356 
357  return ndecoded;
358 }
359 
360 /*
361  * Merge two ordered arrays of itempointers, eliminating any duplicates.
362  *
363  * Returns a palloc'd array, and *nmerged is set to the number of items in
364  * the result, after eliminating duplicates.
365  */
368  ItemPointerData *b, uint32 nb,
369  int *nmerged)
370 {
371  ItemPointerData *dst;
372 
373  dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData));
374 
375  /*
376  * If the argument arrays don't overlap, we can just append them to each
377  * other.
378  */
379  if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0)
380  {
381  memcpy(dst, a, na * sizeof(ItemPointerData));
382  memcpy(&dst[na], b, nb * sizeof(ItemPointerData));
383  *nmerged = na + nb;
384  }
385  else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0)
386  {
387  memcpy(dst, b, nb * sizeof(ItemPointerData));
388  memcpy(&dst[nb], a, na * sizeof(ItemPointerData));
389  *nmerged = na + nb;
390  }
391  else
392  {
393  ItemPointerData *dptr = dst;
394  ItemPointerData *aptr = a;
395  ItemPointerData *bptr = b;
396 
397  while (aptr - a < na && bptr - b < nb)
398  {
399  int cmp = ginCompareItemPointers(aptr, bptr);
400 
401  if (cmp > 0)
402  *dptr++ = *bptr++;
403  else if (cmp == 0)
404  {
405  /* only keep one copy of the identical items */
406  *dptr++ = *bptr++;
407  aptr++;
408  }
409  else
410  *dptr++ = *aptr++;
411  }
412 
413  while (aptr - a < na)
414  *dptr++ = *aptr++;
415 
416  while (bptr - b < nb)
417  *dptr++ = *bptr++;
418 
419  *nmerged = dptr - dst;
420  }
421 
422  return dst;
423 }
#define ItemPointerIsValid(pointer)
Definition: itemptr.h:59
GinPostingList * ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize, int *nwritten)
ItemPointer ginPostingListDecode(GinPostingList *plist, int *ndecoded)
int ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, TIDBitmap *tbm)
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:290
static uint64 decode_varbyte(unsigned char **ptr)
#define SHORTALIGN_DOWN(LEN)
Definition: c.h:591
ItemPointer ginMergeItemPointers(ItemPointerData *a, uint32 na, ItemPointerData *b, uint32 nb, int *nmerged)
uint16 bi_hi
Definition: block.h:55
ItemPointerData * ItemPointer
Definition: itemptr.h:48
ItemPointerData first
Definition: ginblock.h:321
void pfree(void *pointer)
Definition: mcxt.c:992
#define GinNextPostingListSegment(cur)
Definition: ginblock.h:327
BlockIdData ip_blkid
Definition: itemptr.h:38
static void uint64_to_itemptr(uint64 val, ItemPointer iptr)
unsigned char bytes[FLEXIBLE_ARRAY_MEMBER]
Definition: ginblock.h:323
char * c
static char * buf
Definition: pg_test_fsync.c:65
unsigned int uint32
Definition: c.h:265
#define MaxHeapTuplesPerPageBits
uint16 nbytes
Definition: ginblock.h:322
#define Assert(condition)
Definition: c.h:670
struct ItemPointerData ItemPointerData
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:76
static uint64 itemptr_to_uint64(const ItemPointer iptr)
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1021
static int ginCompareItemPointers(ItemPointer a, ItemPointer b)
Definition: gin_private.h:461
static void encode_varbyte(uint64 val, unsigned char **ptr)
void * palloc(Size size)
Definition: mcxt.c:891
#define SizeOfGinPostingList(plist)
Definition: ginblock.h:326
ItemPointer ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
int i
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:40
#define SHORTALIGN(LEN)
Definition: c.h:579
OffsetNumber ip_posid
Definition: itemptr.h:39
long val
Definition: informix.c:689
#define offsetof(type, field)
Definition: c.h:550
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:742
uint16 bi_lo
Definition: block.h:56