PostgreSQL Source Code  git master
varlena.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  * Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/detoast.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "common/hashfn.h"
25 #include "common/int.h"
26 #include "common/unicode_norm.h"
27 #include "funcapi.h"
28 #include "lib/hyperloglog.h"
29 #include "libpq/pqformat.h"
30 #include "miscadmin.h"
31 #include "nodes/execnodes.h"
32 #include "parser/scansup.h"
33 #include "port/pg_bswap.h"
34 #include "regex/regex.h"
35 #include "utils/builtins.h"
36 #include "utils/bytea.h"
37 #include "utils/guc.h"
38 #include "utils/lsyscache.h"
39 #include "utils/memutils.h"
40 #include "utils/pg_locale.h"
41 #include "utils/sortsupport.h"
42 #include "utils/varlena.h"
43 
44 
45 /* GUC variable */
47 
48 typedef struct varlena unknown;
49 typedef struct varlena VarString;
50 
51 /*
52  * State for text_position_* functions.
53  */
54 typedef struct
55 {
56  bool is_multibyte_char_in_char; /* need to check char boundaries? */
57 
58  char *str1; /* haystack string */
59  char *str2; /* needle string */
60  int len1; /* string lengths in bytes */
61  int len2;
62 
63  /* Skip table for Boyer-Moore-Horspool search algorithm: */
64  int skiptablemask; /* mask for ANDing with skiptable subscripts */
65  int skiptable[256]; /* skip distance for given mismatched char */
66 
67  char *last_match; /* pointer to last match in 'str1' */
68 
69  /*
70  * Sometimes we need to convert the byte position of a match to a
71  * character position. These store the last position that was converted,
72  * so that on the next call, we can continue from that point, rather than
73  * count characters from the very beginning.
74  */
75  char *refpoint; /* pointer within original haystack string */
76  int refpos; /* 0-based character offset of the same point */
78 
79 typedef struct
80 {
81  char *buf1; /* 1st string, or abbreviation original string
82  * buf */
83  char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
84  int buflen1; /* Allocated length of buf1 */
85  int buflen2; /* Allocated length of buf2 */
86  int last_len1; /* Length of last buf1 string/strxfrm() input */
87  int last_len2; /* Length of last buf2 string/strxfrm() blob */
88  int last_returned; /* Last comparison result (cache) */
89  bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
90  bool collate_c;
91  Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
92  hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
93  hyperLogLogState full_card; /* Full key cardinality state */
94  double prop_card; /* Required cardinality proportion */
97 
98 /*
99  * Output data for split_text(): we output either to an array or a table.
100  * tupstore and tupdesc must be set up in advance to output to a table.
101  */
102 typedef struct
103 {
108 
109 /*
110  * This should be large enough that most strings will fit, but small enough
111  * that we feel comfortable putting it on the stack
112  */
113 #define TEXTBUFLEN 1024
114 
115 #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X))
116 #define DatumGetUnknownPCopy(X) ((unknown *) PG_DETOAST_DATUM_COPY(X))
117 #define PG_GETARG_UNKNOWN_P(n) DatumGetUnknownP(PG_GETARG_DATUM(n))
118 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
119 #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x)
120 
121 #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
122 #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
123 
124 static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
125 static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
126 static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
127 static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
128 static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
129 static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
130 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
131 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
132 static int32 text_length(Datum str);
133 static text *text_catenate(text *t1, text *t2);
134 static text *text_substring(Datum str,
135  int32 start,
136  int32 length,
137  bool length_not_specified);
138 static text *text_overlay(text *t1, text *t2, int sp, int sl);
139 static int text_position(text *t1, text *t2, Oid collid);
140 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
142 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
146 static void check_collation_set(Oid collid);
147 static int text_cmp(text *arg1, text *arg2, Oid collid);
148 static bytea *bytea_catenate(bytea *t1, bytea *t2);
150  int S,
151  int L,
152  bool length_not_specified);
153 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
154 static void appendStringInfoText(StringInfo str, const text *t);
155 static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
156 static void split_text_accum_result(SplitTextOutputData *tstate,
157  text *field_value,
158  text *null_string,
159  Oid collation);
161  const char *fldsep, const char *null_string);
163 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
164  int *value);
165 static const char *text_format_parse_format(const char *start_ptr,
166  const char *end_ptr,
167  int *argpos, int *widthpos,
168  int *flags, int *width);
169 static void text_format_string_conversion(StringInfo buf, char conversion,
170  FmgrInfo *typOutputInfo,
171  Datum value, bool isNull,
172  int flags, int width);
173 static void text_format_append_string(StringInfo buf, const char *str,
174  int flags, int width);
175 
176 
177 /*****************************************************************************
178  * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
179  *****************************************************************************/
180 
181 /*
182  * cstring_to_text
183  *
184  * Create a text value from a null-terminated C string.
185  *
186  * The new text value is freshly palloc'd with a full-size VARHDR.
187  */
188 text *
189 cstring_to_text(const char *s)
190 {
191  return cstring_to_text_with_len(s, strlen(s));
192 }
193 
194 /*
195  * cstring_to_text_with_len
196  *
197  * Same as cstring_to_text except the caller specifies the string length;
198  * the string need not be null_terminated.
199  */
200 text *
201 cstring_to_text_with_len(const char *s, int len)
202 {
203  text *result = (text *) palloc(len + VARHDRSZ);
204 
205  SET_VARSIZE(result, len + VARHDRSZ);
206  memcpy(VARDATA(result), s, len);
207 
208  return result;
209 }
210 
211 /*
212  * text_to_cstring
213  *
214  * Create a palloc'd, null-terminated C string from a text value.
215  *
216  * We support being passed a compressed or toasted text value.
217  * This is a bit bogus since such values shouldn't really be referred to as
218  * "text *", but it seems useful for robustness. If we didn't handle that
219  * case here, we'd need another routine that did, anyway.
220  */
221 char *
223 {
224  /* must cast away the const, unfortunately */
225  text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
226  int len = VARSIZE_ANY_EXHDR(tunpacked);
227  char *result;
228 
229  result = (char *) palloc(len + 1);
230  memcpy(result, VARDATA_ANY(tunpacked), len);
231  result[len] = '\0';
232 
233  if (tunpacked != t)
234  pfree(tunpacked);
235 
236  return result;
237 }
238 
239 /*
240  * text_to_cstring_buffer
241  *
242  * Copy a text value into a caller-supplied buffer of size dst_len.
243  *
244  * The text string is truncated if necessary to fit. The result is
245  * guaranteed null-terminated (unless dst_len == 0).
246  *
247  * We support being passed a compressed or toasted text value.
248  * This is a bit bogus since such values shouldn't really be referred to as
249  * "text *", but it seems useful for robustness. If we didn't handle that
250  * case here, we'd need another routine that did, anyway.
251  */
252 void
253 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
254 {
255  /* must cast away the const, unfortunately */
256  text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
257  size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
258 
259  if (dst_len > 0)
260  {
261  dst_len--;
262  if (dst_len >= src_len)
263  dst_len = src_len;
264  else /* ensure truncation is encoding-safe */
265  dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
266  memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
267  dst[dst_len] = '\0';
268  }
269 
270  if (srcunpacked != src)
271  pfree(srcunpacked);
272 }
273 
274 
275 /*****************************************************************************
276  * USER I/O ROUTINES *
277  *****************************************************************************/
278 
279 
280 #define VAL(CH) ((CH) - '0')
281 #define DIG(VAL) ((VAL) + '0')
282 
283 /*
284  * byteain - converts from printable representation of byte array
285  *
286  * Non-printable characters must be passed as '\nnn' (octal) and are
287  * converted to internal form. '\' must be passed as '\\'.
288  * ereport(ERROR, ...) if bad form.
289  *
290  * BUGS:
291  * The input is scanned twice.
292  * The error checking of input is minimal.
293  */
294 Datum
296 {
297  char *inputText = PG_GETARG_CSTRING(0);
298  char *tp;
299  char *rp;
300  int bc;
301  bytea *result;
302 
303  /* Recognize hex input */
304  if (inputText[0] == '\\' && inputText[1] == 'x')
305  {
306  size_t len = strlen(inputText);
307 
308  bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
309  result = palloc(bc);
310  bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
311  SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
312 
313  PG_RETURN_BYTEA_P(result);
314  }
315 
316  /* Else, it's the traditional escaped style */
317  for (bc = 0, tp = inputText; *tp != '\0'; bc++)
318  {
319  if (tp[0] != '\\')
320  tp++;
321  else if ((tp[0] == '\\') &&
322  (tp[1] >= '0' && tp[1] <= '3') &&
323  (tp[2] >= '0' && tp[2] <= '7') &&
324  (tp[3] >= '0' && tp[3] <= '7'))
325  tp += 4;
326  else if ((tp[0] == '\\') &&
327  (tp[1] == '\\'))
328  tp += 2;
329  else
330  {
331  /*
332  * one backslash, not followed by another or ### valid octal
333  */
334  ereport(ERROR,
335  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
336  errmsg("invalid input syntax for type %s", "bytea")));
337  }
338  }
339 
340  bc += VARHDRSZ;
341 
342  result = (bytea *) palloc(bc);
343  SET_VARSIZE(result, bc);
344 
345  tp = inputText;
346  rp = VARDATA(result);
347  while (*tp != '\0')
348  {
349  if (tp[0] != '\\')
350  *rp++ = *tp++;
351  else if ((tp[0] == '\\') &&
352  (tp[1] >= '0' && tp[1] <= '3') &&
353  (tp[2] >= '0' && tp[2] <= '7') &&
354  (tp[3] >= '0' && tp[3] <= '7'))
355  {
356  bc = VAL(tp[1]);
357  bc <<= 3;
358  bc += VAL(tp[2]);
359  bc <<= 3;
360  *rp++ = bc + VAL(tp[3]);
361 
362  tp += 4;
363  }
364  else if ((tp[0] == '\\') &&
365  (tp[1] == '\\'))
366  {
367  *rp++ = '\\';
368  tp += 2;
369  }
370  else
371  {
372  /*
373  * We should never get here. The first pass should not allow it.
374  */
375  ereport(ERROR,
376  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
377  errmsg("invalid input syntax for type %s", "bytea")));
378  }
379  }
380 
381  PG_RETURN_BYTEA_P(result);
382 }
383 
384 /*
385  * byteaout - converts to printable representation of byte array
386  *
387  * In the traditional escaped format, non-printable characters are
388  * printed as '\nnn' (octal) and '\' as '\\'.
389  */
390 Datum
392 {
393  bytea *vlena = PG_GETARG_BYTEA_PP(0);
394  char *result;
395  char *rp;
396 
398  {
399  /* Print hex format */
400  rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
401  *rp++ = '\\';
402  *rp++ = 'x';
403  rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
404  }
405  else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
406  {
407  /* Print traditional escaped format */
408  char *vp;
409  uint64 len;
410  int i;
411 
412  len = 1; /* empty string has 1 char */
413  vp = VARDATA_ANY(vlena);
414  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
415  {
416  if (*vp == '\\')
417  len += 2;
418  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
419  len += 4;
420  else
421  len++;
422  }
423 
424  /*
425  * In principle len can't overflow uint32 if the input fit in 1GB, but
426  * for safety let's check rather than relying on palloc's internal
427  * check.
428  */
429  if (len > MaxAllocSize)
430  ereport(ERROR,
431  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
432  errmsg_internal("result of bytea output conversion is too large")));
433  rp = result = (char *) palloc(len);
434 
435  vp = VARDATA_ANY(vlena);
436  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
437  {
438  if (*vp == '\\')
439  {
440  *rp++ = '\\';
441  *rp++ = '\\';
442  }
443  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
444  {
445  int val; /* holds unprintable chars */
446 
447  val = *vp;
448  rp[0] = '\\';
449  rp[3] = DIG(val & 07);
450  val >>= 3;
451  rp[2] = DIG(val & 07);
452  val >>= 3;
453  rp[1] = DIG(val & 03);
454  rp += 4;
455  }
456  else
457  *rp++ = *vp;
458  }
459  }
460  else
461  {
462  elog(ERROR, "unrecognized bytea_output setting: %d",
463  bytea_output);
464  rp = result = NULL; /* keep compiler quiet */
465  }
466  *rp = '\0';
467  PG_RETURN_CSTRING(result);
468 }
469 
470 /*
471  * bytearecv - converts external binary format to bytea
472  */
473 Datum
475 {
477  bytea *result;
478  int nbytes;
479 
480  nbytes = buf->len - buf->cursor;
481  result = (bytea *) palloc(nbytes + VARHDRSZ);
482  SET_VARSIZE(result, nbytes + VARHDRSZ);
483  pq_copymsgbytes(buf, VARDATA(result), nbytes);
484  PG_RETURN_BYTEA_P(result);
485 }
486 
487 /*
488  * byteasend - converts bytea to binary format
489  *
490  * This is a special case: just copy the input...
491  */
492 Datum
494 {
495  bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
496 
497  PG_RETURN_BYTEA_P(vlena);
498 }
499 
500 Datum
502 {
504 
505  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
506 
507  /* Append the value unless null. */
508  if (!PG_ARGISNULL(1))
509  {
511 
512  /* On the first time through, we ignore the delimiter. */
513  if (state == NULL)
514  state = makeStringAggState(fcinfo);
515  else if (!PG_ARGISNULL(2))
516  {
517  bytea *delim = PG_GETARG_BYTEA_PP(2);
518 
520  }
521 
523  }
524 
525  /*
526  * The transition type for string_agg() is declared to be "internal",
527  * which is a pass-by-value type the same size as a pointer.
528  */
530 }
531 
532 Datum
534 {
536 
537  /* cannot be called directly because of internal-type argument */
538  Assert(AggCheckCallContext(fcinfo, NULL));
539 
540  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
541 
542  if (state != NULL)
543  {
544  bytea *result;
545 
546  result = (bytea *) palloc(state->len + VARHDRSZ);
547  SET_VARSIZE(result, state->len + VARHDRSZ);
548  memcpy(VARDATA(result), state->data, state->len);
549  PG_RETURN_BYTEA_P(result);
550  }
551  else
552  PG_RETURN_NULL();
553 }
554 
555 /*
556  * textin - converts "..." to internal representation
557  */
558 Datum
560 {
561  char *inputText = PG_GETARG_CSTRING(0);
562 
563  PG_RETURN_TEXT_P(cstring_to_text(inputText));
564 }
565 
566 /*
567  * textout - converts internal representation to "..."
568  */
569 Datum
571 {
572  Datum txt = PG_GETARG_DATUM(0);
573 
575 }
576 
577 /*
578  * textrecv - converts external binary format to text
579  */
580 Datum
582 {
584  text *result;
585  char *str;
586  int nbytes;
587 
588  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
589 
590  result = cstring_to_text_with_len(str, nbytes);
591  pfree(str);
592  PG_RETURN_TEXT_P(result);
593 }
594 
595 /*
596  * textsend - converts text to binary format
597  */
598 Datum
600 {
601  text *t = PG_GETARG_TEXT_PP(0);
603 
607 }
608 
609 
610 /*
611  * unknownin - converts "..." to internal representation
612  */
613 Datum
615 {
616  char *str = PG_GETARG_CSTRING(0);
617 
618  /* representation is same as cstring */
620 }
621 
622 /*
623  * unknownout - converts internal representation to "..."
624  */
625 Datum
627 {
628  /* representation is same as cstring */
629  char *str = PG_GETARG_CSTRING(0);
630 
632 }
633 
634 /*
635  * unknownrecv - converts external binary format to unknown
636  */
637 Datum
639 {
641  char *str;
642  int nbytes;
643 
644  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
645  /* representation is same as cstring */
647 }
648 
649 /*
650  * unknownsend - converts unknown to binary format
651  */
652 Datum
654 {
655  /* representation is same as cstring */
656  char *str = PG_GETARG_CSTRING(0);
658 
660  pq_sendtext(&buf, str, strlen(str));
662 }
663 
664 
665 /* ========== PUBLIC ROUTINES ========== */
666 
667 /*
668  * textlen -
669  * returns the logical length of a text*
670  * (which is less than the VARSIZE of the text*)
671  */
672 Datum
674 {
676 
677  /* try to avoid decompressing argument */
679 }
680 
681 /*
682  * text_length -
683  * Does the real work for textlen()
684  *
685  * This is broken out so it can be called directly by other string processing
686  * functions. Note that the argument is passed as a Datum, to indicate that
687  * it may still be in compressed form. We can avoid decompressing it at all
688  * in some cases.
689  */
690 static int32
692 {
693  /* fastpath when max encoding length is one */
696  else
697  {
698  text *t = DatumGetTextPP(str);
699 
701  VARSIZE_ANY_EXHDR(t)));
702  }
703 }
704 
705 /*
706  * textoctetlen -
707  * returns the physical length of a text*
708  * (which is less than the VARSIZE of the text*)
709  */
710 Datum
712 {
714 
715  /* We need not detoast the input at all */
717 }
718 
719 /*
720  * textcat -
721  * takes two text* and returns a text* that is the concatenation of
722  * the two.
723  *
724  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
725  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
726  * Allocate space for output in all cases.
727  * XXX - thomas 1997-07-10
728  */
729 Datum
731 {
732  text *t1 = PG_GETARG_TEXT_PP(0);
733  text *t2 = PG_GETARG_TEXT_PP(1);
734 
736 }
737 
738 /*
739  * text_catenate
740  * Guts of textcat(), broken out so it can be used by other functions
741  *
742  * Arguments can be in short-header form, but not compressed or out-of-line
743  */
744 static text *
746 {
747  text *result;
748  int len1,
749  len2,
750  len;
751  char *ptr;
752 
753  len1 = VARSIZE_ANY_EXHDR(t1);
754  len2 = VARSIZE_ANY_EXHDR(t2);
755 
756  /* paranoia ... probably should throw error instead? */
757  if (len1 < 0)
758  len1 = 0;
759  if (len2 < 0)
760  len2 = 0;
761 
762  len = len1 + len2 + VARHDRSZ;
763  result = (text *) palloc(len);
764 
765  /* Set size of result string... */
766  SET_VARSIZE(result, len);
767 
768  /* Fill data field of result string... */
769  ptr = VARDATA(result);
770  if (len1 > 0)
771  memcpy(ptr, VARDATA_ANY(t1), len1);
772  if (len2 > 0)
773  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
774 
775  return result;
776 }
777 
778 /*
779  * charlen_to_bytelen()
780  * Compute the number of bytes occupied by n characters starting at *p
781  *
782  * It is caller's responsibility that there actually are n characters;
783  * the string need not be null-terminated.
784  */
785 static int
786 charlen_to_bytelen(const char *p, int n)
787 {
789  {
790  /* Optimization for single-byte encodings */
791  return n;
792  }
793  else
794  {
795  const char *s;
796 
797  for (s = p; n > 0; n--)
798  s += pg_mblen(s);
799 
800  return s - p;
801  }
802 }
803 
804 /*
805  * text_substr()
806  * Return a substring starting at the specified position.
807  * - thomas 1997-12-31
808  *
809  * Input:
810  * - string
811  * - starting position (is one-based)
812  * - string length
813  *
814  * If the starting position is zero or less, then return from the start of the string
815  * adjusting the length to be consistent with the "negative start" per SQL.
816  * If the length is less than zero, an error is raised; if no length is given, return the remaining string.
817  *
818  * Added multibyte support.
819  * - Tatsuo Ishii 1998-4-21
820  * Changed behavior if starting position is less than one to conform to SQL behavior.
821  * Formerly returned the entire string; now returns a portion.
822  * - Thomas Lockhart 1998-12-10
823  * Now uses faster TOAST-slicing interface
824  * - John Gray 2002-02-22
825  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
826  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
827  * error; if E < 1, return '', not entire string). Fixed MB related bug when
828  * S > LC and < LC + 4 sometimes garbage characters are returned.
829  * - Joe Conway 2002-08-10
830  */
831 Datum
833 {
835  PG_GETARG_INT32(1),
836  PG_GETARG_INT32(2),
837  false));
838 }
839 
840 /*
841  * text_substr_no_len -
842  * Wrapper to avoid opr_sanity failure due to
843  * one function accepting a different number of args.
844  */
845 Datum
847 {
849  PG_GETARG_INT32(1),
850  -1, true));
851 }
852 
853 /*
854  * text_substring -
855  * Does the real work for text_substr() and text_substr_no_len()
856  *
857  * This is broken out so it can be called directly by other string processing
858  * functions. Note that the argument is passed as a Datum, to indicate that
859  * it may still be in compressed/toasted form. We can avoid detoasting all
860  * of it in some cases.
861  *
862  * The result is always a freshly palloc'd datum.
863  */
864 static text *
865 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
866 {
868  int32 S = start; /* start position */
869  int32 S1; /* adjusted start position */
870  int32 L1; /* adjusted substring length */
871  int32 E; /* end position */
872 
873  /*
874  * SQL99 says S can be zero or negative, but we still must fetch from the
875  * start of the string.
876  */
877  S1 = Max(S, 1);
878 
879  /* life is easy if the encoding max length is 1 */
880  if (eml == 1)
881  {
882  if (length_not_specified) /* special case - get length to end of
883  * string */
884  L1 = -1;
885  else if (length < 0)
886  {
887  /* SQL99 says to throw an error for E < S, i.e., negative length */
888  ereport(ERROR,
889  (errcode(ERRCODE_SUBSTRING_ERROR),
890  errmsg("negative substring length not allowed")));
891  L1 = -1; /* silence stupider compilers */
892  }
893  else if (pg_add_s32_overflow(S, length, &E))
894  {
895  /*
896  * L could be large enough for S + L to overflow, in which case
897  * the substring must run to end of string.
898  */
899  L1 = -1;
900  }
901  else
902  {
903  /*
904  * A zero or negative value for the end position can happen if the
905  * start was negative or one. SQL99 says to return a zero-length
906  * string.
907  */
908  if (E < 1)
909  return cstring_to_text("");
910 
911  L1 = E - S1;
912  }
913 
914  /*
915  * If the start position is past the end of the string, SQL99 says to
916  * return a zero-length string -- DatumGetTextPSlice() will do that
917  * for us. We need only convert S1 to zero-based starting position.
918  */
919  return DatumGetTextPSlice(str, S1 - 1, L1);
920  }
921  else if (eml > 1)
922  {
923  /*
924  * When encoding max length is > 1, we can't get LC without
925  * detoasting, so we'll grab a conservatively large slice now and go
926  * back later to do the right thing
927  */
928  int32 slice_start;
929  int32 slice_size;
930  int32 slice_strlen;
931  text *slice;
932  int32 E1;
933  int32 i;
934  char *p;
935  char *s;
936  text *ret;
937 
938  /*
939  * We need to start at position zero because there is no way to know
940  * in advance which byte offset corresponds to the supplied start
941  * position.
942  */
943  slice_start = 0;
944 
945  if (length_not_specified) /* special case - get length to end of
946  * string */
947  slice_size = L1 = -1;
948  else if (length < 0)
949  {
950  /* SQL99 says to throw an error for E < S, i.e., negative length */
951  ereport(ERROR,
952  (errcode(ERRCODE_SUBSTRING_ERROR),
953  errmsg("negative substring length not allowed")));
954  slice_size = L1 = -1; /* silence stupider compilers */
955  }
956  else if (pg_add_s32_overflow(S, length, &E))
957  {
958  /*
959  * L could be large enough for S + L to overflow, in which case
960  * the substring must run to end of string.
961  */
962  slice_size = L1 = -1;
963  }
964  else
965  {
966  /*
967  * A zero or negative value for the end position can happen if the
968  * start was negative or one. SQL99 says to return a zero-length
969  * string.
970  */
971  if (E < 1)
972  return cstring_to_text("");
973 
974  /*
975  * if E is past the end of the string, the tuple toaster will
976  * truncate the length for us
977  */
978  L1 = E - S1;
979 
980  /*
981  * Total slice size in bytes can't be any longer than the start
982  * position plus substring length times the encoding max length.
983  * If that overflows, we can just use -1.
984  */
985  if (pg_mul_s32_overflow(E, eml, &slice_size))
986  slice_size = -1;
987  }
988 
989  /*
990  * If we're working with an untoasted source, no need to do an extra
991  * copying step.
992  */
995  slice = DatumGetTextPSlice(str, slice_start, slice_size);
996  else
997  slice = (text *) DatumGetPointer(str);
998 
999  /* see if we got back an empty string */
1000  if (VARSIZE_ANY_EXHDR(slice) == 0)
1001  {
1002  if (slice != (text *) DatumGetPointer(str))
1003  pfree(slice);
1004  return cstring_to_text("");
1005  }
1006 
1007  /* Now we can get the actual length of the slice in MB characters */
1008  slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1009  VARSIZE_ANY_EXHDR(slice));
1010 
1011  /*
1012  * Check that the start position wasn't > slice_strlen. If so, SQL99
1013  * says to return a zero-length string.
1014  */
1015  if (S1 > slice_strlen)
1016  {
1017  if (slice != (text *) DatumGetPointer(str))
1018  pfree(slice);
1019  return cstring_to_text("");
1020  }
1021 
1022  /*
1023  * Adjust L1 and E1 now that we know the slice string length. Again
1024  * remember that S1 is one based, and slice_start is zero based.
1025  */
1026  if (L1 > -1)
1027  E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1028  else
1029  E1 = slice_start + 1 + slice_strlen;
1030 
1031  /*
1032  * Find the start position in the slice; remember S1 is not zero based
1033  */
1034  p = VARDATA_ANY(slice);
1035  for (i = 0; i < S1 - 1; i++)
1036  p += pg_mblen(p);
1037 
1038  /* hang onto a pointer to our start position */
1039  s = p;
1040 
1041  /*
1042  * Count the actual bytes used by the substring of the requested
1043  * length.
1044  */
1045  for (i = S1; i < E1; i++)
1046  p += pg_mblen(p);
1047 
1048  ret = (text *) palloc(VARHDRSZ + (p - s));
1049  SET_VARSIZE(ret, VARHDRSZ + (p - s));
1050  memcpy(VARDATA(ret), s, (p - s));
1051 
1052  if (slice != (text *) DatumGetPointer(str))
1053  pfree(slice);
1054 
1055  return ret;
1056  }
1057  else
1058  elog(ERROR, "invalid backend encoding: encoding max length < 1");
1059 
1060  /* not reached: suppress compiler warning */
1061  return NULL;
1062 }
1063 
1064 /*
1065  * textoverlay
1066  * Replace specified substring of first string with second
1067  *
1068  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1069  * This code is a direct implementation of what the standard says.
1070  */
1071 Datum
1073 {
1074  text *t1 = PG_GETARG_TEXT_PP(0);
1075  text *t2 = PG_GETARG_TEXT_PP(1);
1076  int sp = PG_GETARG_INT32(2); /* substring start position */
1077  int sl = PG_GETARG_INT32(3); /* substring length */
1078 
1079  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1080 }
1081 
1082 Datum
1084 {
1085  text *t1 = PG_GETARG_TEXT_PP(0);
1086  text *t2 = PG_GETARG_TEXT_PP(1);
1087  int sp = PG_GETARG_INT32(2); /* substring start position */
1088  int sl;
1089 
1090  sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1091  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1092 }
1093 
1094 static text *
1095 text_overlay(text *t1, text *t2, int sp, int sl)
1096 {
1097  text *result;
1098  text *s1;
1099  text *s2;
1100  int sp_pl_sl;
1101 
1102  /*
1103  * Check for possible integer-overflow cases. For negative sp, throw a
1104  * "substring length" error because that's what should be expected
1105  * according to the spec's definition of OVERLAY().
1106  */
1107  if (sp <= 0)
1108  ereport(ERROR,
1109  (errcode(ERRCODE_SUBSTRING_ERROR),
1110  errmsg("negative substring length not allowed")));
1111  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1112  ereport(ERROR,
1113  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1114  errmsg("integer out of range")));
1115 
1116  s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1117  s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1118  result = text_catenate(s1, t2);
1119  result = text_catenate(result, s2);
1120 
1121  return result;
1122 }
1123 
1124 /*
1125  * textpos -
1126  * Return the position of the specified substring.
1127  * Implements the SQL POSITION() function.
1128  * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1129  * - thomas 1997-07-27
1130  */
1131 Datum
1133 {
1134  text *str = PG_GETARG_TEXT_PP(0);
1135  text *search_str = PG_GETARG_TEXT_PP(1);
1136 
1138 }
1139 
1140 /*
1141  * text_position -
1142  * Does the real work for textpos()
1143  *
1144  * Inputs:
1145  * t1 - string to be searched
1146  * t2 - pattern to match within t1
1147  * Result:
1148  * Character index of the first matched char, starting from 1,
1149  * or 0 if no match.
1150  *
1151  * This is broken out so it can be called directly by other string processing
1152  * functions.
1153  */
1154 static int
1155 text_position(text *t1, text *t2, Oid collid)
1156 {
1158  int result;
1159 
1160  /* Empty needle always matches at position 1 */
1161  if (VARSIZE_ANY_EXHDR(t2) < 1)
1162  return 1;
1163 
1164  /* Otherwise, can't match if haystack is shorter than needle */
1165  if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1166  return 0;
1167 
1168  text_position_setup(t1, t2, collid, &state);
1169  if (!text_position_next(&state))
1170  result = 0;
1171  else
1174  return result;
1175 }
1176 
1177 
1178 /*
1179  * text_position_setup, text_position_next, text_position_cleanup -
1180  * Component steps of text_position()
1181  *
1182  * These are broken out so that a string can be efficiently searched for
1183  * multiple occurrences of the same pattern. text_position_next may be
1184  * called multiple times, and it advances to the next match on each call.
1185  * text_position_get_match_ptr() and text_position_get_match_pos() return
1186  * a pointer or 1-based character position of the last match, respectively.
1187  *
1188  * The "state" variable is normally just a local variable in the caller.
1189  *
1190  * NOTE: text_position_next skips over the matched portion. For example,
1191  * searching for "xx" in "xxx" returns only one match, not two.
1192  */
1193 
1194 static void
1196 {
1197  int len1 = VARSIZE_ANY_EXHDR(t1);
1198  int len2 = VARSIZE_ANY_EXHDR(t2);
1199  pg_locale_t mylocale = 0;
1200 
1201  check_collation_set(collid);
1202 
1203  if (!lc_collate_is_c(collid))
1204  mylocale = pg_newlocale_from_collation(collid);
1205 
1206  if (mylocale && !mylocale->deterministic)
1207  ereport(ERROR,
1208  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1209  errmsg("nondeterministic collations are not supported for substring searches")));
1210 
1211  Assert(len1 > 0);
1212  Assert(len2 > 0);
1213 
1214  /*
1215  * Even with a multi-byte encoding, we perform the search using the raw
1216  * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1217  * because in UTF-8 the byte sequence of one character cannot contain
1218  * another character. For other multi-byte encodings, we do the search
1219  * initially as a simple byte search, ignoring multibyte issues, but
1220  * verify afterwards that the match we found is at a character boundary,
1221  * and continue the search if it was a false match.
1222  */
1224  state->is_multibyte_char_in_char = false;
1225  else if (GetDatabaseEncoding() == PG_UTF8)
1226  state->is_multibyte_char_in_char = false;
1227  else
1228  state->is_multibyte_char_in_char = true;
1229 
1230  state->str1 = VARDATA_ANY(t1);
1231  state->str2 = VARDATA_ANY(t2);
1232  state->len1 = len1;
1233  state->len2 = len2;
1234  state->last_match = NULL;
1235  state->refpoint = state->str1;
1236  state->refpos = 0;
1237 
1238  /*
1239  * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1240  * notes we use the terminology that the "haystack" is the string to be
1241  * searched (t1) and the "needle" is the pattern being sought (t2).
1242  *
1243  * If the needle is empty or bigger than the haystack then there is no
1244  * point in wasting cycles initializing the table. We also choose not to
1245  * use B-M-H for needles of length 1, since the skip table can't possibly
1246  * save anything in that case.
1247  */
1248  if (len1 >= len2 && len2 > 1)
1249  {
1250  int searchlength = len1 - len2;
1251  int skiptablemask;
1252  int last;
1253  int i;
1254  const char *str2 = state->str2;
1255 
1256  /*
1257  * First we must determine how much of the skip table to use. The
1258  * declaration of TextPositionState allows up to 256 elements, but for
1259  * short search problems we don't really want to have to initialize so
1260  * many elements --- it would take too long in comparison to the
1261  * actual search time. So we choose a useful skip table size based on
1262  * the haystack length minus the needle length. The closer the needle
1263  * length is to the haystack length the less useful skipping becomes.
1264  *
1265  * Note: since we use bit-masking to select table elements, the skip
1266  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1267  */
1268  if (searchlength < 16)
1269  skiptablemask = 3;
1270  else if (searchlength < 64)
1271  skiptablemask = 7;
1272  else if (searchlength < 128)
1273  skiptablemask = 15;
1274  else if (searchlength < 512)
1275  skiptablemask = 31;
1276  else if (searchlength < 2048)
1277  skiptablemask = 63;
1278  else if (searchlength < 4096)
1279  skiptablemask = 127;
1280  else
1281  skiptablemask = 255;
1282  state->skiptablemask = skiptablemask;
1283 
1284  /*
1285  * Initialize the skip table. We set all elements to the needle
1286  * length, since this is the correct skip distance for any character
1287  * not found in the needle.
1288  */
1289  for (i = 0; i <= skiptablemask; i++)
1290  state->skiptable[i] = len2;
1291 
1292  /*
1293  * Now examine the needle. For each character except the last one,
1294  * set the corresponding table element to the appropriate skip
1295  * distance. Note that when two characters share the same skip table
1296  * entry, the one later in the needle must determine the skip
1297  * distance.
1298  */
1299  last = len2 - 1;
1300 
1301  for (i = 0; i < last; i++)
1302  state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1303  }
1304 }
1305 
1306 /*
1307  * Advance to the next match, starting from the end of the previous match
1308  * (or the beginning of the string, on first call). Returns true if a match
1309  * is found.
1310  *
1311  * Note that this refuses to match an empty-string needle. Most callers
1312  * will have handled that case specially and we'll never see it here.
1313  */
1314 static bool
1316 {
1317  int needle_len = state->len2;
1318  char *start_ptr;
1319  char *matchptr;
1320 
1321  if (needle_len <= 0)
1322  return false; /* result for empty pattern */
1323 
1324  /* Start from the point right after the previous match. */
1325  if (state->last_match)
1326  start_ptr = state->last_match + needle_len;
1327  else
1328  start_ptr = state->str1;
1329 
1330 retry:
1331  matchptr = text_position_next_internal(start_ptr, state);
1332 
1333  if (!matchptr)
1334  return false;
1335 
1336  /*
1337  * Found a match for the byte sequence. If this is a multibyte encoding,
1338  * where one character's byte sequence can appear inside a longer
1339  * multi-byte character, we need to verify that the match was at a
1340  * character boundary, not in the middle of a multi-byte character.
1341  */
1342  if (state->is_multibyte_char_in_char)
1343  {
1344  /* Walk one character at a time, until we reach the match. */
1345 
1346  /* the search should never move backwards. */
1347  Assert(state->refpoint <= matchptr);
1348 
1349  while (state->refpoint < matchptr)
1350  {
1351  /* step to next character. */
1352  state->refpoint += pg_mblen(state->refpoint);
1353  state->refpos++;
1354 
1355  /*
1356  * If we stepped over the match's start position, then it was a
1357  * false positive, where the byte sequence appeared in the middle
1358  * of a multi-byte character. Skip it, and continue the search at
1359  * the next character boundary.
1360  */
1361  if (state->refpoint > matchptr)
1362  {
1363  start_ptr = state->refpoint;
1364  goto retry;
1365  }
1366  }
1367  }
1368 
1369  state->last_match = matchptr;
1370  return true;
1371 }
1372 
1373 /*
1374  * Subroutine of text_position_next(). This searches for the raw byte
1375  * sequence, ignoring any multi-byte encoding issues. Returns the first
1376  * match starting at 'start_ptr', or NULL if no match is found.
1377  */
1378 static char *
1380 {
1381  int haystack_len = state->len1;
1382  int needle_len = state->len2;
1383  int skiptablemask = state->skiptablemask;
1384  const char *haystack = state->str1;
1385  const char *needle = state->str2;
1386  const char *haystack_end = &haystack[haystack_len];
1387  const char *hptr;
1388 
1389  Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1390 
1391  if (needle_len == 1)
1392  {
1393  /* No point in using B-M-H for a one-character needle */
1394  char nchar = *needle;
1395 
1396  hptr = start_ptr;
1397  while (hptr < haystack_end)
1398  {
1399  if (*hptr == nchar)
1400  return (char *) hptr;
1401  hptr++;
1402  }
1403  }
1404  else
1405  {
1406  const char *needle_last = &needle[needle_len - 1];
1407 
1408  /* Start at startpos plus the length of the needle */
1409  hptr = start_ptr + needle_len - 1;
1410  while (hptr < haystack_end)
1411  {
1412  /* Match the needle scanning *backward* */
1413  const char *nptr;
1414  const char *p;
1415 
1416  nptr = needle_last;
1417  p = hptr;
1418  while (*nptr == *p)
1419  {
1420  /* Matched it all? If so, return 1-based position */
1421  if (nptr == needle)
1422  return (char *) p;
1423  nptr--, p--;
1424  }
1425 
1426  /*
1427  * No match, so use the haystack char at hptr to decide how far to
1428  * advance. If the needle had any occurrence of that character
1429  * (or more precisely, one sharing the same skiptable entry)
1430  * before its last character, then we advance far enough to align
1431  * the last such needle character with that haystack position.
1432  * Otherwise we can advance by the whole needle length.
1433  */
1434  hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1435  }
1436  }
1437 
1438  return 0; /* not found */
1439 }
1440 
1441 /*
1442  * Return a pointer to the current match.
1443  *
1444  * The returned pointer points into the original haystack string.
1445  */
1446 static char *
1448 {
1449  return state->last_match;
1450 }
1451 
1452 /*
1453  * Return the offset of the current match.
1454  *
1455  * The offset is in characters, 1-based.
1456  */
1457 static int
1459 {
1460  /* Convert the byte position to char position. */
1461  state->refpos += pg_mbstrlen_with_len(state->refpoint,
1462  state->last_match - state->refpoint);
1463  state->refpoint = state->last_match;
1464  return state->refpos + 1;
1465 }
1466 
1467 /*
1468  * Reset search state to the initial state installed by text_position_setup.
1469  *
1470  * The next call to text_position_next will search from the beginning
1471  * of the string.
1472  */
1473 static void
1475 {
1476  state->last_match = NULL;
1477  state->refpoint = state->str1;
1478  state->refpos = 0;
1479 }
1480 
1481 static void
1483 {
1484  /* no cleanup needed */
1485 }
1486 
1487 
1488 static void
1490 {
1491  if (!OidIsValid(collid))
1492  {
1493  /*
1494  * This typically means that the parser could not resolve a conflict
1495  * of implicit collations, so report it that way.
1496  */
1497  ereport(ERROR,
1498  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1499  errmsg("could not determine which collation to use for string comparison"),
1500  errhint("Use the COLLATE clause to set the collation explicitly.")));
1501  }
1502 }
1503 
1504 /* varstr_cmp()
1505  * Comparison function for text strings with given lengths.
1506  * Includes locale support, but must copy strings to temporary memory
1507  * to allow null-termination for inputs to strcoll().
1508  * Returns an integer less than, equal to, or greater than zero, indicating
1509  * whether arg1 is less than, equal to, or greater than arg2.
1510  *
1511  * Note: many functions that depend on this are marked leakproof; therefore,
1512  * avoid reporting the actual contents of the input when throwing errors.
1513  * All errors herein should be things that can't happen except on corrupt
1514  * data, anyway; otherwise we will have trouble with indexing strings that
1515  * would cause them.
1516  */
1517 int
1518 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1519 {
1520  int result;
1521 
1522  check_collation_set(collid);
1523 
1524  /*
1525  * Unfortunately, there is no strncoll(), so in the non-C locale case we
1526  * have to do some memory copying. This turns out to be significantly
1527  * slower, so we optimize the case where LC_COLLATE is C. We also try to
1528  * optimize relatively-short strings by avoiding palloc/pfree overhead.
1529  */
1530  if (lc_collate_is_c(collid))
1531  {
1532  result = memcmp(arg1, arg2, Min(len1, len2));
1533  if ((result == 0) && (len1 != len2))
1534  result = (len1 < len2) ? -1 : 1;
1535  }
1536  else
1537  {
1538  char a1buf[TEXTBUFLEN];
1539  char a2buf[TEXTBUFLEN];
1540  char *a1p,
1541  *a2p;
1542  pg_locale_t mylocale;
1543 
1544  mylocale = pg_newlocale_from_collation(collid);
1545 
1546  /*
1547  * memcmp() can't tell us which of two unequal strings sorts first,
1548  * but it's a cheap way to tell if they're equal. Testing shows that
1549  * memcmp() followed by strcoll() is only trivially slower than
1550  * strcoll() by itself, so we don't lose much if this doesn't work out
1551  * very often, and if it does - for example, because there are many
1552  * equal strings in the input - then we win big by avoiding expensive
1553  * collation-aware comparisons.
1554  */
1555  if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1556  return 0;
1557 
1558 #ifdef WIN32
1559  /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1560  if (GetDatabaseEncoding() == PG_UTF8
1561  && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
1562  {
1563  int a1len;
1564  int a2len;
1565  int r;
1566 
1567  if (len1 >= TEXTBUFLEN / 2)
1568  {
1569  a1len = len1 * 2 + 2;
1570  a1p = palloc(a1len);
1571  }
1572  else
1573  {
1574  a1len = TEXTBUFLEN;
1575  a1p = a1buf;
1576  }
1577  if (len2 >= TEXTBUFLEN / 2)
1578  {
1579  a2len = len2 * 2 + 2;
1580  a2p = palloc(a2len);
1581  }
1582  else
1583  {
1584  a2len = TEXTBUFLEN;
1585  a2p = a2buf;
1586  }
1587 
1588  /* stupid Microsloth API does not work for zero-length input */
1589  if (len1 == 0)
1590  r = 0;
1591  else
1592  {
1593  r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1594  (LPWSTR) a1p, a1len / 2);
1595  if (!r)
1596  ereport(ERROR,
1597  (errmsg("could not convert string to UTF-16: error code %lu",
1598  GetLastError())));
1599  }
1600  ((LPWSTR) a1p)[r] = 0;
1601 
1602  if (len2 == 0)
1603  r = 0;
1604  else
1605  {
1606  r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1607  (LPWSTR) a2p, a2len / 2);
1608  if (!r)
1609  ereport(ERROR,
1610  (errmsg("could not convert string to UTF-16: error code %lu",
1611  GetLastError())));
1612  }
1613  ((LPWSTR) a2p)[r] = 0;
1614 
1615  errno = 0;
1616 #ifdef HAVE_LOCALE_T
1617  if (mylocale)
1618  result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
1619  else
1620 #endif
1621  result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1622  if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1623  * headers */
1624  ereport(ERROR,
1625  (errmsg("could not compare Unicode strings: %m")));
1626 
1627  /* Break tie if necessary. */
1628  if (result == 0 &&
1629  (!mylocale || mylocale->deterministic))
1630  {
1631  result = memcmp(arg1, arg2, Min(len1, len2));
1632  if ((result == 0) && (len1 != len2))
1633  result = (len1 < len2) ? -1 : 1;
1634  }
1635 
1636  if (a1p != a1buf)
1637  pfree(a1p);
1638  if (a2p != a2buf)
1639  pfree(a2p);
1640 
1641  return result;
1642  }
1643 #endif /* WIN32 */
1644 
1645  if (len1 >= TEXTBUFLEN)
1646  a1p = (char *) palloc(len1 + 1);
1647  else
1648  a1p = a1buf;
1649  if (len2 >= TEXTBUFLEN)
1650  a2p = (char *) palloc(len2 + 1);
1651  else
1652  a2p = a2buf;
1653 
1654  memcpy(a1p, arg1, len1);
1655  a1p[len1] = '\0';
1656  memcpy(a2p, arg2, len2);
1657  a2p[len2] = '\0';
1658 
1659  if (mylocale)
1660  {
1661  if (mylocale->provider == COLLPROVIDER_ICU)
1662  {
1663 #ifdef USE_ICU
1664 #ifdef HAVE_UCOL_STRCOLLUTF8
1665  if (GetDatabaseEncoding() == PG_UTF8)
1666  {
1667  UErrorCode status;
1668 
1669  status = U_ZERO_ERROR;
1670  result = ucol_strcollUTF8(mylocale->info.icu.ucol,
1671  arg1, len1,
1672  arg2, len2,
1673  &status);
1674  if (U_FAILURE(status))
1675  ereport(ERROR,
1676  (errmsg("collation failed: %s", u_errorName(status))));
1677  }
1678  else
1679 #endif
1680  {
1681  int32_t ulen1,
1682  ulen2;
1683  UChar *uchar1,
1684  *uchar2;
1685 
1686  ulen1 = icu_to_uchar(&uchar1, arg1, len1);
1687  ulen2 = icu_to_uchar(&uchar2, arg2, len2);
1688 
1689  result = ucol_strcoll(mylocale->info.icu.ucol,
1690  uchar1, ulen1,
1691  uchar2, ulen2);
1692 
1693  pfree(uchar1);
1694  pfree(uchar2);
1695  }
1696 #else /* not USE_ICU */
1697  /* shouldn't happen */
1698  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1699 #endif /* not USE_ICU */
1700  }
1701  else
1702  {
1703 #ifdef HAVE_LOCALE_T
1704  result = strcoll_l(a1p, a2p, mylocale->info.lt);
1705 #else
1706  /* shouldn't happen */
1707  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1708 #endif
1709  }
1710  }
1711  else
1712  result = strcoll(a1p, a2p);
1713 
1714  /* Break tie if necessary. */
1715  if (result == 0 &&
1716  (!mylocale || mylocale->deterministic))
1717  result = strcmp(a1p, a2p);
1718 
1719  if (a1p != a1buf)
1720  pfree(a1p);
1721  if (a2p != a2buf)
1722  pfree(a2p);
1723  }
1724 
1725  return result;
1726 }
1727 
1728 /* text_cmp()
1729  * Internal comparison function for text strings.
1730  * Returns -1, 0 or 1
1731  */
1732 static int
1733 text_cmp(text *arg1, text *arg2, Oid collid)
1734 {
1735  char *a1p,
1736  *a2p;
1737  int len1,
1738  len2;
1739 
1740  a1p = VARDATA_ANY(arg1);
1741  a2p = VARDATA_ANY(arg2);
1742 
1743  len1 = VARSIZE_ANY_EXHDR(arg1);
1744  len2 = VARSIZE_ANY_EXHDR(arg2);
1745 
1746  return varstr_cmp(a1p, len1, a2p, len2, collid);
1747 }
1748 
1749 /*
1750  * Comparison functions for text strings.
1751  *
1752  * Note: btree indexes need these routines not to leak memory; therefore,
1753  * be careful to free working copies of toasted datums. Most places don't
1754  * need to be so careful.
1755  */
1756 
1757 Datum
1759 {
1760  Oid collid = PG_GET_COLLATION();
1761  bool locale_is_c = false;
1762  pg_locale_t mylocale = 0;
1763  bool result;
1764 
1765  check_collation_set(collid);
1766 
1767  if (lc_collate_is_c(collid))
1768  locale_is_c = true;
1769  else
1770  mylocale = pg_newlocale_from_collation(collid);
1771 
1772  if (locale_is_c || !mylocale || mylocale->deterministic)
1773  {
1774  Datum arg1 = PG_GETARG_DATUM(0);
1775  Datum arg2 = PG_GETARG_DATUM(1);
1776  Size len1,
1777  len2;
1778 
1779  /*
1780  * Since we only care about equality or not-equality, we can avoid all
1781  * the expense of strcoll() here, and just do bitwise comparison. In
1782  * fact, we don't even have to do a bitwise comparison if we can show
1783  * the lengths of the strings are unequal; which might save us from
1784  * having to detoast one or both values.
1785  */
1786  len1 = toast_raw_datum_size(arg1);
1787  len2 = toast_raw_datum_size(arg2);
1788  if (len1 != len2)
1789  result = false;
1790  else
1791  {
1792  text *targ1 = DatumGetTextPP(arg1);
1793  text *targ2 = DatumGetTextPP(arg2);
1794 
1795  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1796  len1 - VARHDRSZ) == 0);
1797 
1798  PG_FREE_IF_COPY(targ1, 0);
1799  PG_FREE_IF_COPY(targ2, 1);
1800  }
1801  }
1802  else
1803  {
1804  text *arg1 = PG_GETARG_TEXT_PP(0);
1805  text *arg2 = PG_GETARG_TEXT_PP(1);
1806 
1807  result = (text_cmp(arg1, arg2, collid) == 0);
1808 
1809  PG_FREE_IF_COPY(arg1, 0);
1810  PG_FREE_IF_COPY(arg2, 1);
1811  }
1812 
1813  PG_RETURN_BOOL(result);
1814 }
1815 
1816 Datum
1818 {
1819  Oid collid = PG_GET_COLLATION();
1820  bool locale_is_c = false;
1821  pg_locale_t mylocale = 0;
1822  bool result;
1823 
1824  check_collation_set(collid);
1825 
1826  if (lc_collate_is_c(collid))
1827  locale_is_c = true;
1828  else
1829  mylocale = pg_newlocale_from_collation(collid);
1830 
1831  if (locale_is_c || !mylocale || mylocale->deterministic)
1832  {
1833  Datum arg1 = PG_GETARG_DATUM(0);
1834  Datum arg2 = PG_GETARG_DATUM(1);
1835  Size len1,
1836  len2;
1837 
1838  /* See comment in texteq() */
1839  len1 = toast_raw_datum_size(arg1);
1840  len2 = toast_raw_datum_size(arg2);
1841  if (len1 != len2)
1842  result = true;
1843  else
1844  {
1845  text *targ1 = DatumGetTextPP(arg1);
1846  text *targ2 = DatumGetTextPP(arg2);
1847 
1848  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1849  len1 - VARHDRSZ) != 0);
1850 
1851  PG_FREE_IF_COPY(targ1, 0);
1852  PG_FREE_IF_COPY(targ2, 1);
1853  }
1854  }
1855  else
1856  {
1857  text *arg1 = PG_GETARG_TEXT_PP(0);
1858  text *arg2 = PG_GETARG_TEXT_PP(1);
1859 
1860  result = (text_cmp(arg1, arg2, collid) != 0);
1861 
1862  PG_FREE_IF_COPY(arg1, 0);
1863  PG_FREE_IF_COPY(arg2, 1);
1864  }
1865 
1866  PG_RETURN_BOOL(result);
1867 }
1868 
1869 Datum
1871 {
1872  text *arg1 = PG_GETARG_TEXT_PP(0);
1873  text *arg2 = PG_GETARG_TEXT_PP(1);
1874  bool result;
1875 
1876  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1877 
1878  PG_FREE_IF_COPY(arg1, 0);
1879  PG_FREE_IF_COPY(arg2, 1);
1880 
1881  PG_RETURN_BOOL(result);
1882 }
1883 
1884 Datum
1886 {
1887  text *arg1 = PG_GETARG_TEXT_PP(0);
1888  text *arg2 = PG_GETARG_TEXT_PP(1);
1889  bool result;
1890 
1891  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1892 
1893  PG_FREE_IF_COPY(arg1, 0);
1894  PG_FREE_IF_COPY(arg2, 1);
1895 
1896  PG_RETURN_BOOL(result);
1897 }
1898 
1899 Datum
1901 {
1902  text *arg1 = PG_GETARG_TEXT_PP(0);
1903  text *arg2 = PG_GETARG_TEXT_PP(1);
1904  bool result;
1905 
1906  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1907 
1908  PG_FREE_IF_COPY(arg1, 0);
1909  PG_FREE_IF_COPY(arg2, 1);
1910 
1911  PG_RETURN_BOOL(result);
1912 }
1913 
1914 Datum
1916 {
1917  text *arg1 = PG_GETARG_TEXT_PP(0);
1918  text *arg2 = PG_GETARG_TEXT_PP(1);
1919  bool result;
1920 
1921  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1922 
1923  PG_FREE_IF_COPY(arg1, 0);
1924  PG_FREE_IF_COPY(arg2, 1);
1925 
1926  PG_RETURN_BOOL(result);
1927 }
1928 
1929 Datum
1931 {
1932  Datum arg1 = PG_GETARG_DATUM(0);
1933  Datum arg2 = PG_GETARG_DATUM(1);
1934  Oid collid = PG_GET_COLLATION();
1935  pg_locale_t mylocale = 0;
1936  bool result;
1937  Size len1,
1938  len2;
1939 
1940  check_collation_set(collid);
1941 
1942  if (!lc_collate_is_c(collid))
1943  mylocale = pg_newlocale_from_collation(collid);
1944 
1945  if (mylocale && !mylocale->deterministic)
1946  ereport(ERROR,
1947  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1948  errmsg("nondeterministic collations are not supported for substring searches")));
1949 
1950  len1 = toast_raw_datum_size(arg1);
1951  len2 = toast_raw_datum_size(arg2);
1952  if (len2 > len1)
1953  result = false;
1954  else
1955  {
1956  text *targ1 = text_substring(arg1, 1, len2, false);
1957  text *targ2 = DatumGetTextPP(arg2);
1958 
1959  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1960  VARSIZE_ANY_EXHDR(targ2)) == 0);
1961 
1962  PG_FREE_IF_COPY(targ1, 0);
1963  PG_FREE_IF_COPY(targ2, 1);
1964  }
1965 
1966  PG_RETURN_BOOL(result);
1967 }
1968 
1969 Datum
1971 {
1972  text *arg1 = PG_GETARG_TEXT_PP(0);
1973  text *arg2 = PG_GETARG_TEXT_PP(1);
1974  int32 result;
1975 
1976  result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1977 
1978  PG_FREE_IF_COPY(arg1, 0);
1979  PG_FREE_IF_COPY(arg2, 1);
1980 
1981  PG_RETURN_INT32(result);
1982 }
1983 
1984 Datum
1986 {
1988  Oid collid = ssup->ssup_collation;
1989  MemoryContext oldcontext;
1990 
1991  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1992 
1993  /* Use generic string SortSupport */
1994  varstr_sortsupport(ssup, TEXTOID, collid);
1995 
1996  MemoryContextSwitchTo(oldcontext);
1997 
1998  PG_RETURN_VOID();
1999 }
2000 
2001 /*
2002  * Generic sortsupport interface for character type's operator classes.
2003  * Includes locale support, and support for BpChar semantics (i.e. removing
2004  * trailing spaces before comparison).
2005  *
2006  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
2007  * same representation. Callers that always use the C collation (e.g.
2008  * non-collatable type callers like bytea) may have NUL bytes in their strings;
2009  * this will not work with any other collation, though.
2010  */
2011 void
2013 {
2014  bool abbreviate = ssup->abbreviate;
2015  bool collate_c = false;
2016  VarStringSortSupport *sss;
2017  pg_locale_t locale = 0;
2018 
2019  check_collation_set(collid);
2020 
2021  /*
2022  * If possible, set ssup->comparator to a function which can be used to
2023  * directly compare two datums. If we can do this, we'll avoid the
2024  * overhead of a trip through the fmgr layer for every comparison, which
2025  * can be substantial.
2026  *
2027  * Most typically, we'll set the comparator to varlenafastcmp_locale,
2028  * which uses strcoll() to perform comparisons. We use that for the
2029  * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2030  * LC_COLLATE = C, we can make things quite a bit faster with
2031  * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2032  * memcmp() rather than strcoll().
2033  */
2034  if (lc_collate_is_c(collid))
2035  {
2036  if (typid == BPCHAROID)
2037  ssup->comparator = bpcharfastcmp_c;
2038  else if (typid == NAMEOID)
2039  {
2040  ssup->comparator = namefastcmp_c;
2041  /* Not supporting abbreviation with type NAME, for now */
2042  abbreviate = false;
2043  }
2044  else
2045  ssup->comparator = varstrfastcmp_c;
2046 
2047  collate_c = true;
2048  }
2049  else
2050  {
2051  /*
2052  * We need a collation-sensitive comparison. To make things faster,
2053  * we'll figure out the collation based on the locale id and cache the
2054  * result.
2055  */
2057 
2058  /*
2059  * There is a further exception on Windows. When the database
2060  * encoding is UTF-8 and we are not using the C collation, complex
2061  * hacks are required. We don't currently have a comparator that
2062  * handles that case, so we fall back on the slow method of having the
2063  * sort code invoke bttextcmp() (in the case of text) via the fmgr
2064  * trampoline. ICU locales work just the same on Windows, however.
2065  */
2066 #ifdef WIN32
2067  if (GetDatabaseEncoding() == PG_UTF8 &&
2068  !(locale && locale->provider == COLLPROVIDER_ICU))
2069  return;
2070 #endif
2071 
2072  /*
2073  * We use varlenafastcmp_locale except for type NAME.
2074  */
2075  if (typid == NAMEOID)
2076  {
2078  /* Not supporting abbreviation with type NAME, for now */
2079  abbreviate = false;
2080  }
2081  else
2083  }
2084 
2085  /*
2086  * Unfortunately, it seems that abbreviation for non-C collations is
2087  * broken on many common platforms; testing of multiple versions of glibc
2088  * reveals that, for many locales, strcoll() and strxfrm() do not return
2089  * consistent results, which is fatal to this optimization. While no
2090  * other libc other than Cygwin has so far been shown to have a problem,
2091  * we take the conservative course of action for right now and disable
2092  * this categorically. (Users who are certain this isn't a problem on
2093  * their system can define TRUST_STRXFRM.)
2094  *
2095  * Even apart from the risk of broken locales, it's possible that there
2096  * are platforms where the use of abbreviated keys should be disabled at
2097  * compile time. Having only 4 byte datums could make worst-case
2098  * performance drastically more likely, for example. Moreover, macOS's
2099  * strxfrm() implementation is known to not effectively concentrate a
2100  * significant amount of entropy from the original string in earlier
2101  * transformed blobs. It's possible that other supported platforms are
2102  * similarly encumbered. So, if we ever get past disabling this
2103  * categorically, we may still want or need to disable it for particular
2104  * platforms.
2105  */
2106 #ifndef TRUST_STRXFRM
2107  if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2108  abbreviate = false;
2109 #endif
2110 
2111  /*
2112  * If we're using abbreviated keys, or if we're using a locale-aware
2113  * comparison, we need to initialize a VarStringSortSupport object. Both
2114  * cases will make use of the temporary buffers we initialize here for
2115  * scratch space (and to detect requirement for BpChar semantics from
2116  * caller), and the abbreviation case requires additional state.
2117  */
2118  if (abbreviate || !collate_c)
2119  {
2120  sss = palloc(sizeof(VarStringSortSupport));
2121  sss->buf1 = palloc(TEXTBUFLEN);
2122  sss->buflen1 = TEXTBUFLEN;
2123  sss->buf2 = palloc(TEXTBUFLEN);
2124  sss->buflen2 = TEXTBUFLEN;
2125  /* Start with invalid values */
2126  sss->last_len1 = -1;
2127  sss->last_len2 = -1;
2128  /* Initialize */
2129  sss->last_returned = 0;
2130  sss->locale = locale;
2131 
2132  /*
2133  * To avoid somehow confusing a strxfrm() blob and an original string,
2134  * constantly keep track of the variety of data that buf1 and buf2
2135  * currently contain.
2136  *
2137  * Comparisons may be interleaved with conversion calls. Frequently,
2138  * conversions and comparisons are batched into two distinct phases,
2139  * but the correctness of caching cannot hinge upon this. For
2140  * comparison caching, buffer state is only trusted if cache_blob is
2141  * found set to false, whereas strxfrm() caching only trusts the state
2142  * when cache_blob is found set to true.
2143  *
2144  * Arbitrarily initialize cache_blob to true.
2145  */
2146  sss->cache_blob = true;
2147  sss->collate_c = collate_c;
2148  sss->typid = typid;
2149  ssup->ssup_extra = sss;
2150 
2151  /*
2152  * If possible, plan to use the abbreviated keys optimization. The
2153  * core code may switch back to authoritative comparator should
2154  * abbreviation be aborted.
2155  */
2156  if (abbreviate)
2157  {
2158  sss->prop_card = 0.20;
2159  initHyperLogLog(&sss->abbr_card, 10);
2160  initHyperLogLog(&sss->full_card, 10);
2161  ssup->abbrev_full_comparator = ssup->comparator;
2165  }
2166  }
2167 }
2168 
2169 /*
2170  * sortsupport comparison func (for C locale case)
2171  */
2172 static int
2174 {
2175  VarString *arg1 = DatumGetVarStringPP(x);
2176  VarString *arg2 = DatumGetVarStringPP(y);
2177  char *a1p,
2178  *a2p;
2179  int len1,
2180  len2,
2181  result;
2182 
2183  a1p = VARDATA_ANY(arg1);
2184  a2p = VARDATA_ANY(arg2);
2185 
2186  len1 = VARSIZE_ANY_EXHDR(arg1);
2187  len2 = VARSIZE_ANY_EXHDR(arg2);
2188 
2189  result = memcmp(a1p, a2p, Min(len1, len2));
2190  if ((result == 0) && (len1 != len2))
2191  result = (len1 < len2) ? -1 : 1;
2192 
2193  /* We can't afford to leak memory here. */
2194  if (PointerGetDatum(arg1) != x)
2195  pfree(arg1);
2196  if (PointerGetDatum(arg2) != y)
2197  pfree(arg2);
2198 
2199  return result;
2200 }
2201 
2202 /*
2203  * sortsupport comparison func (for BpChar C locale case)
2204  *
2205  * BpChar outsources its sortsupport to this module. Specialization for the
2206  * varstr_sortsupport BpChar case, modeled on
2207  * internal_bpchar_pattern_compare().
2208  */
2209 static int
2211 {
2212  BpChar *arg1 = DatumGetBpCharPP(x);
2213  BpChar *arg2 = DatumGetBpCharPP(y);
2214  char *a1p,
2215  *a2p;
2216  int len1,
2217  len2,
2218  result;
2219 
2220  a1p = VARDATA_ANY(arg1);
2221  a2p = VARDATA_ANY(arg2);
2222 
2223  len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2224  len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2225 
2226  result = memcmp(a1p, a2p, Min(len1, len2));
2227  if ((result == 0) && (len1 != len2))
2228  result = (len1 < len2) ? -1 : 1;
2229 
2230  /* We can't afford to leak memory here. */
2231  if (PointerGetDatum(arg1) != x)
2232  pfree(arg1);
2233  if (PointerGetDatum(arg2) != y)
2234  pfree(arg2);
2235 
2236  return result;
2237 }
2238 
2239 /*
2240  * sortsupport comparison func (for NAME C locale case)
2241  */
2242 static int
2244 {
2245  Name arg1 = DatumGetName(x);
2246  Name arg2 = DatumGetName(y);
2247 
2248  return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2249 }
2250 
2251 /*
2252  * sortsupport comparison func (for locale case with all varlena types)
2253  */
2254 static int
2256 {
2257  VarString *arg1 = DatumGetVarStringPP(x);
2258  VarString *arg2 = DatumGetVarStringPP(y);
2259  char *a1p,
2260  *a2p;
2261  int len1,
2262  len2,
2263  result;
2264 
2265  a1p = VARDATA_ANY(arg1);
2266  a2p = VARDATA_ANY(arg2);
2267 
2268  len1 = VARSIZE_ANY_EXHDR(arg1);
2269  len2 = VARSIZE_ANY_EXHDR(arg2);
2270 
2271  result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2272 
2273  /* We can't afford to leak memory here. */
2274  if (PointerGetDatum(arg1) != x)
2275  pfree(arg1);
2276  if (PointerGetDatum(arg2) != y)
2277  pfree(arg2);
2278 
2279  return result;
2280 }
2281 
2282 /*
2283  * sortsupport comparison func (for locale case with NAME type)
2284  */
2285 static int
2287 {
2288  Name arg1 = DatumGetName(x);
2289  Name arg2 = DatumGetName(y);
2290 
2291  return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2292  NameStr(*arg2), strlen(NameStr(*arg2)),
2293  ssup);
2294 }
2295 
2296 /*
2297  * sortsupport comparison func for locale cases
2298  */
2299 static int
2300 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2301 {
2303  int result;
2304  bool arg1_match;
2305 
2306  /* Fast pre-check for equality, as discussed in varstr_cmp() */
2307  if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2308  {
2309  /*
2310  * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2311  * last_len2. Existing contents of buffers might still be used by
2312  * next call.
2313  *
2314  * It's fine to allow the comparison of BpChar padding bytes here,
2315  * even though that implies that the memcmp() will usually be
2316  * performed for BpChar callers (though multibyte characters could
2317  * still prevent that from occurring). The memcmp() is still very
2318  * cheap, and BpChar's funny semantics have us remove trailing spaces
2319  * (not limited to padding), so we need make no distinction between
2320  * padding space characters and "real" space characters.
2321  */
2322  return 0;
2323  }
2324 
2325  if (sss->typid == BPCHAROID)
2326  {
2327  /* Get true number of bytes, ignoring trailing spaces */
2328  len1 = bpchartruelen(a1p, len1);
2329  len2 = bpchartruelen(a2p, len2);
2330  }
2331 
2332  if (len1 >= sss->buflen1)
2333  {
2334  sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2335  sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2336  }
2337  if (len2 >= sss->buflen2)
2338  {
2339  sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2340  sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2341  }
2342 
2343  /*
2344  * We're likely to be asked to compare the same strings repeatedly, and
2345  * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2346  * comparisons, even though in general there is no reason to think that
2347  * that will work out (every string datum may be unique). Caching does
2348  * not slow things down measurably when it doesn't work out, and can speed
2349  * things up by rather a lot when it does. In part, this is because the
2350  * memcmp() compares data from cachelines that are needed in L1 cache even
2351  * when the last comparison's result cannot be reused.
2352  */
2353  arg1_match = true;
2354  if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2355  {
2356  arg1_match = false;
2357  memcpy(sss->buf1, a1p, len1);
2358  sss->buf1[len1] = '\0';
2359  sss->last_len1 = len1;
2360  }
2361 
2362  /*
2363  * If we're comparing the same two strings as last time, we can return the
2364  * same answer without calling strcoll() again. This is more likely than
2365  * it seems (at least with moderate to low cardinality sets), because
2366  * quicksort compares the same pivot against many values.
2367  */
2368  if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2369  {
2370  memcpy(sss->buf2, a2p, len2);
2371  sss->buf2[len2] = '\0';
2372  sss->last_len2 = len2;
2373  }
2374  else if (arg1_match && !sss->cache_blob)
2375  {
2376  /* Use result cached following last actual strcoll() call */
2377  return sss->last_returned;
2378  }
2379 
2380  if (sss->locale)
2381  {
2382  if (sss->locale->provider == COLLPROVIDER_ICU)
2383  {
2384 #ifdef USE_ICU
2385 #ifdef HAVE_UCOL_STRCOLLUTF8
2386  if (GetDatabaseEncoding() == PG_UTF8)
2387  {
2388  UErrorCode status;
2389 
2390  status = U_ZERO_ERROR;
2391  result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
2392  a1p, len1,
2393  a2p, len2,
2394  &status);
2395  if (U_FAILURE(status))
2396  ereport(ERROR,
2397  (errmsg("collation failed: %s", u_errorName(status))));
2398  }
2399  else
2400 #endif
2401  {
2402  int32_t ulen1,
2403  ulen2;
2404  UChar *uchar1,
2405  *uchar2;
2406 
2407  ulen1 = icu_to_uchar(&uchar1, a1p, len1);
2408  ulen2 = icu_to_uchar(&uchar2, a2p, len2);
2409 
2410  result = ucol_strcoll(sss->locale->info.icu.ucol,
2411  uchar1, ulen1,
2412  uchar2, ulen2);
2413 
2414  pfree(uchar1);
2415  pfree(uchar2);
2416  }
2417 #else /* not USE_ICU */
2418  /* shouldn't happen */
2419  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2420 #endif /* not USE_ICU */
2421  }
2422  else
2423  {
2424 #ifdef HAVE_LOCALE_T
2425  result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
2426 #else
2427  /* shouldn't happen */
2428  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2429 #endif
2430  }
2431  }
2432  else
2433  result = strcoll(sss->buf1, sss->buf2);
2434 
2435  /* Break tie if necessary. */
2436  if (result == 0 &&
2437  (!sss->locale || sss->locale->deterministic))
2438  result = strcmp(sss->buf1, sss->buf2);
2439 
2440  /* Cache result, perhaps saving an expensive strcoll() call next time */
2441  sss->cache_blob = false;
2442  sss->last_returned = result;
2443  return result;
2444 }
2445 
2446 /*
2447  * Conversion routine for sortsupport. Converts original to abbreviated key
2448  * representation. Our encoding strategy is simple -- pack the first 8 bytes
2449  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2450  * stored in reverse order), and treat it as an unsigned integer. When the "C"
2451  * locale is used, or in case of bytea, just memcpy() from original instead.
2452  */
2453 static Datum
2455 {
2457  VarString *authoritative = DatumGetVarStringPP(original);
2458  char *authoritative_data = VARDATA_ANY(authoritative);
2459 
2460  /* working state */
2461  Datum res;
2462  char *pres;
2463  int len;
2464  uint32 hash;
2465 
2466  pres = (char *) &res;
2467  /* memset(), so any non-overwritten bytes are NUL */
2468  memset(pres, 0, sizeof(Datum));
2469  len = VARSIZE_ANY_EXHDR(authoritative);
2470 
2471  /* Get number of bytes, ignoring trailing spaces */
2472  if (sss->typid == BPCHAROID)
2473  len = bpchartruelen(authoritative_data, len);
2474 
2475  /*
2476  * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2477  * abbreviate keys. The full comparator for the C locale is always
2478  * memcmp(). It would be incorrect to allow bytea callers (callers that
2479  * always force the C collation -- bytea isn't a collatable type, but this
2480  * approach is convenient) to use strxfrm(). This is because bytea
2481  * strings may contain NUL bytes. Besides, this should be faster, too.
2482  *
2483  * More generally, it's okay that bytea callers can have NUL bytes in
2484  * strings because abbreviated cmp need not make a distinction between
2485  * terminating NUL bytes, and NUL bytes representing actual NULs in the
2486  * authoritative representation. Hopefully a comparison at or past one
2487  * abbreviated key's terminating NUL byte will resolve the comparison
2488  * without consulting the authoritative representation; specifically, some
2489  * later non-NUL byte in the longer string can resolve the comparison
2490  * against a subsequent terminating NUL in the shorter string. There will
2491  * usually be what is effectively a "length-wise" resolution there and
2492  * then.
2493  *
2494  * If that doesn't work out -- if all bytes in the longer string
2495  * positioned at or past the offset of the smaller string's (first)
2496  * terminating NUL are actually representative of NUL bytes in the
2497  * authoritative binary string (perhaps with some *terminating* NUL bytes
2498  * towards the end of the longer string iff it happens to still be small)
2499  * -- then an authoritative tie-breaker will happen, and do the right
2500  * thing: explicitly consider string length.
2501  */
2502  if (sss->collate_c)
2503  memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2504  else
2505  {
2506  Size bsize;
2507 #ifdef USE_ICU
2508  int32_t ulen = -1;
2509  UChar *uchar = NULL;
2510 #endif
2511 
2512  /*
2513  * We're not using the C collation, so fall back on strxfrm or ICU
2514  * analogs.
2515  */
2516 
2517  /* By convention, we use buffer 1 to store and NUL-terminate */
2518  if (len >= sss->buflen1)
2519  {
2520  sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2521  sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2522  }
2523 
2524  /* Might be able to reuse strxfrm() blob from last call */
2525  if (sss->last_len1 == len && sss->cache_blob &&
2526  memcmp(sss->buf1, authoritative_data, len) == 0)
2527  {
2528  memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2529  /* No change affecting cardinality, so no hashing required */
2530  goto done;
2531  }
2532 
2533  memcpy(sss->buf1, authoritative_data, len);
2534 
2535  /*
2536  * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2537  * necessary for ICU, but doesn't hurt.
2538  */
2539  sss->buf1[len] = '\0';
2540  sss->last_len1 = len;
2541 
2542 #ifdef USE_ICU
2543  /* When using ICU and not UTF8, convert string to UChar. */
2544  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2546  ulen = icu_to_uchar(&uchar, sss->buf1, len);
2547 #endif
2548 
2549  /*
2550  * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2551  * and try again. Both of these functions have the result buffer
2552  * content undefined if the result did not fit, so we need to retry
2553  * until everything fits, even though we only need the first few bytes
2554  * in the end. When using ucol_nextSortKeyPart(), however, we only
2555  * ask for as many bytes as we actually need.
2556  */
2557  for (;;)
2558  {
2559 #ifdef USE_ICU
2560  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2561  {
2562  /*
2563  * When using UTF8, use the iteration interface so we only
2564  * need to produce as many bytes as we actually need.
2565  */
2566  if (GetDatabaseEncoding() == PG_UTF8)
2567  {
2568  UCharIterator iter;
2569  uint32_t state[2];
2570  UErrorCode status;
2571 
2572  uiter_setUTF8(&iter, sss->buf1, len);
2573  state[0] = state[1] = 0; /* won't need that again */
2574  status = U_ZERO_ERROR;
2575  bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2576  &iter,
2577  state,
2578  (uint8_t *) sss->buf2,
2579  Min(sizeof(Datum), sss->buflen2),
2580  &status);
2581  if (U_FAILURE(status))
2582  ereport(ERROR,
2583  (errmsg("sort key generation failed: %s",
2584  u_errorName(status))));
2585  }
2586  else
2587  bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2588  uchar, ulen,
2589  (uint8_t *) sss->buf2, sss->buflen2);
2590  }
2591  else
2592 #endif
2593 #ifdef HAVE_LOCALE_T
2594  if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2595  bsize = strxfrm_l(sss->buf2, sss->buf1,
2596  sss->buflen2, sss->locale->info.lt);
2597  else
2598 #endif
2599  bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2600 
2601  sss->last_len2 = bsize;
2602  if (bsize < sss->buflen2)
2603  break;
2604 
2605  /*
2606  * Grow buffer and retry.
2607  */
2608  sss->buflen2 = Max(bsize + 1,
2609  Min(sss->buflen2 * 2, MaxAllocSize));
2610  sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2611  }
2612 
2613  /*
2614  * Every Datum byte is always compared. This is safe because the
2615  * strxfrm() blob is itself NUL terminated, leaving no danger of
2616  * misinterpreting any NUL bytes not intended to be interpreted as
2617  * logically representing termination.
2618  *
2619  * (Actually, even if there were NUL bytes in the blob it would be
2620  * okay. See remarks on bytea case above.)
2621  */
2622  memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2623 
2624 #ifdef USE_ICU
2625  if (uchar)
2626  pfree(uchar);
2627 #endif
2628  }
2629 
2630  /*
2631  * Maintain approximate cardinality of both abbreviated keys and original,
2632  * authoritative keys using HyperLogLog. Used as cheap insurance against
2633  * the worst case, where we do many string transformations for no saving
2634  * in full strcoll()-based comparisons. These statistics are used by
2635  * varstr_abbrev_abort().
2636  *
2637  * First, Hash key proper, or a significant fraction of it. Mix in length
2638  * in order to compensate for cases where differences are past
2639  * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2640  */
2641  hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2643 
2644  if (len > PG_CACHE_LINE_SIZE)
2646 
2647  addHyperLogLog(&sss->full_card, hash);
2648 
2649  /* Hash abbreviated key */
2650 #if SIZEOF_DATUM == 8
2651  {
2652  uint32 lohalf,
2653  hihalf;
2654 
2655  lohalf = (uint32) res;
2656  hihalf = (uint32) (res >> 32);
2657  hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2658  }
2659 #else /* SIZEOF_DATUM != 8 */
2661 #endif
2662 
2663  addHyperLogLog(&sss->abbr_card, hash);
2664 
2665  /* Cache result, perhaps saving an expensive strxfrm() call next time */
2666  sss->cache_blob = true;
2667 done:
2668 
2669  /*
2670  * Byteswap on little-endian machines.
2671  *
2672  * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
2673  * 3-way comparator) works correctly on all platforms. If we didn't do
2674  * this, the comparator would have to call memcmp() with a pair of
2675  * pointers to the first byte of each abbreviated key, which is slower.
2676  */
2677  res = DatumBigEndianToNative(res);
2678 
2679  /* Don't leak memory here */
2680  if (PointerGetDatum(authoritative) != original)
2681  pfree(authoritative);
2682 
2683  return res;
2684 }
2685 
2686 /*
2687  * Callback for estimating effectiveness of abbreviated key optimization, using
2688  * heuristic rules. Returns value indicating if the abbreviation optimization
2689  * should be aborted, based on its projected effectiveness.
2690  */
2691 static bool
2692 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2693 {
2695  double abbrev_distinct,
2696  key_distinct;
2697 
2698  Assert(ssup->abbreviate);
2699 
2700  /* Have a little patience */
2701  if (memtupcount < 100)
2702  return false;
2703 
2704  abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2705  key_distinct = estimateHyperLogLog(&sss->full_card);
2706 
2707  /*
2708  * Clamp cardinality estimates to at least one distinct value. While
2709  * NULLs are generally disregarded, if only NULL values were seen so far,
2710  * that might misrepresent costs if we failed to clamp.
2711  */
2712  if (abbrev_distinct <= 1.0)
2713  abbrev_distinct = 1.0;
2714 
2715  if (key_distinct <= 1.0)
2716  key_distinct = 1.0;
2717 
2718  /*
2719  * In the worst case all abbreviated keys are identical, while at the same
2720  * time there are differences within full key strings not captured in
2721  * abbreviations.
2722  */
2723 #ifdef TRACE_SORT
2724  if (trace_sort)
2725  {
2726  double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2727 
2728  elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2729  "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2730  memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2731  sss->prop_card);
2732  }
2733 #endif
2734 
2735  /*
2736  * If the number of distinct abbreviated keys approximately matches the
2737  * number of distinct authoritative original keys, that's reason enough to
2738  * proceed. We can win even with a very low cardinality set if most
2739  * tie-breakers only memcmp(). This is by far the most important
2740  * consideration.
2741  *
2742  * While comparisons that are resolved at the abbreviated key level are
2743  * considerably cheaper than tie-breakers resolved with memcmp(), both of
2744  * those two outcomes are so much cheaper than a full strcoll() once
2745  * sorting is underway that it doesn't seem worth it to weigh abbreviated
2746  * cardinality against the overall size of the set in order to more
2747  * accurately model costs. Assume that an abbreviated comparison, and an
2748  * abbreviated comparison with a cheap memcmp()-based authoritative
2749  * resolution are equivalent.
2750  */
2751  if (abbrev_distinct > key_distinct * sss->prop_card)
2752  {
2753  /*
2754  * When we have exceeded 10,000 tuples, decay required cardinality
2755  * aggressively for next call.
2756  *
2757  * This is useful because the number of comparisons required on
2758  * average increases at a linearithmic rate, and at roughly 10,000
2759  * tuples that factor will start to dominate over the linear costs of
2760  * string transformation (this is a conservative estimate). The decay
2761  * rate is chosen to be a little less aggressive than halving -- which
2762  * (since we're called at points at which memtupcount has doubled)
2763  * would never see the cost model actually abort past the first call
2764  * following a decay. This decay rate is mostly a precaution against
2765  * a sudden, violent swing in how well abbreviated cardinality tracks
2766  * full key cardinality. The decay also serves to prevent a marginal
2767  * case from being aborted too late, when too much has already been
2768  * invested in string transformation.
2769  *
2770  * It's possible for sets of several million distinct strings with
2771  * mere tens of thousands of distinct abbreviated keys to still
2772  * benefit very significantly. This will generally occur provided
2773  * each abbreviated key is a proxy for a roughly uniform number of the
2774  * set's full keys. If it isn't so, we hope to catch that early and
2775  * abort. If it isn't caught early, by the time the problem is
2776  * apparent it's probably not worth aborting.
2777  */
2778  if (memtupcount > 10000)
2779  sss->prop_card *= 0.65;
2780 
2781  return false;
2782  }
2783 
2784  /*
2785  * Abort abbreviation strategy.
2786  *
2787  * The worst case, where all abbreviated keys are identical while all
2788  * original strings differ will typically only see a regression of about
2789  * 10% in execution time for small to medium sized lists of strings.
2790  * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2791  * often expect very large improvements, particularly with sets of strings
2792  * of moderately high to high abbreviated cardinality. There is little to
2793  * lose but much to gain, which our strategy reflects.
2794  */
2795 #ifdef TRACE_SORT
2796  if (trace_sort)
2797  elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2798  "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2799  memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2800 #endif
2801 
2802  return true;
2803 }
2804 
2805 /*
2806  * Generic equalimage support function for character type's operator classes.
2807  * Disables the use of deduplication with nondeterministic collations.
2808  */
2809 Datum
2811 {
2812  /* Oid opcintype = PG_GETARG_OID(0); */
2813  Oid collid = PG_GET_COLLATION();
2814 
2815  check_collation_set(collid);
2816 
2817  if (lc_collate_is_c(collid) ||
2818  collid == DEFAULT_COLLATION_OID ||
2820  PG_RETURN_BOOL(true);
2821  else
2822  PG_RETURN_BOOL(false);
2823 }
2824 
2825 Datum
2827 {
2828  text *arg1 = PG_GETARG_TEXT_PP(0);
2829  text *arg2 = PG_GETARG_TEXT_PP(1);
2830  text *result;
2831 
2832  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2833 
2834  PG_RETURN_TEXT_P(result);
2835 }
2836 
2837 Datum
2839 {
2840  text *arg1 = PG_GETARG_TEXT_PP(0);
2841  text *arg2 = PG_GETARG_TEXT_PP(1);
2842  text *result;
2843 
2844  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2845 
2846  PG_RETURN_TEXT_P(result);
2847 }
2848 
2849 
2850 /*
2851  * Cross-type comparison functions for types text and name.
2852  */
2853 
2854 Datum
2856 {
2857  Name arg1 = PG_GETARG_NAME(0);
2858  text *arg2 = PG_GETARG_TEXT_PP(1);
2859  size_t len1 = strlen(NameStr(*arg1));
2860  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2861  Oid collid = PG_GET_COLLATION();
2862  bool result;
2863 
2864  check_collation_set(collid);
2865 
2866  if (collid == C_COLLATION_OID)
2867  result = (len1 == len2 &&
2868  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2869  else
2870  result = (varstr_cmp(NameStr(*arg1), len1,
2871  VARDATA_ANY(arg2), len2,
2872  collid) == 0);
2873 
2874  PG_FREE_IF_COPY(arg2, 1);
2875 
2876  PG_RETURN_BOOL(result);
2877 }
2878 
2879 Datum
2881 {
2882  text *arg1 = PG_GETARG_TEXT_PP(0);
2883  Name arg2 = PG_GETARG_NAME(1);
2884  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2885  size_t len2 = strlen(NameStr(*arg2));
2886  Oid collid = PG_GET_COLLATION();
2887  bool result;
2888 
2889  check_collation_set(collid);
2890 
2891  if (collid == C_COLLATION_OID)
2892  result = (len1 == len2 &&
2893  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2894  else
2895  result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2896  NameStr(*arg2), len2,
2897  collid) == 0);
2898 
2899  PG_FREE_IF_COPY(arg1, 0);
2900 
2901  PG_RETURN_BOOL(result);
2902 }
2903 
2904 Datum
2906 {
2907  Name arg1 = PG_GETARG_NAME(0);
2908  text *arg2 = PG_GETARG_TEXT_PP(1);
2909  size_t len1 = strlen(NameStr(*arg1));
2910  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2911  Oid collid = PG_GET_COLLATION();
2912  bool result;
2913 
2914  check_collation_set(collid);
2915 
2916  if (collid == C_COLLATION_OID)
2917  result = !(len1 == len2 &&
2918  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2919  else
2920  result = !(varstr_cmp(NameStr(*arg1), len1,
2921  VARDATA_ANY(arg2), len2,
2922  collid) == 0);
2923 
2924  PG_FREE_IF_COPY(arg2, 1);
2925 
2926  PG_RETURN_BOOL(result);
2927 }
2928 
2929 Datum
2931 {
2932  text *arg1 = PG_GETARG_TEXT_PP(0);
2933  Name arg2 = PG_GETARG_NAME(1);
2934  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2935  size_t len2 = strlen(NameStr(*arg2));
2936  Oid collid = PG_GET_COLLATION();
2937  bool result;
2938 
2939  check_collation_set(collid);
2940 
2941  if (collid == C_COLLATION_OID)
2942  result = !(len1 == len2 &&
2943  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2944  else
2945  result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2946  NameStr(*arg2), len2,
2947  collid) == 0);
2948 
2949  PG_FREE_IF_COPY(arg1, 0);
2950 
2951  PG_RETURN_BOOL(result);
2952 }
2953 
2954 Datum
2956 {
2957  Name arg1 = PG_GETARG_NAME(0);
2958  text *arg2 = PG_GETARG_TEXT_PP(1);
2959  int32 result;
2960 
2961  result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2962  VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2963  PG_GET_COLLATION());
2964 
2965  PG_FREE_IF_COPY(arg2, 1);
2966 
2967  PG_RETURN_INT32(result);
2968 }
2969 
2970 Datum
2972 {
2973  text *arg1 = PG_GETARG_TEXT_PP(0);
2974  Name arg2 = PG_GETARG_NAME(1);
2975  int32 result;
2976 
2977  result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2978  NameStr(*arg2), strlen(NameStr(*arg2)),
2979  PG_GET_COLLATION());
2980 
2981  PG_FREE_IF_COPY(arg1, 0);
2982 
2983  PG_RETURN_INT32(result);
2984 }
2985 
2986 #define CmpCall(cmpfunc) \
2987  DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2988  PG_GET_COLLATION(), \
2989  PG_GETARG_DATUM(0), \
2990  PG_GETARG_DATUM(1)))
2991 
2992 Datum
2994 {
2996 }
2997 
2998 Datum
3000 {
3002 }
3003 
3004 Datum
3006 {
3008 }
3009 
3010 Datum
3012 {
3014 }
3015 
3016 Datum
3018 {
3020 }
3021 
3022 Datum
3024 {
3026 }
3027 
3028 Datum
3030 {
3032 }
3033 
3034 Datum
3036 {
3038 }
3039 
3040 #undef CmpCall
3041 
3042 
3043 /*
3044  * The following operators support character-by-character comparison
3045  * of text datums, to allow building indexes suitable for LIKE clauses.
3046  * Note that the regular texteq/textne comparison operators, and regular
3047  * support functions 1 and 2 with "C" collation are assumed to be
3048  * compatible with these!
3049  */
3050 
3051 static int
3053 {
3054  int result;
3055  int len1,
3056  len2;
3057 
3058  len1 = VARSIZE_ANY_EXHDR(arg1);
3059  len2 = VARSIZE_ANY_EXHDR(arg2);
3060 
3061  result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3062  if (result != 0)
3063  return result;
3064  else if (len1 < len2)
3065  return -1;
3066  else if (len1 > len2)
3067  return 1;
3068  else
3069  return 0;
3070 }
3071 
3072 
3073 Datum
3075 {
3076  text *arg1 = PG_GETARG_TEXT_PP(0);
3077  text *arg2 = PG_GETARG_TEXT_PP(1);
3078  int result;
3079 
3080  result = internal_text_pattern_compare(arg1, arg2);
3081 
3082  PG_FREE_IF_COPY(arg1, 0);
3083  PG_FREE_IF_COPY(arg2, 1);
3084 
3085  PG_RETURN_BOOL(result < 0);
3086 }
3087 
3088 
3089 Datum
3091 {
3092  text *arg1 = PG_GETARG_TEXT_PP(0);
3093  text *arg2 = PG_GETARG_TEXT_PP(1);
3094  int result;
3095 
3096  result = internal_text_pattern_compare(arg1, arg2);
3097 
3098  PG_FREE_IF_COPY(arg1, 0);
3099  PG_FREE_IF_COPY(arg2, 1);
3100 
3101  PG_RETURN_BOOL(result <= 0);
3102 }
3103 
3104 
3105 Datum
3107 {
3108  text *arg1 = PG_GETARG_TEXT_PP(0);
3109  text *arg2 = PG_GETARG_TEXT_PP(1);
3110  int result;
3111 
3112  result = internal_text_pattern_compare(arg1, arg2);
3113 
3114  PG_FREE_IF_COPY(arg1, 0);
3115  PG_FREE_IF_COPY(arg2, 1);
3116 
3117  PG_RETURN_BOOL(result >= 0);
3118 }
3119 
3120 
3121 Datum
3123 {
3124  text *arg1 = PG_GETARG_TEXT_PP(0);
3125  text *arg2 = PG_GETARG_TEXT_PP(1);
3126  int result;
3127 
3128  result = internal_text_pattern_compare(arg1, arg2);
3129 
3130  PG_FREE_IF_COPY(arg1, 0);
3131  PG_FREE_IF_COPY(arg2, 1);
3132 
3133  PG_RETURN_BOOL(result > 0);
3134 }
3135 
3136 
3137 Datum
3139 {
3140  text *arg1 = PG_GETARG_TEXT_PP(0);
3141  text *arg2 = PG_GETARG_TEXT_PP(1);
3142  int result;
3143 
3144  result = internal_text_pattern_compare(arg1, arg2);
3145 
3146  PG_FREE_IF_COPY(arg1, 0);
3147  PG_FREE_IF_COPY(arg2, 1);
3148 
3149  PG_RETURN_INT32(result);
3150 }
3151 
3152 
3153 Datum
3155 {
3157  MemoryContext oldcontext;
3158 
3159  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3160 
3161  /* Use generic string SortSupport, forcing "C" collation */
3162  varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3163 
3164  MemoryContextSwitchTo(oldcontext);
3165 
3166  PG_RETURN_VOID();
3167 }
3168 
3169 
3170 /*-------------------------------------------------------------
3171  * byteaoctetlen
3172  *
3173  * get the number of bytes contained in an instance of type 'bytea'
3174  *-------------------------------------------------------------
3175  */
3176 Datum
3178 {
3179  Datum str = PG_GETARG_DATUM(0);
3180 
3181  /* We need not detoast the input at all */
3183 }
3184 
3185 /*
3186  * byteacat -
3187  * takes two bytea* and returns a bytea* that is the concatenation of
3188  * the two.
3189  *
3190  * Cloned from textcat and modified as required.
3191  */
3192 Datum
3194 {
3195  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3196  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3197 
3199 }
3200 
3201 /*
3202  * bytea_catenate
3203  * Guts of byteacat(), broken out so it can be used by other functions
3204  *
3205  * Arguments can be in short-header form, but not compressed or out-of-line
3206  */
3207 static bytea *
3209 {
3210  bytea *result;
3211  int len1,
3212  len2,
3213  len;
3214  char *ptr;
3215 
3216  len1 = VARSIZE_ANY_EXHDR(t1);
3217  len2 = VARSIZE_ANY_EXHDR(t2);
3218 
3219  /* paranoia ... probably should throw error instead? */
3220  if (len1 < 0)
3221  len1 = 0;
3222  if (len2 < 0)
3223  len2 = 0;
3224 
3225  len = len1 + len2 + VARHDRSZ;
3226  result = (bytea *) palloc(len);
3227 
3228  /* Set size of result string... */
3229  SET_VARSIZE(result, len);
3230 
3231  /* Fill data field of result string... */
3232  ptr = VARDATA(result);
3233  if (len1 > 0)
3234  memcpy(ptr, VARDATA_ANY(t1), len1);
3235  if (len2 > 0)
3236  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3237 
3238  return result;
3239 }
3240 
3241 #define PG_STR_GET_BYTEA(str_) \
3242  DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3243 
3244 /*
3245  * bytea_substr()
3246  * Return a substring starting at the specified position.
3247  * Cloned from text_substr and modified as required.
3248  *
3249  * Input:
3250  * - string
3251  * - starting position (is one-based)
3252  * - string length (optional)
3253  *
3254  * If the starting position is zero or less, then return from the start of the string
3255  * adjusting the length to be consistent with the "negative start" per SQL.
3256  * If the length is less than zero, an ERROR is thrown. If no third argument
3257  * (length) is provided, the length to the end of the string is assumed.
3258  */
3259 Datum
3261 {
3263  PG_GETARG_INT32(1),
3264  PG_GETARG_INT32(2),
3265  false));
3266 }
3267 
3268 /*
3269  * bytea_substr_no_len -
3270  * Wrapper to avoid opr_sanity failure due to
3271  * one function accepting a different number of args.
3272  */
3273 Datum
3275 {
3277  PG_GETARG_INT32(1),
3278  -1,
3279  true));
3280 }
3281 
3282 static bytea *
3284  int S,
3285  int L,
3286  bool length_not_specified)
3287 {
3288  int32 S1; /* adjusted start position */
3289  int32 L1; /* adjusted substring length */
3290  int32 E; /* end position */
3291 
3292  /*
3293  * The logic here should generally match text_substring().
3294  */
3295  S1 = Max(S, 1);
3296 
3297  if (length_not_specified)
3298  {
3299  /*
3300  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3301  * end of the string if we pass it a negative value for length.
3302  */
3303  L1 = -1;
3304  }
3305  else if (L < 0)
3306  {
3307  /* SQL99 says to throw an error for E < S, i.e., negative length */
3308  ereport(ERROR,
3309  (errcode(ERRCODE_SUBSTRING_ERROR),
3310  errmsg("negative substring length not allowed")));
3311  L1 = -1; /* silence stupider compilers */
3312  }
3313  else if (pg_add_s32_overflow(S, L, &E))
3314  {
3315  /*
3316  * L could be large enough for S + L to overflow, in which case the
3317  * substring must run to end of string.
3318  */
3319  L1 = -1;
3320  }
3321  else
3322  {
3323  /*
3324  * A zero or negative value for the end position can happen if the
3325  * start was negative or one. SQL99 says to return a zero-length
3326  * string.
3327  */
3328  if (E < 1)
3329  return PG_STR_GET_BYTEA("");
3330 
3331  L1 = E - S1;
3332  }
3333 
3334  /*
3335  * If the start position is past the end of the string, SQL99 says to
3336  * return a zero-length string -- DatumGetByteaPSlice() will do that for
3337  * us. We need only convert S1 to zero-based starting position.
3338  */
3339  return DatumGetByteaPSlice(str, S1 - 1, L1);
3340 }
3341 
3342 /*
3343  * byteaoverlay
3344  * Replace specified substring of first string with second
3345  *
3346  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3347  * This code is a direct implementation of what the standard says.
3348  */
3349 Datum
3351 {
3352  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3353  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3354  int sp = PG_GETARG_INT32(2); /* substring start position */
3355  int sl = PG_GETARG_INT32(3); /* substring length */
3356 
3357  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3358 }
3359 
3360 Datum
3362 {
3363  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3364  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3365  int sp = PG_GETARG_INT32(2); /* substring start position */
3366  int sl;
3367 
3368  sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3369  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3370 }
3371 
3372 static bytea *
3373 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3374 {
3375  bytea *result;
3376  bytea *s1;
3377  bytea *s2;
3378  int sp_pl_sl;
3379 
3380  /*
3381  * Check for possible integer-overflow cases. For negative sp, throw a
3382  * "substring length" error because that's what should be expected
3383  * according to the spec's definition of OVERLAY().
3384  */
3385  if (sp <= 0)
3386  ereport(ERROR,
3387  (errcode(ERRCODE_SUBSTRING_ERROR),
3388  errmsg("negative substring length not allowed")));
3389  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3390  ereport(ERROR,
3391  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3392  errmsg("integer out of range")));
3393 
3394  s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3395  s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3396  result = bytea_catenate(s1, t2);
3397  result = bytea_catenate(result, s2);
3398 
3399  return result;
3400 }
3401 
3402 /*
3403  * bit_count
3404  */
3405 Datum
3407 {
3408  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3409 
3411 }
3412 
3413 /*
3414  * byteapos -
3415  * Return the position of the specified substring.
3416  * Implements the SQL POSITION() function.
3417  * Cloned from textpos and modified as required.
 *
 * Argument 0 (t1) is the value that is searched; argument 1 (t2) is the
 * pattern looked for.  The result is the 1-based byte offset of the first
 * occurrence of t2 within t1, 0 if t2 does not occur in t1, and 1 if t2 is
 * empty.
 *
 * The search is a plain left-to-right scan that tries every possible start
 * offset in turn.
3418  */
3419 Datum
3421 {
3422  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3423  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3424  int pos;
3425  int px,
3426  p;
3427  int len1,
3428  len2;
3429  char *p1,
3430  *p2;
3431 
3432  len1 = VARSIZE_ANY_EXHDR(t1);
3433  len2 = VARSIZE_ANY_EXHDR(t2);
3434 
3435  if (len2 <= 0)
3436  PG_RETURN_INT32(1); /* result for empty pattern */
3437 
3438  p1 = VARDATA_ANY(t1);
3439  p2 = VARDATA_ANY(t2);
3440 
3441  pos = 0; /* stays 0 if no match is found */
 /*
  * px is the last start offset at which the whole pattern still fits in
  * t1.  It is negative when the pattern is longer than t1, in which case
  * the loop body never runs.
  */
3442  px = (len1 - len2);
3443  for (p = 0; p <= px; p++)
3444  {
 /* compare the first byte before paying for the full memcmp */
3445  if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3446  {
3447  pos = p + 1; /* convert 0-based offset to 1-based position */
3448  break;
3449  };
3450  p1++;
3451  };
3452 
3453  PG_RETURN_INT32(pos);
3454 }
3455 
3456 /*-------------------------------------------------------------
3457  * byteaGetByte
3458  *
3459  * this routine treats "bytea" as an array of bytes.
3460  * It returns the Nth byte (a number between 0 and 255).
 *
 * Argument 0 is the bytea value and argument 1 is the zero-based byte
 * index n.  An index outside 0 .. length-1 raises
 * ERRCODE_ARRAY_SUBSCRIPT_ERROR.
3461  *-------------------------------------------------------------
3462  */
3463 Datum
3465 {
3466  bytea *v = PG_GETARG_BYTEA_PP(0);
3467  int32 n = PG_GETARG_INT32(1);
3468  int len;
3469  int byte;
3470 
3471  len = VARSIZE_ANY_EXHDR(v);
3472 
3473  if (n < 0 || n >= len)
3474  ereport(ERROR,
3475  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3476  errmsg("index %d out of valid range, 0..%d",
3477  n, len - 1)));
3478 
 /* read through unsigned char so the value is never negative */
3479  byte = ((unsigned char *) VARDATA_ANY(v))[n];
3480 
3481  PG_RETURN_INT32(byte);
3482 }
3483 
3484 /*-------------------------------------------------------------
3485  * byteaGetBit
3486  *
3487  * This routine treats a "bytea" type like an array of bits.
3488  * It returns the value of the Nth bit (0 or 1).
3489  *
 * Argument 0 is the bytea value and argument 1 is the zero-based bit
 * index n, taken as an int64.  Bit n is found in byte n / 8; within that
 * byte it is selected with the mask (1 << (n % 8)), so bit numbers count
 * up from the least significant bit of each byte.  An index outside
 * 0 .. 8*length-1 raises ERRCODE_ARRAY_SUBSCRIPT_ERROR.
 *
3490  *-------------------------------------------------------------
3491  */
3492 Datum
3494 {
3495  bytea *v = PG_GETARG_BYTEA_PP(0);
3496  int64 n = PG_GETARG_INT64(1);
3497  int byteNo,
3498  bitNo;
3499  int len;
3500  int byte;
3501 
3502  len = VARSIZE_ANY_EXHDR(v);
3503 
 /* do the multiplication in 64 bits so that len * 8 cannot overflow int */
3504  if (n < 0 || n >= (int64) len * 8)
3505  ereport(ERROR,
3506  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3507  errmsg("index %lld out of valid range, 0..%lld",
3508  (long long) n, (long long) len * 8 - 1)));
3509 
3510  /* n/8 is now known < len, so safe to cast to int */
3511  byteNo = (int) (n / 8);
3512  bitNo = (int) (n % 8);
3513 
3514  byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3515 
3516  if (byte & (1 << bitNo))
3517  PG_RETURN_INT32(1);
3518  else
3519  PG_RETURN_INT32(0);
3520 }
3521 
3522 /*-------------------------------------------------------------
3523  * byteaSetByte
3524  *
3525  * Given an instance of type 'bytea' creates a new one with
3526  * the Nth byte set to the given value.
3527  *
 * Argument 1 is the zero-based byte index n and argument 2 is the new byte
 * value.  An index outside 0 .. length-1 raises
 * ERRCODE_ARRAY_SUBSCRIPT_ERROR.  The new value is stored through an
 * unsigned char pointer, so only what fits in one byte is kept; no range
 * check is made on it here.
 *
 * NOTE(review): the function-name line, the declaration of "res" and the
 * final return statement are not visible in this listing.  Presumably
 * "res" is a writable, fully detoasted copy of argument 0 (the code below
 * uses VARSIZE/VARDATA rather than the _ANY variants and writes into it)
 * -- TODO confirm against the full source.
 *
3528  *-------------------------------------------------------------
3529  */
3530 Datum
3532 {
3534  int32 n = PG_GETARG_INT32(1);
3535  int32 newByte = PG_GETARG_INT32(2);
3536  int len;
3537 
3538  len = VARSIZE(res) - VARHDRSZ; /* payload length, header excluded */
3539 
3540  if (n < 0 || n >= len)
3541  ereport(ERROR,
3542  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3543  errmsg("index %d out of valid range, 0..%d",
3544  n, len - 1)));
3545 
3546  /*
3547  * Now set the byte.
3548  */
3549  ((unsigned char *) VARDATA(res))[n] = newByte;
3550 
3552 }
3553 
3554 /*-------------------------------------------------------------
3555  * byteaSetBit
3556  *
3557  * Given an instance of type 'bytea' creates a new one with
3558  * the Nth bit set to the given value.
3559  *
 * Argument 1 is the zero-based bit index n (int64) and argument 2 is the
 * new bit value, which must be 0 or 1.  Bit n lives in byte n / 8 and is
 * addressed with the mask (1 << (n % 8)).
 *
 * Errors:
 *   - n outside 0 .. 8*length-1: ERRCODE_ARRAY_SUBSCRIPT_ERROR
 *   - new bit neither 0 nor 1: ERRCODE_INVALID_PARAMETER_VALUE
 * The index is checked before the bit value.
 *
 * NOTE(review): the function-name line, the declaration of "res" and the
 * final return statement are not visible in this listing.  Presumably
 * "res" is a writable, fully detoasted copy of argument 0 -- TODO confirm
 * against the full source.
 *
3560  *-------------------------------------------------------------
3561  */
3562 Datum
3564 {
3566  int64 n = PG_GETARG_INT64(1);
3567  int32 newBit = PG_GETARG_INT32(2);
3568  int len;
3569  int oldByte,
3570  newByte;
3571  int byteNo,
3572  bitNo;
3573 
3574  len = VARSIZE(res) - VARHDRSZ; /* payload length, header excluded */
3575 
 /* do the multiplication in 64 bits so that len * 8 cannot overflow int */
3576  if (n < 0 || n >= (int64) len * 8)
3577  ereport(ERROR,
3578  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3579  errmsg("index %lld out of valid range, 0..%lld",
3580  (long long) n, (long long) len * 8 - 1)));
3581 
3582  /* n/8 is now known < len, so safe to cast to int */
3583  byteNo = (int) (n / 8);
3584  bitNo = (int) (n % 8);
3585 
3586  /*
3587  * sanity check!
3588  */
3589  if (newBit != 0 && newBit != 1)
3590  ereport(ERROR,
3591  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3592  errmsg("new bit must be 0 or 1")));
3593 
3594  /*
3595  * Update the byte.
3596  */
3597  oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3598 
3599  if (newBit == 0)
3600  newByte = oldByte & (~(1 << bitNo)); /* clear the selected bit */
3601  else
3602  newByte = oldByte | (1 << bitNo); /* set the selected bit */
3603 
3604  ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3605 
3607 }
3608 
3609 
3610 /* text_name()
3611  * Converts a text type to a Name type.
 *
 * The result is a freshly palloc0'd buffer of NAMEDATALEN bytes into which
 * the first "len" bytes of the text payload are copied; the remainder of
 * the buffer stays zero.
 *
 * NOTE(review): the function-name line and the statement controlled by the
 * "if (len >= NAMEDATALEN)" test are not visible in this listing.
 * Presumably that statement shortens len so that the copy fits in the
 * buffer with a terminating zero -- TODO confirm against the full source.
3612  */
3613 Datum
3615 {
3616  text *s = PG_GETARG_TEXT_PP(0);
3617  Name result;
3618  int len;
3619 
3620  len = VARSIZE_ANY_EXHDR(s);
3621 
3622  /* Truncate oversize input */
3623  if (len >= NAMEDATALEN)
3625 
3626  /* We use palloc0 here to ensure result is zero-padded */
3627  result = (Name) palloc0(NAMEDATALEN);
3628  memcpy(NameStr(*result), VARDATA_ANY(s), len);
3629 
3630  PG_RETURN_NAME(result);
3631 }
3632 
3633 /* name_text()
3634  * Converts a Name type to a text type.
 *
 * Argument 0 is fetched as a Name.
 *
 * NOTE(review): the function-name line and the statement that builds and
 * returns the text result are not visible in this listing.
3635  */
3636 Datum
3638 {
3639  Name s = PG_GETARG_NAME(0);
3640 
3642 }
3643 
3644 
3645 /*
3646  * textToQualifiedNameList - convert a text object to list of names
3647  *
3648  * This implements the input parsing needed by nextval() and other
3649  * functions that take a text parameter representing a qualified name.
3650  * We split the name at dots, downcase if not double-quoted, and
3651  * truncate names if they're too long.
 *
 * The text value ("textval" in the body) is converted to a C string and
 * handed to SplitIdentifierString() with '.' as the separator.  Each
 * resulting name is pstrdup'd and wrapped with makeString(), so the
 * returned list does not point into the temporary C string, which is
 * freed before returning.
 *
 * Raises ERRCODE_INVALID_NAME ("invalid name syntax") if the string cannot
 * be split, or if it yields no names at all.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * not visible in this listing.
3652  */
3653 List *
3655 {
3656  char *rawname;
3657  List *result = NIL;
3658  List *namelist;
3659  ListCell *l;
3660 
3661  /* Convert to C string (handles possible detoasting). */
3662  /* Note we rely on being able to modify rawname below. */
3663  rawname = text_to_cstring(textval);
3664 
3665  if (!SplitIdentifierString(rawname, '.', &namelist))
3666  ereport(ERROR,
3667  (errcode(ERRCODE_INVALID_NAME),
3668  errmsg("invalid name syntax")));
3669 
 /* SplitIdentifierString accepts an empty string, but we do not */
3670  if (namelist == NIL)
3671  ereport(ERROR,
3672  (errcode(ERRCODE_INVALID_NAME),
3673  errmsg("invalid name syntax")));
3674 
3675  foreach(l, namelist)
3676  {
3677  char *curname = (char *) lfirst(l);
3678 
 /* copy the name, since curname points into rawname */
3679  result = lappend(result, makeString(pstrdup(curname)));
3680  }
3681 
3682  pfree(rawname);
3683  list_free(namelist);
3684 
3685  return result;
3686 }
3687 
3688 /*
3689  * SplitIdentifierString --- parse a string containing identifiers
3690  *
3691  * This is the guts of textToQualifiedNameList, and is exported for use in
3692  * other situations such as parsing GUC variables. In the GUC case, it's
3693  * important to avoid memory leaks, so the API is designed to minimize the
3694  * amount of stuff that needs to be allocated and freed.
3695  *
3696  * Inputs:
3697  * rawstring: the input string; must be overwritable! On return, it's
3698  * been modified to contain the separated identifiers.
3699  * separator: the separator punctuation expected between identifiers
3700  * (typically '.' or ','). Whitespace may also appear around
3701  * identifiers.
3702  * Outputs:
3703  * namelist: filled with a palloc'd list of pointers to identifiers within
3704  * rawstring. Caller should list_free() this even on error return.
3705  *
3706  * Returns true if okay, false if there is a syntax error in the string.
3707  *
3708  * Note that an empty string is considered okay here, though not in
3709  * textToQualifiedNameList.
3710  */
3711 bool
3712 SplitIdentifierString(char *rawstring, char separator,
3713  List **namelist)
3714 {
3715  char *nextp = rawstring;
3716  bool done = false;
3717 
3718  *namelist = NIL;
3719 
3720  while (scanner_isspace(*nextp))
3721  nextp++; /* skip leading whitespace */
3722 
3723  if (*nextp == '\0')
3724  return true; /* allow empty string */
3725 
3726  /* At the top of the loop, we are at start of a new identifier. */
3727  do
3728  {
3729  char *curname;
3730  char *endp;
3731 
3732  if (*nextp == '"')
3733  {
3734  /* Quoted name --- collapse quote-quote pairs, no downcasing */
3735  curname = nextp + 1;
3736  for (;;)
3737  {
3738  endp = strchr(nextp + 1, '"');
3739  if (endp == NULL)
3740  return false; /* mismatched quotes */
3741  if (endp[1] != '"')
3742  break; /* found end of quoted name */
3743  /* Collapse adjacent quotes into one quote, and look again */
3744  memmove(endp, endp + 1, strlen(endp));
3745  nextp = endp;
3746  }
3747  /* endp now points at the terminating quote */
3748  nextp = endp + 1;
3749  }
3750  else
3751  {
3752  /* Unquoted name --- extends to separator or whitespace */
3753  char *downname;
3754  int len;
3755 
3756  curname = nextp;
3757  while (*nextp && *nextp != separator &&
3758  !scanner_isspace(*nextp))
3759  nextp++;
3760  endp = nextp;
3761  if (curname == nextp)
3762  return false; /* empty unquoted name not allowed */
3763 
3764  /*
3765  * Downcase the identifier, using same code as main lexer does.
3766  *
3767  * XXX because we want to overwrite the input in-place, we cannot
3768  * support a downcasing transformation that increases the string
3769  * length. This is not a problem given the current implementation
3770  * of downcase_truncate_identifier, but we'll probably have to do
3771  * something about this someday.
3772  */
3773  len = endp - curname;
3774  downname = downcase_truncate_identifier(curname, len, false);
3775  Assert(strlen(downname) <= len);
3776  strncpy(curname, downname, len); /* strncpy is required here */
3777  pfree(downname);
3778  }
3779 
3780  while (scanner_isspace(*nextp))
3781  nextp++; /* skip trailing whitespace */
3782 
3783  if (*nextp == separator)
3784  {
3785  nextp++;
3786  while (scanner_isspace(*nextp))
3787  nextp++; /* skip leading whitespace for next */
3788  /* we expect another name, so done remains false */
3789  }
3790  else if (*nextp == '\0')
3791  done = true;
3792  else
3793  return false; /* invalid syntax */
3794 
3795  /* Now safe to overwrite separator with a null */
3796  *endp = '\0';
3797 
3798  /* Truncate name if it's overlength */
3799  truncate_identifier(curname, strlen(curname), false);
3800 
3801  /*
3802  * Finished isolating current name --- add it to list
3803  */
3804  *namelist = lappend(*namelist, curname);
3805 
3806  /* Loop back if we didn't reach end of string */
3807  } while (!done);
3808 
3809  return true;
3810 }
3811 
3812 
3813 /*
3814  * SplitDirectoriesString --- parse a string containing file/directory names
3815  *
3816  * This works fine on file names too; the function name is historical.
3817  *
3818  * This is similar to SplitIdentifierString, except that the parsing
3819  * rules are meant to handle pathnames instead of identifiers: there is
3820  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3821  * and we apply canonicalize_path() to each extracted string. Because of the
3822  * last, the returned strings are separately palloc'd rather than being
3823  * pointers into rawstring --- but we still scribble on rawstring.
3824  *
3825  * Inputs:
3826  * rawstring: the input string; must be modifiable!
3827  * separator: the separator punctuation expected between directories
3828  * (typically ',' or ';'). Whitespace may also appear around
3829  * directories.
3830  * Outputs:
3831  * namelist: filled with a palloc'd list of directory names.
3832  * Caller should list_free_deep() this even on error return.
3833  *
3834  * Returns true if okay, false if there is a syntax error in the string.
3835  *
3836  * Note that an empty string is considered okay here.
3837  */
3838 bool
3839 SplitDirectoriesString(char *rawstring, char separator,
3840  List **namelist)
3841 {
3842  char *nextp = rawstring;
3843  bool done = false;
3844 
3845  *namelist = NIL;
3846 
3847  while (scanner_isspace(*nextp))
3848  nextp++; /* skip leading whitespace */
3849 
3850  if (*nextp == '\0')
3851  return true; /* allow empty string */
3852 
3853  /* At the top of the loop, we are at start of a new directory. */
3854  do
3855  {
3856  char *curname;
3857  char *endp;
3858 
3859  if (*nextp == '"')
3860  {
3861  /* Quoted name --- collapse quote-quote pairs */
3862  curname = nextp + 1;
3863  for (;;)
3864  {
3865  endp = strchr(nextp + 1, '"');
3866  if (endp == NULL)
3867  return false; /* mismatched quotes */
3868  if (endp[1] != '"')
3869  break; /* found end of quoted name */
3870  /* Collapse adjacent quotes into one quote, and look again */
3871  memmove(endp, endp + 1, strlen(endp));
3872  nextp = endp;
3873  }
3874  /* endp now points at the terminating quote */
3875  nextp = endp + 1;
3876  }
3877  else
3878  {
3879  /* Unquoted name --- extends to separator or end of string */
3880  curname = endp = nextp;
3881  while (*nextp && *nextp != separator)
3882  {
3883  /* trailing whitespace should not be included in name */
3884  if (!scanner_isspace(*nextp))
3885  endp = nextp + 1;
3886  nextp++;
3887  }
3888  if (curname == endp)
3889  return false; /* empty unquoted name not allowed */
3890  }
3891 
3892  while (scanner_isspace(*nextp))
3893  nextp++; /* skip trailing whitespace */
3894 
3895  if (*nextp == separator)
3896  {
3897  nextp++;
3898  while (scanner_isspace(*nextp))
3899  nextp++; /* skip leading whitespace for next */
3900  /* we expect another name, so done remains false */
3901  }
3902  else if (*nextp == '\0')
3903  done = true;
3904  else
3905  return false; /* invalid syntax */
3906 
3907  /* Now safe to overwrite separator with a null */
3908  *endp = '\0';
3909 
3910  /* Truncate path if it's overlength */
3911  if (strlen(curname) >= MAXPGPATH)
3912  curname[MAXPGPATH - 1] = '\0';
3913 
3914  /*
3915  * Finished isolating current name --- add it to list
3916  */
3917  curname = pstrdup(curname);
3918  canonicalize_path(curname);
3919  *namelist = lappend(*namelist, curname);
3920 
3921  /* Loop back if we didn't reach end of string */
3922  } while (!done);
3923 
3924  return true;
3925 }
3926 
3927 
3928 /*
3929  * SplitGUCList --- parse a string containing identifiers or file names
3930  *
3931  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3932  * presuming whether the elements will be taken as identifiers or file names.
3933  * We assume the input has already been through flatten_set_variable_args(),
3934  * so that we need never downcase (if appropriate, that was done already).
3935  * Nor do we ever truncate, since we don't know the correct max length.
3936  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3937  * because any embedded whitespace should have led to double-quoting).
3938  * Otherwise the API is identical to SplitIdentifierString.
3939  *
3940  * XXX it's annoying to have so many copies of this string-splitting logic.
3941  * However, it's not clear that having one function with a bunch of option
3942  * flags would be much better.
3943  *
3944  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3945  * Be sure to update that if you have to change this.
3946  *
3947  * Inputs:
3948  * rawstring: the input string; must be overwritable! On return, it's
3949  * been modified to contain the separated identifiers.
3950  * separator: the separator punctuation expected between identifiers
3951  * (typically '.' or ','). Whitespace may also appear around
3952  * identifiers.
3953  * Outputs:
3954  * namelist: filled with a palloc'd list of pointers to identifiers within
3955  * rawstring. Caller should list_free() this even on error return.
3956  *
3957  * Returns true if okay, false if there is a syntax error in the string.
3958  */
3959 bool
3960 SplitGUCList(char *rawstring, char separator,
3961  List **namelist)
3962 {
3963  char *nextp = rawstring;
3964  bool done = false;
3965 
3966  *namelist = NIL;
3967 
3968  while (scanner_isspace(*nextp))
3969  nextp++; /* skip leading whitespace */
3970 
3971  if (*nextp == '\0')
3972  return true; /* allow empty string */
3973 
3974  /* At the top of the loop, we are at start of a new identifier. */
3975  do
3976  {
3977  char *curname;
3978  char *endp;
3979 
3980  if (*nextp == '"')
3981  {
3982  /* Quoted name --- collapse quote-quote pairs */
3983  curname = nextp + 1;
3984  for (;;)
3985  {
3986  endp = strchr(nextp + 1, '"');
3987  if (endp == NULL)
3988  return false; /* mismatched quotes */
3989  if (endp[1] != '"')
3990  break; /* found end of quoted name */
3991  /* Collapse adjacent quotes into one quote, and look again */
3992  memmove(endp, endp + 1, strlen(endp));
3993  nextp = endp;
3994  }
3995  /* endp now points at the terminating quote */
3996  nextp = endp + 1;
3997  }
3998  else
3999  {
4000  /* Unquoted name --- extends to separator or whitespace */
4001  curname = nextp;
4002  while (*nextp && *nextp != separator &&
4003  !scanner_isspace(*nextp))
4004  nextp++;
4005  endp = nextp;
4006  if (curname == nextp)
4007  return false; /* empty unquoted name not allowed */
4008  }
4009 
4010  while (scanner_isspace(*nextp))
4011  nextp++; /* skip trailing whitespace */
4012 
4013  if (*nextp == separator)
4014  {
4015  nextp++;
4016  while (scanner_isspace(*nextp))
4017  nextp++; /* skip leading whitespace for next */
4018  /* we expect another name, so done remains false */
4019  }
4020  else if (*nextp == '\0')
4021  done = true;
4022  else
4023  return false; /* invalid syntax */
4024 
4025  /* Now safe to overwrite separator with a null */
4026  *endp = '\0';
4027 
4028  /*
4029  * Finished isolating current name --- add it to list
4030  */
4031  *namelist = lappend(*namelist, curname);
4032 
4033  /* Loop back if we didn't reach end of string */
4034  } while (!done);
4035 
4036  return true;
4037 }
4038 
4039 
4040 /*****************************************************************************
4041  * Comparison Functions used for bytea
4042  *
4043  * Note: btree indexes need these routines not to leak memory; therefore,
4044  * be careful to free working copies of toasted datums. Most places don't
4045  * need to be so careful.
4046  *****************************************************************************/
4047 
/*
 * Equality test for two bytea values (arguments 0 and 1).
 *
 * The arguments are fetched as raw Datums so that their sizes can be
 * compared first; the values themselves are only fetched as bytea when the
 * sizes are equal.  Any working copies made in that step are released
 * before returning.
 *
 * NOTE(review): the line carrying the function name is not visible in this
 * listing.
 */
4048 Datum
4050 {
4051  Datum arg1 = PG_GETARG_DATUM(0);
4052  Datum arg2 = PG_GETARG_DATUM(1);
4053  bool result;
4054  Size len1,
4055  len2;
4056 
4057  /*
4058  * We can use a fast path for unequal lengths, which might save us from
4059  * having to detoast one or both values.
4060  */
4061  len1 = toast_raw_datum_size(arg1);
4062  len2 = toast_raw_datum_size(arg2);
4063  if (len1 != len2)
4064  result = false;
4065  else
4066  {
4067  bytea *barg1 = DatumGetByteaPP(arg1);
4068  bytea *barg2 = DatumGetByteaPP(arg2);
4069 
 /*
  * VARHDRSZ is subtracted from the raw size to get the number of
  * payload bytes to compare (presumably the raw size counts a
  * standard-size header -- confirm against toast_raw_datum_size).
  */
4070  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4071  len1 - VARHDRSZ) == 0);
4072 
4073  PG_FREE_IF_COPY(barg1, 0);
4074  PG_FREE_IF_COPY(barg2, 1);
4075  }
4076 
4077  PG_RETURN_BOOL(result);
4078 }
4079 
/*
 * Inequality test for two bytea values (arguments 0 and 1).
 *
 * Values of different size are unequal without further inspection; only
 * when the sizes match are the values fetched as bytea and their payloads
 * compared with memcmp.  Any working copies made in that step are released
 * before returning.
 *
 * NOTE(review): the line carrying the function name is not visible in this
 * listing.
 */
4080 Datum
4082 {
4083  Datum arg1 = PG_GETARG_DATUM(0);
4084  Datum arg2 = PG_GETARG_DATUM(1);
4085  bool result;
4086  Size len1,
4087  len2;
4088 
4089  /*
4090  * We can use a fast path for unequal lengths, which might save us from
4091  * having to detoast one or both values.
4092  */
4093  len1 = toast_raw_datum_size(arg1);
4094  len2 = toast_raw_datum_size(arg2);
4095  if (len1 != len2)
4096  result = true;
4097  else
4098  {
4099  bytea *barg1 = DatumGetByteaPP(arg1);
4100  bytea *barg2 = DatumGetByteaPP(arg2);
4101 
 /* compare payload bytes only: raw size minus VARHDRSZ */
4102  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4103  len1 - VARHDRSZ) != 0);
4104 
4105  PG_FREE_IF_COPY(barg1, 0);
4106  PG_FREE_IF_COPY(barg2, 1);
4107  }
4108 
4109  PG_RETURN_BOOL(result);
4110 }
4111 
/*
 * Less-than test for two bytea values (arguments 0 and 1).
 *
 * The values are ordered bytewise with memcmp over their common length;
 * if one is a prefix of the other, the shorter one sorts first.
 *
 * NOTE(review): the line carrying the function name is not visible in this
 * listing.
 */
4112 Datum
4114 {
4115  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4116  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4117  int len1,
4118  len2;
4119  int cmp;
4120 
4121  len1 = VARSIZE_ANY_EXHDR(arg1);
4122  len2 = VARSIZE_ANY_EXHDR(arg2);
4123 
4124  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4125 
 /* the data is no longer needed; only cmp and the lengths are used below */
4126  PG_FREE_IF_COPY(arg1, 0);
4127  PG_FREE_IF_COPY(arg2, 1);
4128 
4129  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4130 }
4131 
/*
 * Less-than-or-equal test for two bytea values (arguments 0 and 1).
 *
 * The values are ordered bytewise with memcmp over their common length;
 * if the common part is identical, the lengths decide.
 *
 * NOTE(review): the line carrying the function name is not visible in this
 * listing.
 */
4132 Datum
4134 {
4135  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4136  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4137  int len1,
4138  len2;
4139  int cmp;
4140 
4141  len1 = VARSIZE_ANY_EXHDR(arg1);
4142  len2 = VARSIZE_ANY_EXHDR(arg2);
4143 
4144  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4145 
 /* the data is no longer needed; only cmp and the lengths are used below */
4146  PG_FREE_IF_COPY(arg1, 0);
4147  PG_FREE_IF_COPY(arg2, 1);
4148 
4149  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4150 }
4151 
/*
 * Greater-than test for two bytea values (arguments 0 and 1).
 *
 * The values are ordered bytewise with memcmp over their common length;
 * if one is a prefix of the other, the longer one sorts last.
 *
 * NOTE(review): the line carrying the function name is not visible in this
 * listing.
 */
4152 Datum
4154 {
4155  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4156  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4157  int len1,
4158  len2;
4159  int cmp;
4160 
4161  len1 = VARSIZE_ANY_EXHDR(arg1);
4162  len2 = VARSIZE_ANY_EXHDR(arg2);
4163 
4164  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4165 
 /* the data is no longer needed; only cmp and the lengths are used below */
4166  PG_FREE_IF_COPY(arg1, 0);
4167  PG_FREE_IF_COPY(arg2, 1);
4168 
4169  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4170 }
4171 
/*
 * Greater-than-or-equal test for two bytea values (arguments 0 and 1).
 *
 * The values are ordered bytewise with memcmp over their common length;
 * if the common part is identical, the lengths decide.
 *
 * NOTE(review): the line carrying the function name is not visible in this
 * listing.
 */
4172 Datum
4174 {
4175  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4176  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4177  int len1,
4178  len2;
4179  int cmp;
4180 
4181  len1 = VARSIZE_ANY_EXHDR(arg1);
4182  len2 = VARSIZE_ANY_EXHDR(arg2);
4183 
4184  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4185 
 /* the data is no longer needed; only cmp and the lengths are used below */
4186  PG_FREE_IF_COPY(arg1, 0);
4187  PG_FREE_IF_COPY(arg2, 1);
4188 
4189  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4190 }
4191 
/*
 * Three-way comparison of two bytea values (arguments 0 and 1).
 *
 * cmp starts as the memcmp result over the common length of the two
 * values.  If that is a tie but the lengths differ, cmp becomes -1 when
 * the first value is shorter and 1 when it is longer.
 *
 * NOTE(review): the function-name line and the final return statement are
 * not visible in this listing; presumably cmp is what is returned -- TODO
 * confirm against the full source.
 */
4192 Datum
4194 {
4195  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4196  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4197  int len1,
4198  len2;
4199  int cmp;
4200 
4201  len1 = VARSIZE_ANY_EXHDR(arg1);
4202  len2 = VARSIZE_ANY_EXHDR(arg2);
4203 
4204  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
 /* identical common prefix: the shorter value sorts first */
4205  if ((cmp == 0) && (len1 != len2))
4206  cmp = (len1 < len2) ? -1 : 1;
4207 
4208  PG_FREE_IF_COPY(arg1, 0);
4209  PG_FREE_IF_COPY(arg2, 1);
4210 
4212 }
4213 
/*
 * Sort-support setup for bytea.
 *
 * Delegates to varstr_sortsupport() with BYTEAOID and C_COLLATION_OID,
 * making the call with ssup->ssup_cxt as the current memory context and
 * restoring the previous context afterwards.
 *
 * NOTE(review): the function-name line and the declaration of "ssup" are
 * not visible in this listing.
 */
4214 Datum
4216 {
4218  MemoryContext oldcontext;
4219 
 /* run the setup in the sort-support context supplied by the caller */
4220  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4221 
4222  /* Use generic string SortSupport, forcing "C" collation */
4223  varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4224 
4225  MemoryContextSwitchTo(oldcontext);
4226 
4227  PG_RETURN_VOID();
4228 }
4229 
4230 /*
4231  * appendStringInfoText
4232  *
4233  * Append a text to str.
4234  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
 *
 * NOTE(review): the line carrying the function name and parameter list,
 * and the single statement of the body, are not visible in this listing.
4235  */
4236 static void
4238 {
4240 }
4241 
4242 /*
4243  * replace_text
4244  * replace all occurrences of 'old_sub_str' in 'orig_str'
4245  * with 'new_sub_str' to form 'new_str'
4246  *
4247  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4248  * otherwise returns 'new_str'
 *
 * In the code below, argument 0 (src_text) is 'orig_str', argument 1
 * (from_sub_text) is 'old_sub_str' and argument 2 (to_sub_text) is
 * 'new_sub_str'.  The source value itself, not a copy, is also returned
 * when the pattern does not occur in it at all.
 *
 * The result is accumulated in "str": for every match, the source bytes
 * skipped since the previous match are appended, followed by the
 * replacement; the bytes after the last match are appended at the end.
 *
 * NOTE(review): the function-name line, the declarations of "state" and
 * "str", and several statements (listing lines 4282, 4292 and 4312) are
 * not visible in this listing.
4249  */
4250 Datum
4252 {
4253  text *src_text = PG_GETARG_TEXT_PP(0);
4254  text *from_sub_text = PG_GETARG_TEXT_PP(1);
4255  text *to_sub_text = PG_GETARG_TEXT_PP(2);
4256  int src_text_len;
4257  int from_sub_text_len;
4259  text *ret_text;
4260  int chunk_len;
4261  char *curr_ptr;
4262  char *start_ptr;
4264  bool found;
4265 
4266  src_text_len = VARSIZE_ANY_EXHDR(src_text);
4267  from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4268 
4269  /* Return unmodified source string if empty source or pattern */
4270  if (src_text_len < 1 || from_sub_text_len < 1)
4271  {
4272  PG_RETURN_TEXT_P(src_text);
4273  }
4274 
4275  text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4276 
4277  found = text_position_next(&state);
4278 
4279  /* When the from_sub_text is not found, there is nothing to do. */
4280  if (!found)
4281  {
4283  PG_RETURN_TEXT_P(src_text);
4284  }
 /* curr_ptr: start of the current match; start_ptr: first byte not yet copied */
4285  curr_ptr = text_position_get_match_ptr(&state);
4286  start_ptr = VARDATA_ANY(src_text);
4287 
4288  initStringInfo(&str);
4289 
4290  do
4291  {
4293 
4294  /* copy the data skipped over by last text_position_next() */
4295  chunk_len = curr_ptr - start_ptr;
4296  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4297 
4298  appendStringInfoText(&str, to_sub_text);
4299 
 /* resume copying just past the matched text */
4300  start_ptr = curr_ptr + from_sub_text_len;
4301 
4302  found = text_position_next(&state);
4303  if (found)
4304  curr_ptr = text_position_get_match_ptr(&state);
4305  }
4306  while (found);
4307 
4308  /* copy trailing data */
 /* chunk_len = bytes from start_ptr to the end of the source value */
4309  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4310  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4311 
4313 
4314  ret_text = cstring_to_text_with_len(str.data, str.len);
4315  pfree(str.data);
4316 
4317  PG_RETURN_TEXT_P(ret_text);
4318 }
4319 
4320 /*
4321  * check_replace_text_has_escape
4322  *
4323  * Returns 0 if text contains no backslashes that need processing.
4324  * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4325  * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
 *
 * The text examined is "replace_text".  It is scanned with memchr from
 * one backslash to the next; the character following each backslash is
 * consumed together with it, so that the second backslash of a "\\" pair
 * is not taken as the start of a new escape.  A lone backslash at the very
 * end of the text does not change the result.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * not visible in this listing.
4326  */
4327 static int
4329 {
4330  int result = 0;
4331  const char *p = VARDATA_ANY(replace_text);
4332  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4333 
4334  while (p < p_end)
4335  {
4336  /* Find next escape char, if any. */
4337  p = memchr(p, '\\', p_end - p);
4338  if (p == NULL)
4339  break;
4340  p++; /* step over the backslash */
4341  /* Note: a backslash at the end doesn't require extra processing. */
4342  if (p < p_end)
4343  {
4344  if (*p >= '1' && *p <= '9')
4345  return 2; /* Found a submatch specifier, so done */
4346  result = 1; /* Found some other sequence, keep looking */
4347  p++; /* step over the escaped character as well */
4348  }
4349  }
4350  return result;
4351 }
4352 
4353 /*
4354  * appendStringInfoRegexpSubstr
4355  *
4356  * Append replace_text to str, substituting regexp back references for
4357  * \n escapes. start_ptr is the start of the match in the source string,
4358  * at logical character position data_pos.
 *
 * Escape sequences handled in replace_text:
 *   \1 .. \9  the text of the corresponding submatch, pmatch[1 .. 9]
 *   \&        the text of the whole match, pmatch[0]
 *   \\        a single backslash
 * A backslash followed by any other character, or standing at the very
 * end of replace_text, is copied to the output as an ordinary backslash.
 * A submatch whose start or end offset in pmatch is negative contributes
 * nothing to the output.
 *
 * NOTE(review): the first line of the signature (function name and leading
 * parameters) is not visible in this listing.
4359  */
4360 static void
4362  regmatch_t *pmatch,
4363  char *start_ptr, int data_pos)
4364 {
4365  const char *p = VARDATA_ANY(replace_text);
4366  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4367 
4368  while (p < p_end)
4369  {
4370  const char *chunk_start = p;
4371  int so;
4372  int eo;
4373 
4374  /* Find next escape char, if any. */
4375  p = memchr(p, '\\', p_end - p);
4376  if (p == NULL)
4377  p = p_end;
4378 
4379  /* Copy the text we just scanned over, if any. */
4380  if (p > chunk_start)
4381  appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4382 
4383  /* Done if at end of string, else advance over escape char. */
4384  if (p >= p_end)
4385  break;
4386  p++;
4387 
4388  if (p >= p_end)
4389  {
4390  /* Escape at very end of input. Treat same as unexpected char */
4391  appendStringInfoChar(str, '\\');
4392  break;
4393  }
4394 
4395  if (*p >= '1' && *p <= '9')
4396  {
4397  /* Use the back reference of regexp. */
4398  int idx = *p - '0';
4399 
4400  so = pmatch[idx].rm_so;
4401  eo = pmatch[idx].rm_eo;
4402  p++;
4403  }
4404  else if (*p == '&')
4405  {
4406  /* Use the entire matched string. */
4407  so = pmatch[0].rm_so;
4408  eo = pmatch[0].rm_eo;
4409  p++;
4410  }
4411  else if (*p == '\\')
4412  {
4413  /* \\ means transfer one \ to output. */
4414  appendStringInfoChar(str, '\\');
4415  p++;
4416  continue;
4417  }
4418  else
4419  {
4420  /*
4421  * If escape char is not followed by any expected char, just treat
4422  * it as ordinary data to copy. (XXX would it be better to throw
4423  * an error?)
 *
 * p is deliberately not advanced here, so the character after the
 * backslash is copied by the next iteration as ordinary text.
4424  */
4425  appendStringInfoChar(str, '\\');
4426  continue;
4427  }
4428 
 /* negative offsets are skipped: nothing is appended for them */
4429  if (so >= 0 && eo >= 0)
4430  {
4431  /*
4432  * Copy the text that is back reference of regexp. Note so and eo
4433  * are counted in characters not bytes.
4434  */
4435  char *chunk_start;
4436  int chunk_len;
4437 
4438  Assert(so >= data_pos);
4439  chunk_start = start_ptr;
 /* walk forward from start_ptr (character data_pos) to character so */
4440  chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4441  chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4442  appendBinaryStringInfo(str, chunk_start, chunk_len);
4443  }
4444  }
4445 }
4446 
4447 /*
4448  * replace_text_regexp
4449  *
4450  * replace substring(s) in src_text that match pattern with replace_text.
4451  * The replace_text can contain backslash markers to substitute
4452  * (parts of) the matched text.
4453  *
4454  * cflags: regexp compile flags.
4455  * collation: collation to use.
4456  * search_start: the character (not byte) offset in src_text at which to
4457  * begin searching.
4458  * n: if 0, replace all matches; if > 0, replace only the N'th match.
 *
 * Returns a new text value built from the accumulated output buffer.
 *
 * The source is converted to wide characters for matching, so match
 * offsets are character positions.  Two cursors track the part of the
 * source already copied to the output: data_pos (a character position)
 * and start_ptr (the corresponding byte pointer into src_text).  A regexp
 * execution failure other than "no match" raises
 * ERRCODE_INVALID_REGULAR_EXPRESSION.
 *
 * NOTE(review): the declaration of "buf", the assignment of escape_status
 * and several other statements (listing lines 4507, 4525, 4573 and 4576)
 * are not visible in this listing.
4459  */
4460 text *
4461 replace_text_regexp(text *src_text, text *pattern_text,
4462  text *replace_text,
4463  int cflags, Oid collation,
4464  int search_start, int n)
4465 {
4466  text *ret_text;
4467  regex_t *re;
4468  int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4469  int nmatches = 0;
4471  regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
4472  int nmatch = lengthof(pmatch);
4473  pg_wchar *data;
4474  size_t data_len;
4475  int data_pos;
4476  char *start_ptr;
4477  int escape_status;
4478 
4479  initStringInfo(&buf);
4480 
4481  /* Convert data string to wide characters. */
 /* sized for one wide character per source byte, plus one extra slot */
4482  data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4483  data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4484 
4485  /* Check whether replace_text has escapes, especially regexp submatches. */
4487 
4488  /* If no regexp submatches, we can use REG_NOSUB. */
4489  if (escape_status < 2)
4490  {
4491  cflags |= REG_NOSUB;
4492  /* Also tell pg_regexec we only want the whole-match location. */
4493  nmatch = 1;
4494  }
4495 
4496  /* Prepare the regexp. */
4497  re = RE_compile_and_cache(pattern_text, cflags, collation);
4498 
4499  /* start_ptr points to the data_pos'th character of src_text */
4500  start_ptr = (char *) VARDATA_ANY(src_text);
4501  data_pos = 0;
4502 
4503  while (search_start <= data_len)
4504  {
4505  int regexec_result;
4506 
4508 
4509  regexec_result = pg_regexec(re,
4510  data,
4511  data_len,
4512  search_start,
4513  NULL, /* no details */
4514  nmatch,
4515  pmatch,
4516  0);
4517 
4518  if (regexec_result == REG_NOMATCH)
4519  break;
4520 
4521  if (regexec_result != REG_OKAY)
4522  {
4523  char errMsg[100];
4524 
4526  pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4527  ereport(ERROR,
4528  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4529  errmsg("regular expression failed: %s", errMsg)));
4530  }
4531 
4532  /*
4533  * Count matches, and decide whether to replace this match.
4534  */
4535  nmatches++;
4536  if (n > 0 && nmatches != n)
4537  {
4538  /*
4539  * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4540  * we treat the matched text as if it weren't matched, and copy it
4541  * to the output later.)
4542  */
4543  search_start = pmatch[0].rm_eo;
 /* step past a zero-length match so the same match is not found again */
4544  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4545  search_start++;
4546  continue;
4547  }
4548 
4549  /*
4550  * Copy the text to the left of the match position. Note we are given
4551  * character not byte indexes.
4552  */
4553  if (pmatch[0].rm_so - data_pos > 0)
4554  {
4555  int chunk_len;
4556 
4557  chunk_len = charlen_to_bytelen(start_ptr,
4558  pmatch[0].rm_so - data_pos);
4559  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4560 
4561  /*
4562  * Advance start_ptr over that text, to avoid multiple rescans of
4563  * it if the replace_text contains multiple back-references.
4564  */
4565  start_ptr += chunk_len;
4566  data_pos = pmatch[0].rm_so;
4567  }
4568 
4569  /*
4570  * Copy the replace_text, processing escapes if any are present.
4571  */
4572  if (escape_status > 0)
4574  start_ptr, data_pos);
4575  else
4577 
4578  /* Advance start_ptr and data_pos over the matched text. */
4579  start_ptr += charlen_to_bytelen(start_ptr,
4580  pmatch[0].rm_eo - data_pos);
4581  data_pos = pmatch[0].rm_eo;
4582 
4583  /*
4584  * If we only want to replace one occurrence, we're done.
4585  */
4586  if (n > 0)
4587  break;
4588 
4589  /*
4590  * Advance search position. Normally we start the next search at the
4591  * end of the previous match; but if the match was of zero length, we
4592  * have to advance by one character, or we'd just find the same match
4593  * again.
4594  */
4595  search_start = data_pos;
4596  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4597  search_start++;
4598  }
4599 
4600  /*
4601  * Copy the text to the right of the last match.
4602  */
4603  if (data_pos < data_len)
4604  {
4605  int chunk_len;
4606 
 /* everything from start_ptr to the end of the source value */
4607  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4608  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4609  }
4610 
4611  ret_text = cstring_to_text_with_len(buf.data, buf.len);
4612  pfree(buf.data);
4613  pfree(data);
4614 
4615  return ret_text;
4616 }
4617 
4618 /*
4619  * split_part
4620  * parse input string based on provided field separator
4621  * return N'th item (1 based, negative counts from end)
4622  */
4623 Datum
4625 {
4626  text *inputstring = PG_GETARG_TEXT_PP(0);
4627  text *fldsep = PG_GETARG_TEXT_PP(1);
4628  int fldnum = PG_GETARG_INT32(2);
4629  int inputstring_len;
4630  int fldsep_len;
4632  char *start_ptr;
4633  char *end_ptr;
4634  text *result_text;
4635  bool found;
4636 
4637  /* field number is 1 based */
4638  if (fldnum == 0)
4639  ereport(ERROR,
4640  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4641  errmsg("field position must not be zero")));
4642 
4643  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4644  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4645 
4646  /* return empty string for empty input string */
4647  if (inputstring_len < 1)
4649 
4650  /* handle empty field separator */
4651  if (fldsep_len < 1)
4652  {
4653  /* if first or last field, return input string, else empty string */
4654  if (fldnum == 1 || fldnum == -1)
4655  PG_RETURN_TEXT_P(inputstring);
4656  else
4658  }
4659 
4660  /* find the first field separator */
4661  text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4662 
4663  found = text_position_next(&state);
4664 
4665  /* special case if fldsep not found at all */
4666  if (!found)
4667  {
4669  /* if first or last field, return input string, else empty string */
4670  if (fldnum == 1 || fldnum == -1)
4671  PG_RETURN_TEXT_P(inputstring);
4672  else
4674  }
4675 
4676  /*
4677  * take care of a negative field number (i.e. count from the right) by
4678  * converting to a positive field number; we need total number of fields
4679  */
4680  if (fldnum < 0)
4681  {
4682  /* we found a fldsep, so there are at least two fields */
4683  int numfields = 2;
4684 
4685  while (text_position_next(&state))
4686  numfields++;
4687 
4688  /* special case of last field does not require an extra pass */
4689  if (fldnum == -1)
4690  {
4691  start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4692  end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4695  end_ptr - start_ptr));
4696  }
4697 
4698  /* else, convert fldnum to positive notation */
4699  fldnum += numfields + 1;
4700 
4701  /* if nonexistent field, return empty string */
4702  if (fldnum <= 0)
4703  {
4706  }
4707 
4708  /* reset to pointing at first match, but now with positive fldnum */
4710  found = text_position_next(&state);
4711  Assert(found);
4712  }
4713 
4714  /* identify bounds of first field */
4715  start_ptr = VARDATA_ANY(inputstring);
4716  end_ptr = text_position_get_match_ptr(&state);
4717 
4718  while (found && --fldnum > 0)
4719  {
4720  /* identify bounds of next field */
4721  start_ptr = end_ptr + fldsep_len;
4722  found = text_position_next(&state);
4723  if (found)
4724  end_ptr = text_position_get_match_ptr(&state);
4725  }
4726 
4728 
4729  if (fldnum > 0)
4730  {
4731  /* N'th field separator not found */
4732  /* if last field requested, return it, else empty string */
4733  if (fldnum == 1)
4734  {
4735  int last_len = start_ptr - VARDATA_ANY(inputstring);
4736 
4737  result_text = cstring_to_text_with_len(start_ptr,
4738  inputstring_len - last_len);
4739  }
4740  else
4741  result_text = cstring_to_text("");
4742  }
4743  else
4744  {
4745  /* non-last field requested */
4746  result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4747  }
4748 
4749  PG_RETURN_TEXT_P(result_text);
4750 }
4751 
4752 /*
4753  * Convenience function to return true when two text params are equal.
4754  */
4755 static bool
4756 text_isequal(text *txt1, text *txt2, Oid collid)
4757 {
4759  collid,
4760  PointerGetDatum(txt1),
4761  PointerGetDatum(txt2)));
4762 }
4763 
4764 /*
4765  * text_to_array
4766  * parse input string and return text array of elements,
4767  * based on provided field separator
4768  */
4769 Datum
4771 {
4772  SplitTextOutputData tstate;
4773 
4774  /* For array output, tstate should start as all zeroes */
4775  memset(&tstate, 0, sizeof(tstate));
4776 
4777  if (!split_text(fcinfo, &tstate))
4778  PG_RETURN_NULL();
4779 
4780  if (tstate.astate == NULL)
4782 
4785 }
4786 
4787 /*
4788  * text_to_array_null
4789  * parse input string and return text array of elements,
4790  * based on provided field separator and null string
4791  *
4792  * This is a separate entry point only to prevent the regression tests from
4793  * complaining about different argument sets for the same internal function.
4794  */
4795 Datum
4797 {
4798  return text_to_array(fcinfo);
4799 }
4800 
4801 /*
4802  * text_to_table
4803  * parse input string and return table of elements,
4804  * based on provided field separator
4805  */
4806 Datum
4808 {
4809  ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4810  SplitTextOutputData tstate;
4811 
4812  tstate.astate = NULL;
4814  tstate.tupstore = rsi->setResult;
4815  tstate.tupdesc = rsi->setDesc;
4816 
4817  (void) split_text(fcinfo, &tstate);
4818 
4819  return (Datum) 0;
4820 }
4821 
4822 /*
4823  * text_to_table_null
4824  * parse input string and return table of elements,
4825  * based on provided field separator and null string
4826  *
4827  * This is a separate entry point only to prevent the regression tests from
4828  * complaining about different argument sets for the same internal function.
4829  */
4830 Datum
4832 {
4833  return text_to_table(fcinfo);
4834 }
4835 
4836 /*
4837  * Common code for text_to_array, text_to_array_null, text_to_table
4838  * and text_to_table_null functions.
4839  *
4840  * These are not strict so we have to test for null inputs explicitly.
4841  * Returns false if result is to be null, else returns true.
4842  *
4843  * Note that if the result is valid but empty (zero elements), we return
4844  * without changing *tstate --- caller must handle that case, too.
4845  */
4846 static bool
4848 {
4849  text *inputstring;
4850  text *fldsep;
4851  text *null_string;
4852  Oid collation = PG_GET_COLLATION();
4853  int inputstring_len;
4854  int fldsep_len;
4855  char *start_ptr;
4856  text *result_text;
4857 
4858  /* when input string is NULL, then result is NULL too */
4859  if (PG_ARGISNULL(0))
4860  return false;
4861 
4862  inputstring = PG_GETARG_TEXT_PP(0);
4863 
4864  /* fldsep can be NULL */
4865  if (!PG_ARGISNULL(1))
4866  fldsep = PG_GETARG_TEXT_PP(1);
4867  else
4868  fldsep = NULL;
4869 
4870  /* null_string can be NULL or omitted */
4871  if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4872  null_string = PG_GETARG_TEXT_PP(2);
4873  else
4874  null_string = NULL;
4875 
4876  if (fldsep != NULL)
4877  {
4878  /*
4879  * Normal case with non-null fldsep. Use the text_position machinery
4880  * to search for occurrences of fldsep.
4881  */
4883 
4884  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4885  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4886 
4887  /* return empty set for empty input string */
4888  if (inputstring_len < 1)
4889  return true;
4890 
4891  /* empty field separator: return input string as a one-element set */
4892  if (fldsep_len < 1)
4893  {
4894  split_text_accum_result(tstate, inputstring,
4895  null_string, collation);
4896  return true;
4897  }
4898 
4899  text_position_setup(inputstring, fldsep, collation, &state);
4900 
4901  start_ptr = VARDATA_ANY(inputstring);
4902 
4903  for (;;)
4904  {
4905  bool found;
4906  char *end_ptr;
4907  int chunk_len;
4908 
4910 
4911  found = text_position_next(&state);
4912  if (!found)
4913  {
4914  /* fetch last field */
4915  chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4916  end_ptr = NULL; /* not used, but some compilers complain */
4917  }
4918  else
4919  {
4920  /* fetch non-last field */
4921  end_ptr = text_position_get_match_ptr(&state);
4922  chunk_len = end_ptr - start_ptr;
4923  }
4924 
4925  /* build a temp text datum to pass to split_text_accum_result */
4926  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4927 
4928  /* stash away this field */
4929  split_text_accum_result(tstate, result_text,
4930  null_string, collation);
4931 
4932  pfree(result_text);
4933 
4934  if (!found)
4935  break;
4936 
4937  start_ptr = end_ptr + fldsep_len;
4938  }
4939 
4941  }
4942  else
4943  {
4944  /*
4945  * When fldsep is NULL, each character in the input string becomes a
4946  * separate element in the result set. The separator is effectively
4947  * the space between characters.
4948  */
4949  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4950 
4951  start_ptr = VARDATA_ANY(inputstring);
4952 
4953  while (inputstring_len > 0)
4954  {
4955  int chunk_len = pg_mblen(start_ptr);
4956 
4958 
4959  /* build a temp text datum to pass to split_text_accum_result */
4960  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4961 
4962  /* stash away this field */
4963  split_text_accum_result(tstate, result_text,
4964  null_string, collation);
4965 
4966  pfree(result_text);
4967 
4968  start_ptr += chunk_len;
4969  inputstring_len -= chunk_len;
4970  }
4971  }
4972 
4973  return true;
4974 }
4975 
4976 /*
4977  * Add text item to result set (table or array).
4978  *
4979  * This is also responsible for checking to see if the item matches
4980  * the null_string, in which case we should emit NULL instead.
4981  */
4982 static void
4984  text *field_value,
4985  text *null_string,
4986  Oid collation)
4987 {
4988  bool is_null = false;
4989 
4990  if (null_string && text_isequal(field_value, null_string, collation))
4991  is_null = true;
4992 
4993  if (tstate->tupstore)
4994  {
4995  Datum values[1];
4996  bool nulls[1];
4997 
4998  values[0] = PointerGetDatum(field_value);
4999  nulls[0] = is_null;
5000 
5002  tstate->tupdesc,
5003  values,
5004  nulls);
5005  }
5006  else
5007  {
5008  tstate->astate = accumArrayResult(tstate->astate,
5009  PointerGetDatum(field_value),
5010  is_null,
5011  TEXTOID,
5013  }
5014 }
5015 
5016 /*
5017  * array_to_text
5018  * concatenate Cstring representation of input array elements
5019  * using provided field separator
5020  */
5021 Datum
5023 {
5025  char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5026 
5027  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
5028 }
5029 
5030 /*
5031  * array_to_text_null
5032  * concatenate Cstring representation of input array elements
5033  * using provided field separator and null string
5034  *
5035  * This version is not strict so we have to test for null inputs explicitly.
5036  */
5037 Datum
5039 {
5040  ArrayType *v;
5041  char *fldsep;
5042  char *null_string;
5043 
5044  /* returns NULL when first or second parameter is NULL */
5045  if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
5046  PG_RETURN_NULL();
5047 
5048  v = PG_GETARG_ARRAYTYPE_P(0);
5049  fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5050 
5051  /* NULL null string is passed through as a null pointer */
5052  if (!PG_ARGISNULL(2))
5053  null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5054  else
5055  null_string = NULL;
5056 
5057  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5058 }
5059 
5060 /*
5061  * common code for array_to_text and array_to_text_null functions
5062  */
5063 static text *
5065  const char *fldsep, const char *null_string)
5066 {
5067  text *result;
5068  int nitems,
5069  *dims,
5070  ndims;
5071  Oid element_type;
5072  int typlen;
5073  bool typbyval;
5074  char typalign;
5076  bool printed = false;
5077  char *p;
5078  bits8 *bitmap;
5079  int bitmask;
5080  int i;
5081  ArrayMetaState *my_extra;
5082 
5083  ndims = ARR_NDIM(v);
5084  dims = ARR_DIMS(v);
5085  nitems = ArrayGetNItems(ndims, dims);
5086 
5087  /* if there are no elements, return an empty string */
5088  if (nitems == 0)
5089  return cstring_to_text_with_len("", 0);
5090 
5091  element_type = ARR_ELEMTYPE(v);
5092  initStringInfo(&buf);
5093 
5094  /*
5095  * We arrange to look up info about element type, including its output
5096  * conversion proc, only once per series of calls, assuming the element
5097  * type doesn't change underneath us.
5098  */
5099  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5100  if (my_extra == NULL)
5101  {
5102  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5103  sizeof(ArrayMetaState));
5104  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5105  my_extra->element_type = ~element_type;
5106  }
5107 
5108  if (my_extra->element_type != element_type)
5109  {
5110  /*
5111  * Get info about element type, including its output conversion proc
5112  */
5113  get_type_io_data(element_type, IOFunc_output,
5114  &my_extra->typlen, &my_extra->typbyval,
5115  &my_extra->typalign, &my_extra->typdelim,
5116  &my_extra->typioparam, &my_extra->typiofunc);
5117  fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5118  fcinfo->flinfo->fn_mcxt);
5119  my_extra->element_type = element_type;
5120  }
5121  typlen = my_extra->typlen;
5122  typbyval = my_extra->typbyval;
5123  typalign = my_extra->typalign;
5124 
5125  p = ARR_DATA_PTR(v);
5126  bitmap = ARR_NULLBITMAP(v);
5127  bitmask = 1;
5128 
5129  for (i = 0; i < nitems; i++)
5130  {
5131  Datum itemvalue;
5132  char *value;
5133 
5134  /* Get source element, checking for NULL */
5135  if (bitmap && (*bitmap & bitmask) == 0)
5136  {
5137  /* if null_string is NULL, we just ignore null elements */
5138  if (null_string != NULL)
5139  {
5140  if (printed)
5141  appendStringInfo(&buf, "%s%s", fldsep, null_string);
5142  else
5143  appendStringInfoString(&buf, null_string);
5144  printed = true;
5145  }
5146  }
5147  else
5148  {
5149  itemvalue = fetch_att(p, typbyval, typlen);
5150 
5151  value = OutputFunctionCall(&my_extra->proc, itemvalue);
5152 
5153  if (printed)
5154  appendStringInfo(&buf, "%s%s", fldsep, value);
5155  else
5157  printed = true;
5158 
5159  p = att_addlength_pointer(p, typlen, p);
5160  p = (char *) att_align_nominal(p, typalign);
5161  }
5162 
5163  /* advance bitmap pointer if any */
5164  if (bitmap)
5165  {
5166  bitmask <<= 1;
5167  if (bitmask == 0x100)
5168  {
5169  bitmap++;
5170  bitmask = 1;
5171  }
5172  }
5173  }
5174 
5175  result = cstring_to_text_with_len(buf.data, buf.len);
5176  pfree(buf.data);
5177 
5178  return result;
5179 }
5180 
5181 #define HEXBASE 16
5182 /*
5183  * Convert an int32 to a string containing a base 16 (hex) representation of
5184  * the number.
5185  */
5186 Datum
5188 {
5190  char *ptr;
5191  const char *digits = "0123456789abcdef";
5192  char buf[32]; /* bigger than needed, but reasonable */
5193 
5194  ptr = buf + sizeof(buf) - 1;
5195  *ptr = '\0';
5196 
5197  do
5198  {
5199  *--ptr = digits[value % HEXBASE];
5200  value /= HEXBASE;
5201  } while (ptr > buf && value);
5202 
5204 }
5205 
5206 /*
5207  * Convert an int64 to a string containing a base 16 (hex) representation of
5208  * the number.
5209  */
5210 Datum
5212 {
5213  uint64 value = (uint64) PG_GETARG_INT64(0);
5214  char *ptr;
5215  const char *digits = "0123456789abcdef";
5216  char buf[32]; /* bigger than needed, but reasonable */
5217 
5218  ptr = buf + sizeof(buf) - 1;
5219  *ptr = '\0';
5220 
5221  do
5222  {
5223  *--ptr = digits[value % HEXBASE];
5224  value /= HEXBASE;
5225  } while (ptr > buf && value);
5226 
5228 }
5229 
5230 /*
5231  * Return the size of a datum, possibly compressed
5232  *
5233  * Works on any data type
5234  */
5235 Datum
5237 {
5239  int32 result;
5240  int typlen;
5241 
5242  /* On first call, get the input type's typlen, and save at *fn_extra */
5243  if (fcinfo->flinfo->fn_extra == NULL)
5244  {
5245  /* Lookup the datatype of the supplied argument */
5246  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5247 
5248  typlen = get_typlen(argtypeid);
5249  if (typlen == 0) /* should not happen */
5250  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5251 
5252  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5253  sizeof(int));
5254  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5255  }
5256  else
5257  typlen = *((int *) fcinfo->flinfo->fn_extra);
5258 
5259  if (typlen == -1)
5260  {
5261  /* varlena type, possibly toasted */
5262  result = toast_datum_size(value);
5263  }
5264  else if (typlen == -2)
5265  {
5266  /* cstring */
5267  result = strlen(DatumGetCString(value)) + 1;
5268  }
5269  else
5270  {
5271  /* ordinary fixed-width type */
5272  result = typlen;
5273  }
5274 
5275  PG_RETURN_INT32(result);
5276 }
5277 
5278 /*
5279  * Return the compression method stored in the compressed attribute. Return
5280  * NULL for non varlena type or uncompressed data.
5281  */
5282 Datum
5284 {
5285  int typlen;
5286  char *result;
5287  ToastCompressionId cmid;
5288 
5289  /* On first call, get the input type's typlen, and save at *fn_extra */
5290  if (fcinfo->flinfo->fn_extra == NULL)
5291  {
5292  /* Lookup the datatype of the supplied argument */
5293  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5294 
5295  typlen = get_typlen(argtypeid);
5296  if (typlen == 0) /* should not happen */
5297  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5298 
5299  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5300  sizeof(int));
5301  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5302  }
5303  else
5304  typlen = *((int *) fcinfo->flinfo->fn_extra);
5305 
5306  if (typlen != -1)
5307  PG_RETURN_NULL();
5308 
5309  /* get the compression method id stored in the compressed varlena */
5310  cmid = toast_get_compression_id((struct varlena *)
5312  if (cmid == TOAST_INVALID_COMPRESSION_ID)
5313  PG_RETURN_NULL();
5314 
5315  /* convert compression method id to compression method name */
5316  switch (cmid)
5317  {
5319  result = "pglz";
5320  break;
5322  result = "lz4";
5323  break;
5324  default:
5325  elog(ERROR, "invalid compression method id %d", cmid);
5326  }
5327 
5329 }
5330 
5331 /*
5332  * string_agg - Concatenates values and returns string.
5333  *
5334  * Syntax: string_agg(value text, delimiter text) RETURNS text
5335  *
5336  * Note: Any NULL values are ignored. The first-call delimiter isn't
5337  * actually used at all, and on subsequent calls the delimiter precedes
5338  * the associated value.
5339  */
5340 
5341 /* subroutine to initialize state */
5342 static StringInfo
5344 {
5345  StringInfo state;
5346  MemoryContext aggcontext;
5347  MemoryContext oldcontext;
5348 
5349  if (!AggCheckCallContext(fcinfo, &aggcontext))
5350  {
5351  /* cannot be called directly because of internal-type argument */
5352  elog(ERROR, "string_agg_transfn called in non-aggregate context");
5353  }
5354 
5355  /*
5356  * Create state in aggregate context. It'll stay there across subsequent
5357  * calls.
5358  */
5359  oldcontext = MemoryContextSwitchTo(aggcontext);
5360  state = makeStringInfo();
5361  MemoryContextSwitchTo(oldcontext);
5362 
5363  return state;
5364 }
5365 
5366 Datum
5368 {
5369  StringInfo state;
5370 
5371  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5372 
5373  /* Append the value unless null. */
5374  if (!PG_ARGISNULL(1))
5375  {
5376  /* On the first time through, we ignore the delimiter. */
5377  if (state == NULL)
5378  state = makeStringAggState(fcinfo);
5379  else if (!PG_ARGISNULL(2))
5380  appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
5381 
5382  appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
5383  }
5384 
5385  /*
5386  * The transition type for string_agg() is declared to be "internal",
5387  * which is a pass-by-value type the same size as a pointer.
5388  */
5390 }
5391 
5392 Datum
5394 {
5395  StringInfo state;
5396 
5397  /* cannot be called directly because of internal-type argument */
5398  Assert(AggCheckCallContext(fcinfo, NULL));
5399 
5400  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5401 
5402  if (state != NULL)
5404  else
5405  PG_RETURN_NULL();
5406 }
5407 
5408 /*
5409  * Prepare cache with fmgr info for the output functions of the datatypes of
5410  * the arguments of a concat-like function, beginning with argument "argidx".
5411  * (Arguments before that will have corresponding slots in the resulting
5412  * FmgrInfo array, but we don't fill those slots.)
5413  */
5414 static FmgrInfo *
5416 {
5417  FmgrInfo *foutcache;
5418  int i;
5419 
5420  /* We keep the info in fn_mcxt so it survives across calls */
5421  foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5422  PG_NARGS() * sizeof(FmgrInfo));
5423 
5424  for (i = argidx; i < PG_NARGS(); i++)
5425  {
5426  Oid valtype;
5427  Oid typOutput;
5428  bool typIsVarlena;
5429 
5430  valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5431  if (!OidIsValid(valtype))
5432  elog(ERROR, "could not determine data type of concat() input");
5433 
5434  getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5435  fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5436  }
5437 
5438  fcinfo->flinfo->fn_extra = foutcache;
5439 
5440  return foutcache;
5441 }
5442 
5443 /*
5444  * Implementation of both concat() and concat_ws().
5445  *
5446  * sepstr is the separator string to place between values.
5447  * argidx identifies the first argument to concatenate (counting from zero);
5448  * note that this must be constant across any one series of calls.
5449  *
5450  * Returns NULL if result should be NULL, else text value.
5451  */
5452 static text *
5453 concat_internal(const char *sepstr, int argidx,
5454  FunctionCallInfo fcinfo)
5455 {
5456  text *result;
5458  FmgrInfo *foutcache;
5459  bool first_arg = true;
5460  int i;
5461 
5462  /*
5463  * concat(VARIADIC some-array) is essentially equivalent to
5464  * array_to_text(), ie concat the array elements with the given separator.
5465  * So we just pass the case off to that code.
5466  */
5467  if (get_fn_expr_variadic(fcinfo->flinfo))
5468  {
5469  ArrayType *arr;
5470 
5471  /* Should have just the one argument */
5472  Assert(argidx == PG_NARGS() - 1);
5473 
5474  /* concat(VARIADIC NULL) is defined as NULL */
5475  if (PG_ARGISNULL(argidx))
5476  return NULL;
5477 
5478  /*
5479  * Non-null argument had better be an array. We assume that any call
5480  * context that could let get_fn_expr_variadic return true will have
5481  * checked that a VARIADIC-labeled parameter actually is an array. So
5482  * it should be okay to just Assert that it's an array rather than
5483  * doing a full-fledged error check.
5484  */
5486 
5487  /* OK, safe to fetch the array value */
5488  arr = PG_GETARG_ARRAYTYPE_P(argidx);
5489 
5490  /*
5491  * And serialize the array. We tell array_to_text to ignore null
5492  * elements, which matches the behavior of the loop below.
5493  */
5494  return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5495  }
5496 
5497  /* Normal case without explicit VARIADIC marker */
5498  initStringInfo(&str);
5499 
5500  /* Get output function info, building it if first time through */
5501  foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5502  if (foutcache == NULL)
5503  foutcache = build_concat_foutcache(fcinfo, argidx);
5504 
5505  for (i = argidx; i < PG_NARGS(); i++)
5506  {
5507  if (!PG_ARGISNULL(i))
5508  {
5510 
5511  /* add separator if appropriate */
5512  if (first_arg)
5513  first_arg = false;
5514  else
5515  appendStringInfoString(&str, sepstr);
5516 
5517  /* call the appropriate type output function, append the result */
5519  OutputFunctionCall(&foutcache[i], value));
5520  }
5521  }
5522 
5523  result = cstring_to_text_with_len(str.data, str.len);
5524  pfree(str.data);
5525 
5526  return result;
5527 }
5528 
5529 /*
5530  * Concatenate all arguments. NULL arguments are ignored.
5531  */
5532 Datum
5534 {
5535  text *result;
5536 
5537  result = concat_internal("", 0, fcinfo);
5538  if (result == NULL)
5539  PG_RETURN_NULL();
5540  PG_RETURN_TEXT_P(result);
5541 }
5542 
5543 /*
5544  * Concatenate all but first argument value with separators. The first
5545  * parameter is used as the separator. NULL arguments are ignored.
5546  */
5547 Datum
5549 {
5550  char *sep;
5551  text *result;
5552 
5553  /* return NULL when separator is NULL */
5554  if (PG_ARGISNULL(0))
5555  PG_RETURN_NULL();
5557 
5558  result = concat_internal(sep, 1, fcinfo);
5559  if (result == NULL)
5560  PG_RETURN_NULL();
5561  PG_RETURN_TEXT_P(result);
5562 }
5563 
5564 /*
5565  * Return first n characters in the string. When n is negative,
5566  * return all but last |n| characters.
5567  */
5568 Datum
5570 {
5571  int n = PG_GETARG_INT32(1);
5572 
5573  if (n < 0)
5574  {
5575  text *str = PG_GETARG_TEXT_PP(0);
5576  const char *p = VARDATA_ANY(str);
5577  int len = VARSIZE_ANY_EXHDR(str);
5578  int rlen;
5579 
5580  n = pg_mbstrlen_with_len(p, len) + n;
5581  rlen = pg_mbcharcliplen(p, len, n);
5583  }
5584  else
5586 }
5587 
5588 /*
5589  * Return last n characters in the string. When n is negative,
5590  * return all but first |n| characters.
5591  */
5592 Datum
5594 {
5595  text *str = PG_GETARG_TEXT_PP(0);
5596  const char *p = VARDATA_ANY(str);
5597  int len = VARSIZE_ANY_EXHDR(str);
5598  int n = PG_GETARG_INT32(1);
5599  int off;
5600 
5601  if (n < 0)
5602  n = -n;
5603  else
5604  n = pg_mbstrlen_with_len(p, len) - n;
5605  off = pg_mbcharcliplen(p, len, n);
5606 
5608 }
5609 
5610 /*
5611  * Return reversed string
5612  */
5613 Datum
5615 {
5616  text *str = PG_GETARG_TEXT_PP(0);
5617  const char *p = VARDATA_ANY(str);
5618  int len = VARSIZE_ANY_EXHDR(str);
5619  const char *endp = p + len;
5620  text *result;
5621  char *dst;
5622 
5623  result = palloc(len + VARHDRSZ);
5624  dst = (char *) VARDATA(result) + len;
5625  SET_VARSIZE(result, len + VARHDRSZ);
5626 
5628  {
5629  /* multibyte version */
5630  while (p < endp)
5631  {
5632  int sz;
5633 
5634  sz = pg_mblen(p);
5635  dst -= sz;
5636  memcpy(dst, p, sz);
5637  p += sz;
5638  }
5639  }
5640  else
5641  {
5642  /* single byte version */
5643  while (p < endp)
5644  *(--dst) = *p++;
5645  }
5646 
5647  PG_RETURN_TEXT_P(result);
5648 }
5649 
5650 
/*
 * Support macros for text_format()
 */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * Step "ptr" to the next byte of the format string, raising an error if
 * that reaches "end_ptr" (i.e. the specifier is cut off by end of string).
 * Note that "ptr" is modified in place.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5664 
5665 /*
5666  * Returns a formatted string
5667  */
5668 Datum
5670 {
5671  text *fmt;
5673  const char *cp;
5674  const char *start_ptr;
5675  const char *end_ptr;
5676  text *result;
5677  int arg;
5678  bool funcvariadic;
5679  int nargs;
5680  Datum *elements = NULL;
5681  bool *nulls = NULL;
5682  Oid element_type = InvalidOid;
5683  Oid prev_type = InvalidOid;
5684  Oid prev_width_type = InvalidOid;
5685  FmgrInfo typoutputfinfo;
5686  FmgrInfo typoutputinfo_width;
5687 
5688  /* When format string is null, immediately return null */
5689  if (PG_ARGISNULL(0))
5690  PG_RETURN_NULL();
5691 
5692  /* If argument is marked VARIADIC, expand array into elements */
5693  if (get_fn_expr_variadic(fcinfo->flinfo))
5694  {
5695  ArrayType *arr;
5696  int16 elmlen;
5697  bool elmbyval;
5698  char elmalign;
5699  int nitems;
5700 
5701  /* Should have just the one argument */
5702  Assert(PG_NARGS() == 2);
5703 
5704  /* If argument is NULL, we treat it as zero-length array */
5705  if (PG_ARGISNULL(1))
5706  nitems = 0;
5707  else
5708  {
5709  /*
5710  * Non-null argument had better be an array. We assume that any
5711  * call context that could let get_fn_expr_variadic return true
5712  * will have checked that a VARIADIC-labeled parameter actually is
5713  * an array. So it should be okay to just Assert that it's an
5714  * array rather than doing a full-fledged error check.
5715  */
5717 
5718  /* OK, safe to fetch the array value */
5719  arr = PG_GETARG_ARRAYTYPE_P(1);
5720 
5721  /* Get info about array element type */
5722  element_type = ARR_ELEMTYPE(arr);
5723  get_typlenbyvalalign(element_type,
5724  &elmlen, &elmbyval, &elmalign);
5725 
5726  /* Extract all array elements */
5727  deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5728  &elements, &nulls, &nitems);
5729  }
5730 
5731  nargs = nitems + 1;
5732  funcvariadic = true;
5733  }
5734  else
5735  {
5736  /* Non-variadic case, we'll process the arguments individually */
5737  nargs = PG_NARGS();
5738  funcvariadic = false;
5739  }
5740 
5741  /* Setup for main loop. */
5742  fmt = PG_GETARG_TEXT_PP(0);
5743  start_ptr = VARDATA_ANY(fmt);
5744  end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5745  initStringInfo(&str);
5746  arg = 1; /* next argument position to print */
5747 
5748  /* Scan format string, looking for conversion specifiers. */
5749  for (cp = start_ptr; cp < end_ptr; cp++)
5750  {
5751  int argpos;
5752  int widthpos;
5753  int flags;
5754  int width;
5755  Datum value;
5756  bool isNull;
5757  Oid typid;
5758 
5759  /*
5760  * If it's not the start of a conversion specifier, just copy it to
5761  * the output buffer.
5762  */
5763  if (*cp != '%')
5764  {
5766  continue;
5767  }
5768 
5769  ADVANCE_PARSE_POINTER(cp, end_ptr);
5770 
5771  /* Easy case: %% outputs a single % */
5772  if (*cp == '%')
5773  {
5775  continue;
5776  }
5777 
5778  /* Parse the optional portions of the format specifier */
5779  cp = text_format_parse_format(cp, end_ptr,
5780  &argpos, &widthpos,
5781  &flags, &width);
5782 
5783  /*
5784  * Next we should see the main conversion specifier. Whether or not
5785  * an argument position was present, it's known that at least one
5786  * character remains in the string at this point. Experience suggests
5787  * that it's worth checking that that character is one of the expected
5788  * ones before we try to fetch arguments, so as to produce the least
5789  * confusing response to a mis-formatted specifier.
5790  */
5791  if (strchr("sIL", *cp) == NULL)
5792  ereport(ERROR,
5793  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5794  errmsg("unrecognized format() type specifier \"%.*s\"",
5795  pg_mblen(cp), cp),
5796  errhint("For a single \"%%\" use \"%%%%\".")));
5797 
5798  /* If indirect width was specified, get its value */
5799  if (widthpos >= 0)
5800  {
5801  /* Collect the specified or next argument position */
5802  if (widthpos > 0)
5803  arg = widthpos;
5804  if (arg >= nargs)
5805  ereport(ERROR,
5806  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5807  errmsg("too few arguments for format()")));
5808 
5809  /* Get the value and type of the selected argument */
5810  if (!funcvariadic)
5811  {
5813  isNull = PG_ARGISNULL(arg);
5814  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5815  }
5816  else
5817  {
5818  value = elements[arg - 1];
5819  isNull = nulls[arg - 1];
5820  typid = element_type;
5821  }
5822  if (!OidIsValid(typid))
5823  elog(ERROR, "could not determine data type of format() input");
5824 
5825  arg++;
5826 
5827  /* We can treat NULL width the same as zero */
5828  if (isNull)
5829  width = 0;
5830  else if (typid == INT4OID)
5831  width = DatumGetInt32(value);
5832  else if (typid == INT2OID)
5833  width = DatumGetInt16(value);
5834  else
5835  {
5836  /* For less-usual datatypes, convert to text then to int */
5837  char *str;
5838 
5839  if (typid != prev_width_type)
5840  {
5841  Oid typoutputfunc;
5842  bool typIsVarlena;
5843 
5844  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5845  fmgr_info(typoutputfunc, &typoutputinfo_width);
5846  prev_width_type = typid;
5847  }
5848 
5849  str = OutputFunctionCall(&typoutputinfo_width, value);
5850 
5851  /* pg_strtoint32 will complain about bad data or overflow */
5852  width = pg_strtoint32(str);
5853 
5854  pfree(str);
5855  }
5856  }
5857 
5858  /* Collect the specified or next argument position */
5859  if (argpos > 0)
5860  arg = argpos;
5861  if (arg >= nargs)
5862  ereport(ERROR,
5863  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5864  errmsg("too few arguments for format()")));
5865 
5866  /* Get the value and type of the selected argument */
5867  if (!funcvariadic)
5868  {
5870  isNull = PG_ARGISNULL(arg);
5871  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5872  }
5873  else
5874  {
5875  value = elements[arg - 1];
5876  isNull = nulls[arg - 1];
5877  typid = element_type;
5878  }
5879  if (!OidIsValid(typid))
5880  elog(ERROR, "could not determine data type of format() input");
5881 
5882  arg++;
5883 
5884  /*
5885  * Get the appropriate typOutput function, reusing previous one if
5886  * same type as previous argument. That's particularly useful in the
5887  * variadic-array case, but often saves work even for ordinary calls.
5888  */
5889  if (typid != prev_type)
5890  {
5891  Oid typoutputfunc;
5892  bool typIsVarlena;
5893 
5894  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5895  fmgr_info(typoutputfunc, &typoutputfinfo);
5896  prev_type = typid;
5897  }
5898 
5899  /*
5900  * And now we can format the value.
5901  */
5902  switch (*cp)
5903  {
5904  case 's':
5905  case 'I':
5906  case 'L':
5907  text_format_string_conversion(&str, *cp, &typoutputfinfo,
5908  value, isNull,
5909  flags, width);
5910  break;
5911  default:
5912  /* should not get here, because of previous check */
5913  ereport(ERROR,
5914  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5915  errmsg("unrecognized format() type specifier \"%.*s\"",
5916  pg_mblen(cp), cp),
5917  errhint("For a single \"%%\" use \"%%%%\".")));
5918  break;
5919  }
5920  }
5921 
5922  /* Don't need deconstruct_array results anymore. */
5923  if (elements != NULL)
5924  pfree(elements);
5925  if (nulls != NULL)
5926  pfree(nulls);
5927 
5928  /* Generate results. */
5929  result = cstring_to_text_with_len(str.data, str.len);
5930  pfree(str.data);
5931 
5932  PG_RETURN_TEXT_P(result);
5933 }
5934 
5935 /*
5936  * Parse contiguous digits as a decimal number.
5937  *
5938  * Returns true if some digits could be parsed.
5939  * The value is returned into *value, and *ptr is advanced to the next
5940  * character to be parsed.
5941  *
5942  * Note parsing invariant: at least one character is known available before
5943  * string end (end_ptr) at entry, and this is still true at exit.
5944  */
5945 static bool
5946 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5947 {
5948  bool found = false;
5949  const char *cp = *ptr;
5950  int val = 0;
5951 
5952  while (*cp >= '0' && *cp <= '9')
5953  {
5954  int8 digit = (*cp - '0');
5955 
5956  if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5957  unlikely(pg_add_s32_overflow(val, digit, &val)))
5958  ereport(ERROR,
5959  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5960  errmsg("number is out of range")));
5961  ADVANCE_PARSE_POINTER(cp, end_ptr);
5962  found = true;
5963  }
5964 
5965  *ptr = cp;
5966  *value = val;
5967 
5968  return found;
5969 }
5970 
5971 /*
5972  * Parse a format specifier (generally following the SUS printf spec).
5973  *
5974  * We have already advanced over the initial '%', and we are looking for
5975  * [argpos][flags][width]type (but the type character is not consumed here).
5976  *
5977  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5978  * Output parameters:
5979  * argpos: argument position for value to be printed. -1 means unspecified.
5980  * widthpos: argument position for width. Zero means the argument position
5981  * was unspecified (ie, take the next arg) and -1 means no width
5982  * argument (width was omitted or specified as a constant).
5983  * flags: bitmask of flags.
5984  * width: directly-specified width value. Zero means the width was omitted
5985  * (note it's not necessary to distinguish this case from an explicit
5986  * zero width value).
5987  *
5988  * The function result is the next character position to be parsed, ie, the
5989  * location where the type character is/should be.
5990  *
5991  * Note parsing invariant: at least one character is known available before
5992  * string end (end_ptr) at entry, and this is still true at exit.
5993  */
5994 static const char *
5995 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5996  int *argpos, int *widthpos,
5997  int *flags, int *width)
5998 {
5999  const char *cp = start_ptr;
6000  int n;
6001 
6002  /* set defaults for output parameters */
6003  *argpos = -1;
6004  *widthpos = -1;
6005  *flags = 0;
6006  *width = 0;
6007 
6008  /* try to identify first number */
6009  if (text_format_parse_digits(&cp, end_ptr, &n))
6010  {
6011  if (*cp != '$')
6012  {
6013  /* Must be just a width and a type, so we're done */
6014  *width = n;
6015  return cp;
6016  }
6017  /* The number was argument position */
6018  *argpos = n;
6019  /* Explicit 0 for argument index is immediately refused */
6020  if (n == 0)
6021  ereport(ERROR,
6022  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6023  errmsg("format specifies argument 0, but arguments are numbered from 1")));
6024  ADVANCE_PARSE_POINTER(cp, end_ptr);
6025  }
6026 
6027  /* Handle flags (only minus is supported now) */
6028  while (*cp == '-')
6029  {
6030  *flags |= TEXT_FORMAT_FLAG_MINUS;
6031  ADVANCE_PARSE_POINTER(cp, end_ptr);
6032  }