PostgreSQL Source Code  git master
varlena.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  * Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/detoast.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "common/hashfn.h"
25 #include "common/hex.h"
26 #include "common/int.h"
27 #include "common/unicode_norm.h"
28 #include "lib/hyperloglog.h"
29 #include "libpq/pqformat.h"
30 #include "miscadmin.h"
31 #include "nodes/execnodes.h"
32 #include "parser/scansup.h"
33 #include "port/pg_bswap.h"
34 #include "regex/regex.h"
35 #include "utils/builtins.h"
36 #include "utils/bytea.h"
37 #include "utils/lsyscache.h"
38 #include "utils/memutils.h"
39 #include "utils/pg_locale.h"
40 #include "utils/sortsupport.h"
41 #include "utils/varlena.h"
42 
43 
44 /* GUC variable */
46 
47 typedef struct varlena unknown;
48 typedef struct varlena VarString;
49 
50 /*
51  * State for text_position_* functions.
52  */
typedef struct
{
	bool		is_multibyte;	/* T if multibyte encoding */
	bool		is_multibyte_char_in_char;	/* need to check char boundaries? */

	char	   *str1;			/* haystack string */
	char	   *str2;			/* needle string */
	int			len1;			/* string lengths in bytes */
	int			len2;

	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */

	char	   *last_match;		/* pointer to last match in 'str1' */

	/*
	 * Sometimes we need to convert the byte position of a match to a
	 * character position.  These store the last position that was converted,
	 * so that on the next call, we can continue from that point, rather than
	 * count characters from the very beginning.
	 */
	char	   *refpoint;		/* pointer within original haystack string */
	int			refpos;			/* 0-based character offset of the same point */
} TextPositionState;
78 
79 typedef struct
80 {
81  char *buf1; /* 1st string, or abbreviation original string
82  * buf */
83  char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
84  int buflen1;
85  int buflen2;
86  int last_len1; /* Length of last buf1 string/strxfrm() input */
87  int last_len2; /* Length of last buf2 string/strxfrm() blob */
88  int last_returned; /* Last comparison result (cache) */
89  bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
90  bool collate_c;
91  Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
92  hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
93  hyperLogLogState full_card; /* Full key cardinality state */
94  double prop_card; /* Required cardinality proportion */
97 
98 /*
99  * Output data for split_text(): we output either to an array or a table.
100  * tupstore and tupdesc must be set up in advance to output to a table.
101  */
102 typedef struct
103 {
108 
109 /*
110  * This should be large enough that most strings will fit, but small enough
111  * that we feel comfortable putting it on the stack
112  */
113 #define TEXTBUFLEN 1024
114 
115 #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X))
116 #define DatumGetUnknownPCopy(X) ((unknown *) PG_DETOAST_DATUM_COPY(X))
117 #define PG_GETARG_UNKNOWN_P(n) DatumGetUnknownP(PG_GETARG_DATUM(n))
118 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
119 #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x)
120 
121 #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
122 #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
123 
124 static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
125 static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
126 static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
127 static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
128 static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
129 static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
130 static int varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
131 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
132 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
133 static int32 text_length(Datum str);
134 static text *text_catenate(text *t1, text *t2);
135 static text *text_substring(Datum str,
136  int32 start,
137  int32 length,
138  bool length_not_specified);
139 static text *text_overlay(text *t1, text *t2, int sp, int sl);
140 static int text_position(text *t1, text *t2, Oid collid);
141 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
143 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
147 static void check_collation_set(Oid collid);
148 static int text_cmp(text *arg1, text *arg2, Oid collid);
149 static bytea *bytea_catenate(bytea *t1, bytea *t2);
151  int S,
152  int L,
153  bool length_not_specified);
154 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
155 static void appendStringInfoText(StringInfo str, const text *t);
156 static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
157 static void split_text_accum_result(SplitTextOutputData *tstate,
158  text *field_value,
159  text *null_string,
160  Oid collation);
162  const char *fldsep, const char *null_string);
164 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
165  int *value);
166 static const char *text_format_parse_format(const char *start_ptr,
167  const char *end_ptr,
168  int *argpos, int *widthpos,
169  int *flags, int *width);
170 static void text_format_string_conversion(StringInfo buf, char conversion,
171  FmgrInfo *typOutputInfo,
172  Datum value, bool isNull,
173  int flags, int width);
174 static void text_format_append_string(StringInfo buf, const char *str,
175  int flags, int width);
176 
177 
178 /*****************************************************************************
179  * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
180  *****************************************************************************/
181 
182 /*
183  * cstring_to_text
184  *
185  * Create a text value from a null-terminated C string.
186  *
187  * The new text value is freshly palloc'd with a full-size VARHDR.
188  */
189 text *
190 cstring_to_text(const char *s)
191 {
192  return cstring_to_text_with_len(s, strlen(s));
193 }
194 
195 /*
196  * cstring_to_text_with_len
197  *
198  * Same as cstring_to_text except the caller specifies the string length;
199  * the string need not be null_terminated.
200  */
201 text *
202 cstring_to_text_with_len(const char *s, int len)
203 {
204  text *result = (text *) palloc(len + VARHDRSZ);
205 
206  SET_VARSIZE(result, len + VARHDRSZ);
207  memcpy(VARDATA(result), s, len);
208 
209  return result;
210 }
211 
212 /*
213  * text_to_cstring
214  *
215  * Create a palloc'd, null-terminated C string from a text value.
216  *
217  * We support being passed a compressed or toasted text value.
218  * This is a bit bogus since such values shouldn't really be referred to as
219  * "text *", but it seems useful for robustness. If we didn't handle that
220  * case here, we'd need another routine that did, anyway.
221  */
222 char *
224 {
225  /* must cast away the const, unfortunately */
226  text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
227  int len = VARSIZE_ANY_EXHDR(tunpacked);
228  char *result;
229 
230  result = (char *) palloc(len + 1);
231  memcpy(result, VARDATA_ANY(tunpacked), len);
232  result[len] = '\0';
233 
234  if (tunpacked != t)
235  pfree(tunpacked);
236 
237  return result;
238 }
239 
240 /*
241  * text_to_cstring_buffer
242  *
243  * Copy a text value into a caller-supplied buffer of size dst_len.
244  *
245  * The text string is truncated if necessary to fit. The result is
246  * guaranteed null-terminated (unless dst_len == 0).
247  *
248  * We support being passed a compressed or toasted text value.
249  * This is a bit bogus since such values shouldn't really be referred to as
250  * "text *", but it seems useful for robustness. If we didn't handle that
251  * case here, we'd need another routine that did, anyway.
252  */
253 void
254 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
255 {
256  /* must cast away the const, unfortunately */
257  text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
258  size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
259 
260  if (dst_len > 0)
261  {
262  dst_len--;
263  if (dst_len >= src_len)
264  dst_len = src_len;
265  else /* ensure truncation is encoding-safe */
266  dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
267  memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
268  dst[dst_len] = '\0';
269  }
270 
271  if (srcunpacked != src)
272  pfree(srcunpacked);
273 }
274 
275 
276 /*****************************************************************************
277  * USER I/O ROUTINES *
278  *****************************************************************************/
279 
280 
281 #define VAL(CH) ((CH) - '0')
282 #define DIG(VAL) ((VAL) + '0')
283 
284 /*
285  * byteain - converts from printable representation of byte array
286  *
287  * Non-printable characters must be passed as '\nnn' (octal) and are
288  * converted to internal form. '\' must be passed as '\\'.
289  * ereport(ERROR, ...) if bad form.
290  *
291  * BUGS:
292  * The input is scanned twice.
293  * The error checking of input is minimal.
294  */
295 Datum
297 {
298  char *inputText = PG_GETARG_CSTRING(0);
299  char *tp;
300  char *rp;
301  int bc;
302  bytea *result;
303 
304  /* Recognize hex input */
305  if (inputText[0] == '\\' && inputText[1] == 'x')
306  {
307  size_t len = strlen(inputText);
308  uint64 dstlen = pg_hex_dec_len(len - 2);
309 
310  bc = dstlen + VARHDRSZ; /* maximum possible length */
311  result = palloc(bc);
312 
313  bc = pg_hex_decode(inputText + 2, len - 2, VARDATA(result), dstlen);
314  SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
315 
316  PG_RETURN_BYTEA_P(result);
317  }
318 
319  /* Else, it's the traditional escaped style */
320  for (bc = 0, tp = inputText; *tp != '\0'; bc++)
321  {
322  if (tp[0] != '\\')
323  tp++;
324  else if ((tp[0] == '\\') &&
325  (tp[1] >= '0' && tp[1] <= '3') &&
326  (tp[2] >= '0' && tp[2] <= '7') &&
327  (tp[3] >= '0' && tp[3] <= '7'))
328  tp += 4;
329  else if ((tp[0] == '\\') &&
330  (tp[1] == '\\'))
331  tp += 2;
332  else
333  {
334  /*
335  * one backslash, not followed by another or ### valid octal
336  */
337  ereport(ERROR,
338  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
339  errmsg("invalid input syntax for type %s", "bytea")));
340  }
341  }
342 
343  bc += VARHDRSZ;
344 
345  result = (bytea *) palloc(bc);
346  SET_VARSIZE(result, bc);
347 
348  tp = inputText;
349  rp = VARDATA(result);
350  while (*tp != '\0')
351  {
352  if (tp[0] != '\\')
353  *rp++ = *tp++;
354  else if ((tp[0] == '\\') &&
355  (tp[1] >= '0' && tp[1] <= '3') &&
356  (tp[2] >= '0' && tp[2] <= '7') &&
357  (tp[3] >= '0' && tp[3] <= '7'))
358  {
359  bc = VAL(tp[1]);
360  bc <<= 3;
361  bc += VAL(tp[2]);
362  bc <<= 3;
363  *rp++ = bc + VAL(tp[3]);
364 
365  tp += 4;
366  }
367  else if ((tp[0] == '\\') &&
368  (tp[1] == '\\'))
369  {
370  *rp++ = '\\';
371  tp += 2;
372  }
373  else
374  {
375  /*
376  * We should never get here. The first pass should not allow it.
377  */
378  ereport(ERROR,
379  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
380  errmsg("invalid input syntax for type %s", "bytea")));
381  }
382  }
383 
384  PG_RETURN_BYTEA_P(result);
385 }
386 
387 /*
388  * byteaout - converts to printable representation of byte array
389  *
390  * In the traditional escaped format, non-printable characters are
391  * printed as '\nnn' (octal) and '\' as '\\'.
392  */
393 Datum
395 {
396  bytea *vlena = PG_GETARG_BYTEA_PP(0);
397  char *result;
398  char *rp;
399 
401  {
402  uint64 dstlen = pg_hex_enc_len(VARSIZE_ANY_EXHDR(vlena));
403 
404  /* Print hex format */
405  rp = result = palloc(dstlen + 2 + 1);
406  *rp++ = '\\';
407  *rp++ = 'x';
408 
409  rp += pg_hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp,
410  dstlen);
411  }
412  else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
413  {
414  /* Print traditional escaped format */
415  char *vp;
416  uint64 len;
417  int i;
418 
419  len = 1; /* empty string has 1 char */
420  vp = VARDATA_ANY(vlena);
421  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
422  {
423  if (*vp == '\\')
424  len += 2;
425  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
426  len += 4;
427  else
428  len++;
429  }
430 
431  /*
432  * In principle len can't overflow uint32 if the input fit in 1GB, but
433  * for safety let's check rather than relying on palloc's internal
434  * check.
435  */
436  if (len > MaxAllocSize)
437  ereport(ERROR,
438  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
439  errmsg_internal("result of bytea output conversion is too large")));
440  rp = result = (char *) palloc(len);
441 
442  vp = VARDATA_ANY(vlena);
443  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
444  {
445  if (*vp == '\\')
446  {
447  *rp++ = '\\';
448  *rp++ = '\\';
449  }
450  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
451  {
452  int val; /* holds unprintable chars */
453 
454  val = *vp;
455  rp[0] = '\\';
456  rp[3] = DIG(val & 07);
457  val >>= 3;
458  rp[2] = DIG(val & 07);
459  val >>= 3;
460  rp[1] = DIG(val & 03);
461  rp += 4;
462  }
463  else
464  *rp++ = *vp;
465  }
466  }
467  else
468  {
469  elog(ERROR, "unrecognized bytea_output setting: %d",
470  bytea_output);
471  rp = result = NULL; /* keep compiler quiet */
472  }
473  *rp = '\0';
474  PG_RETURN_CSTRING(result);
475 }
476 
477 /*
478  * bytearecv - converts external binary format to bytea
479  */
480 Datum
482 {
484  bytea *result;
485  int nbytes;
486 
487  nbytes = buf->len - buf->cursor;
488  result = (bytea *) palloc(nbytes + VARHDRSZ);
489  SET_VARSIZE(result, nbytes + VARHDRSZ);
490  pq_copymsgbytes(buf, VARDATA(result), nbytes);
491  PG_RETURN_BYTEA_P(result);
492 }
493 
494 /*
495  * byteasend - converts bytea to binary format
496  *
497  * This is a special case: just copy the input...
498  */
499 Datum
501 {
502  bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
503 
504  PG_RETURN_BYTEA_P(vlena);
505 }
506 
507 Datum
509 {
511 
512  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
513 
514  /* Append the value unless null. */
515  if (!PG_ARGISNULL(1))
516  {
518 
519  /* On the first time through, we ignore the delimiter. */
520  if (state == NULL)
521  state = makeStringAggState(fcinfo);
522  else if (!PG_ARGISNULL(2))
523  {
524  bytea *delim = PG_GETARG_BYTEA_PP(2);
525 
527  }
528 
530  }
531 
532  /*
533  * The transition type for string_agg() is declared to be "internal",
534  * which is a pass-by-value type the same size as a pointer.
535  */
536  PG_RETURN_POINTER(state);
537 }
538 
539 Datum
541 {
543 
544  /* cannot be called directly because of internal-type argument */
545  Assert(AggCheckCallContext(fcinfo, NULL));
546 
547  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
548 
549  if (state != NULL)
550  {
551  bytea *result;
552 
553  result = (bytea *) palloc(state->len + VARHDRSZ);
554  SET_VARSIZE(result, state->len + VARHDRSZ);
555  memcpy(VARDATA(result), state->data, state->len);
556  PG_RETURN_BYTEA_P(result);
557  }
558  else
559  PG_RETURN_NULL();
560 }
561 
562 /*
563  * textin - converts "..." to internal representation
564  */
565 Datum
567 {
568  char *inputText = PG_GETARG_CSTRING(0);
569 
570  PG_RETURN_TEXT_P(cstring_to_text(inputText));
571 }
572 
573 /*
574  * textout - converts internal representation to "..."
575  */
576 Datum
578 {
579  Datum txt = PG_GETARG_DATUM(0);
580 
582 }
583 
584 /*
585  * textrecv - converts external binary format to text
586  */
587 Datum
589 {
591  text *result;
592  char *str;
593  int nbytes;
594 
595  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
596 
597  result = cstring_to_text_with_len(str, nbytes);
598  pfree(str);
599  PG_RETURN_TEXT_P(result);
600 }
601 
602 /*
603  * textsend - converts text to binary format
604  */
605 Datum
607 {
608  text *t = PG_GETARG_TEXT_PP(0);
610 
611  pq_begintypsend(&buf);
614 }
615 
616 
617 /*
618  * unknownin - converts "..." to internal representation
619  */
620 Datum
622 {
623  char *str = PG_GETARG_CSTRING(0);
624 
625  /* representation is same as cstring */
627 }
628 
629 /*
630  * unknownout - converts internal representation to "..."
631  */
632 Datum
634 {
635  /* representation is same as cstring */
636  char *str = PG_GETARG_CSTRING(0);
637 
639 }
640 
641 /*
642  * unknownrecv - converts external binary format to unknown
643  */
644 Datum
646 {
648  char *str;
649  int nbytes;
650 
651  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
652  /* representation is same as cstring */
653  PG_RETURN_CSTRING(str);
654 }
655 
656 /*
657  * unknownsend - converts unknown to binary format
658  */
659 Datum
661 {
662  /* representation is same as cstring */
663  char *str = PG_GETARG_CSTRING(0);
665 
666  pq_begintypsend(&buf);
667  pq_sendtext(&buf, str, strlen(str));
669 }
670 
671 
672 /* ========== PUBLIC ROUTINES ========== */
673 
674 /*
675  * textlen -
676  * returns the logical length of a text*
677  * (which is less than the VARSIZE of the text*)
678  */
679 Datum
681 {
683 
684  /* try to avoid decompressing argument */
686 }
687 
688 /*
689  * text_length -
690  * Does the real work for textlen()
691  *
692  * This is broken out so it can be called directly by other string processing
693  * functions. Note that the argument is passed as a Datum, to indicate that
694  * it may still be in compressed form. We can avoid decompressing it at all
695  * in some cases.
696  */
697 static int32
699 {
700  /* fastpath when max encoding length is one */
703  else
704  {
705  text *t = DatumGetTextPP(str);
706 
708  VARSIZE_ANY_EXHDR(t)));
709  }
710 }
711 
712 /*
713  * textoctetlen -
714  * returns the physical length of a text*
715  * (which is less than the VARSIZE of the text*)
716  */
717 Datum
719 {
721 
722  /* We need not detoast the input at all */
724 }
725 
726 /*
727  * textcat -
728  * takes two text* and returns a text* that is the concatenation of
729  * the two.
730  *
731  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
732  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
733  * Allocate space for output in all cases.
734  * XXX - thomas 1997-07-10
735  */
736 Datum
738 {
739  text *t1 = PG_GETARG_TEXT_PP(0);
740  text *t2 = PG_GETARG_TEXT_PP(1);
741 
743 }
744 
745 /*
746  * text_catenate
747  * Guts of textcat(), broken out so it can be used by other functions
748  *
749  * Arguments can be in short-header form, but not compressed or out-of-line
750  */
751 static text *
753 {
754  text *result;
755  int len1,
756  len2,
757  len;
758  char *ptr;
759 
760  len1 = VARSIZE_ANY_EXHDR(t1);
761  len2 = VARSIZE_ANY_EXHDR(t2);
762 
763  /* paranoia ... probably should throw error instead? */
764  if (len1 < 0)
765  len1 = 0;
766  if (len2 < 0)
767  len2 = 0;
768 
769  len = len1 + len2 + VARHDRSZ;
770  result = (text *) palloc(len);
771 
772  /* Set size of result string... */
773  SET_VARSIZE(result, len);
774 
775  /* Fill data field of result string... */
776  ptr = VARDATA(result);
777  if (len1 > 0)
778  memcpy(ptr, VARDATA_ANY(t1), len1);
779  if (len2 > 0)
780  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
781 
782  return result;
783 }
784 
785 /*
786  * charlen_to_bytelen()
787  * Compute the number of bytes occupied by n characters starting at *p
788  *
789  * It is caller's responsibility that there actually are n characters;
790  * the string need not be null-terminated.
791  */
static int
charlen_to_bytelen(const char *p, int n)
{
	if (pg_database_encoding_max_length() == 1)
	{
		/* Optimization for single-byte encodings */
		return n;
	}
	else
	{
		const char *s;

		/* step over n characters, each of which may span several bytes */
		for (s = p; n > 0; n--)
			s += pg_mblen(s);

		return s - p;
	}
}
810 
811 /*
812  * text_substr()
813  * Return a substring starting at the specified position.
814  * - thomas 1997-12-31
815  *
816  * Input:
817  * - string
818  * - starting position (is one-based)
819  * - string length
820  *
821  * If the starting position is zero or less, then return from the start of the string
822  * adjusting the length to be consistent with the "negative start" per SQL.
823  * If the length is less than zero, return the remaining string.
824  *
825  * Added multibyte support.
826  * - Tatsuo Ishii 1998-4-21
827  * Changed behavior if starting position is less than one to conform to SQL behavior.
828  * Formerly returned the entire string; now returns a portion.
829  * - Thomas Lockhart 1998-12-10
830  * Now uses faster TOAST-slicing interface
831  * - John Gray 2002-02-22
832  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
833  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
834  * error; if E < 1, return '', not entire string). Fixed MB related bug when
835  * S > LC and < LC + 4 sometimes garbage characters are returned.
836  * - Joe Conway 2002-08-10
837  */
838 Datum
840 {
842  PG_GETARG_INT32(1),
843  PG_GETARG_INT32(2),
844  false));
845 }
846 
847 /*
848  * text_substr_no_len -
849  * Wrapper to avoid opr_sanity failure due to
850  * one function accepting a different number of args.
851  */
852 Datum
854 {
856  PG_GETARG_INT32(1),
857  -1, true));
858 }
859 
860 /*
861  * text_substring -
862  * Does the real work for text_substr() and text_substr_no_len()
863  *
864  * This is broken out so it can be called directly by other string processing
865  * functions. Note that the argument is passed as a Datum, to indicate that
866  * it may still be in compressed/toasted form. We can avoid detoasting all
867  * of it in some cases.
868  *
869  * The result is always a freshly palloc'd datum.
870  */
871 static text *
872 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
873 {
875  int32 S = start; /* start position */
876  int32 S1; /* adjusted start position */
877  int32 L1; /* adjusted substring length */
878  int32 E; /* end position */
879 
880  /*
881  * SQL99 says S can be zero or negative, but we still must fetch from the
882  * start of the string.
883  */
884  S1 = Max(S, 1);
885 
886  /* life is easy if the encoding max length is 1 */
887  if (eml == 1)
888  {
889  if (length_not_specified) /* special case - get length to end of
890  * string */
891  L1 = -1;
892  else if (length < 0)
893  {
894  /* SQL99 says to throw an error for E < S, i.e., negative length */
895  ereport(ERROR,
896  (errcode(ERRCODE_SUBSTRING_ERROR),
897  errmsg("negative substring length not allowed")));
898  L1 = -1; /* silence stupider compilers */
899  }
900  else if (pg_add_s32_overflow(S, length, &E))
901  {
902  /*
903  * L could be large enough for S + L to overflow, in which case
904  * the substring must run to end of string.
905  */
906  L1 = -1;
907  }
908  else
909  {
910  /*
911  * A zero or negative value for the end position can happen if the
912  * start was negative or one. SQL99 says to return a zero-length
913  * string.
914  */
915  if (E < 1)
916  return cstring_to_text("");
917 
918  L1 = E - S1;
919  }
920 
921  /*
922  * If the start position is past the end of the string, SQL99 says to
923  * return a zero-length string -- DatumGetTextPSlice() will do that
924  * for us. We need only convert S1 to zero-based starting position.
925  */
926  return DatumGetTextPSlice(str, S1 - 1, L1);
927  }
928  else if (eml > 1)
929  {
930  /*
931  * When encoding max length is > 1, we can't get LC without
932  * detoasting, so we'll grab a conservatively large slice now and go
933  * back later to do the right thing
934  */
935  int32 slice_start;
936  int32 slice_size;
937  int32 slice_strlen;
938  text *slice;
939  int32 E1;
940  int32 i;
941  char *p;
942  char *s;
943  text *ret;
944 
945  /*
946  * We need to start at position zero because there is no way to know
947  * in advance which byte offset corresponds to the supplied start
948  * position.
949  */
950  slice_start = 0;
951 
952  if (length_not_specified) /* special case - get length to end of
953  * string */
954  slice_size = L1 = -1;
955  else if (length < 0)
956  {
957  /* SQL99 says to throw an error for E < S, i.e., negative length */
958  ereport(ERROR,
959  (errcode(ERRCODE_SUBSTRING_ERROR),
960  errmsg("negative substring length not allowed")));
961  slice_size = L1 = -1; /* silence stupider compilers */
962  }
963  else if (pg_add_s32_overflow(S, length, &E))
964  {
965  /*
966  * L could be large enough for S + L to overflow, in which case
967  * the substring must run to end of string.
968  */
969  slice_size = L1 = -1;
970  }
971  else
972  {
973  /*
974  * A zero or negative value for the end position can happen if the
975  * start was negative or one. SQL99 says to return a zero-length
976  * string.
977  */
978  if (E < 1)
979  return cstring_to_text("");
980 
981  /*
982  * if E is past the end of the string, the tuple toaster will
983  * truncate the length for us
984  */
985  L1 = E - S1;
986 
987  /*
988  * Total slice size in bytes can't be any longer than the start
989  * position plus substring length times the encoding max length.
990  * If that overflows, we can just use -1.
991  */
992  if (pg_mul_s32_overflow(E, eml, &slice_size))
993  slice_size = -1;
994  }
995 
996  /*
997  * If we're working with an untoasted source, no need to do an extra
998  * copying step.
999  */
1002  slice = DatumGetTextPSlice(str, slice_start, slice_size);
1003  else
1004  slice = (text *) DatumGetPointer(str);
1005 
1006  /* see if we got back an empty string */
1007  if (VARSIZE_ANY_EXHDR(slice) == 0)
1008  {
1009  if (slice != (text *) DatumGetPointer(str))
1010  pfree(slice);
1011  return cstring_to_text("");
1012  }
1013 
1014  /* Now we can get the actual length of the slice in MB characters */
1015  slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1016  VARSIZE_ANY_EXHDR(slice));
1017 
1018  /*
1019  * Check that the start position wasn't > slice_strlen. If so, SQL99
1020  * says to return a zero-length string.
1021  */
1022  if (S1 > slice_strlen)
1023  {
1024  if (slice != (text *) DatumGetPointer(str))
1025  pfree(slice);
1026  return cstring_to_text("");
1027  }
1028 
1029  /*
1030  * Adjust L1 and E1 now that we know the slice string length. Again
1031  * remember that S1 is one based, and slice_start is zero based.
1032  */
1033  if (L1 > -1)
1034  E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1035  else
1036  E1 = slice_start + 1 + slice_strlen;
1037 
1038  /*
1039  * Find the start position in the slice; remember S1 is not zero based
1040  */
1041  p = VARDATA_ANY(slice);
1042  for (i = 0; i < S1 - 1; i++)
1043  p += pg_mblen(p);
1044 
1045  /* hang onto a pointer to our start position */
1046  s = p;
1047 
1048  /*
1049  * Count the actual bytes used by the substring of the requested
1050  * length.
1051  */
1052  for (i = S1; i < E1; i++)
1053  p += pg_mblen(p);
1054 
1055  ret = (text *) palloc(VARHDRSZ + (p - s));
1056  SET_VARSIZE(ret, VARHDRSZ + (p - s));
1057  memcpy(VARDATA(ret), s, (p - s));
1058 
1059  if (slice != (text *) DatumGetPointer(str))
1060  pfree(slice);
1061 
1062  return ret;
1063  }
1064  else
1065  elog(ERROR, "invalid backend encoding: encoding max length < 1");
1066 
1067  /* not reached: suppress compiler warning */
1068  return NULL;
1069 }
1070 
1071 /*
1072  * textoverlay
1073  * Replace specified substring of first string with second
1074  *
1075  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1076  * This code is a direct implementation of what the standard says.
1077  */
1078 Datum
1080 {
1081  text *t1 = PG_GETARG_TEXT_PP(0);
1082  text *t2 = PG_GETARG_TEXT_PP(1);
1083  int sp = PG_GETARG_INT32(2); /* substring start position */
1084  int sl = PG_GETARG_INT32(3); /* substring length */
1085 
1086  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1087 }
1088 
1089 Datum
1091 {
1092  text *t1 = PG_GETARG_TEXT_PP(0);
1093  text *t2 = PG_GETARG_TEXT_PP(1);
1094  int sp = PG_GETARG_INT32(2); /* substring start position */
1095  int sl;
1096 
1097  sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1098  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1099 }
1100 
1101 static text *
1102 text_overlay(text *t1, text *t2, int sp, int sl)
1103 {
1104  text *result;
1105  text *s1;
1106  text *s2;
1107  int sp_pl_sl;
1108 
1109  /*
1110  * Check for possible integer-overflow cases. For negative sp, throw a
1111  * "substring length" error because that's what should be expected
1112  * according to the spec's definition of OVERLAY().
1113  */
1114  if (sp <= 0)
1115  ereport(ERROR,
1116  (errcode(ERRCODE_SUBSTRING_ERROR),
1117  errmsg("negative substring length not allowed")));
1118  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1119  ereport(ERROR,
1120  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1121  errmsg("integer out of range")));
1122 
1123  s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1124  s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1125  result = text_catenate(s1, t2);
1126  result = text_catenate(result, s2);
1127 
1128  return result;
1129 }
1130 
1131 /*
1132  * textpos -
1133  * Return the position of the specified substring.
1134  * Implements the SQL POSITION() function.
1135  * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1136  * - thomas 1997-07-27
1137  */
1138 Datum
1140 {
1141  text *str = PG_GETARG_TEXT_PP(0);
1142  text *search_str = PG_GETARG_TEXT_PP(1);
1143 
1144  PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1145 }
1146 
1147 /*
1148  * text_position -
1149  * Does the real work for textpos()
1150  *
1151  * Inputs:
1152  * t1 - string to be searched
1153  * t2 - pattern to match within t1
1154  * Result:
1155  * Character index of the first matched char, starting from 1,
1156  * or 0 if no match.
1157  *
1158  * This is broken out so it can be called directly by other string processing
1159  * functions.
1160  */
1161 static int
1162 text_position(text *t1, text *t2, Oid collid)
1163 {
1165  int result;
1166 
1167  /* Empty needle always matches at position 1 */
1168  if (VARSIZE_ANY_EXHDR(t2) < 1)
1169  return 1;
1170 
1171  /* Otherwise, can't match if haystack is shorter than needle */
1172  if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1173  return 0;
1174 
1175  text_position_setup(t1, t2, collid, &state);
1176  if (!text_position_next(&state))
1177  result = 0;
1178  else
1179  result = text_position_get_match_pos(&state);
1180  text_position_cleanup(&state);
1181  return result;
1182 }
1183 
1184 
1185 /*
1186  * text_position_setup, text_position_next, text_position_cleanup -
1187  * Component steps of text_position()
1188  *
1189  * These are broken out so that a string can be efficiently searched for
1190  * multiple occurrences of the same pattern. text_position_next may be
1191  * called multiple times, and it advances to the next match on each call.
1192  * text_position_get_match_ptr() and text_position_get_match_pos() return
1193  * a pointer or 1-based character position of the last match, respectively.
1194  *
1195  * The "state" variable is normally just a local variable in the caller.
1196  *
1197  * NOTE: text_position_next skips over the matched portion. For example,
1198  * searching for "xx" in "xxx" returns only one match, not two.
1199  */
1200 
1201 static void
1203 {
1204  int len1 = VARSIZE_ANY_EXHDR(t1);
1205  int len2 = VARSIZE_ANY_EXHDR(t2);
1206  pg_locale_t mylocale = 0;
1207 
1208  check_collation_set(collid);
1209 
1210  if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1211  mylocale = pg_newlocale_from_collation(collid);
1212 
1213  if (mylocale && !mylocale->deterministic)
1214  ereport(ERROR,
1215  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1216  errmsg("nondeterministic collations are not supported for substring searches")));
1217 
1218  Assert(len1 > 0);
1219  Assert(len2 > 0);
1220 
1221  /*
1222  * Even with a multi-byte encoding, we perform the search using the raw
1223  * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1224  * because in UTF-8 the byte sequence of one character cannot contain
1225  * another character. For other multi-byte encodings, we do the search
1226  * initially as a simple byte search, ignoring multibyte issues, but
1227  * verify afterwards that the match we found is at a character boundary,
1228  * and continue the search if it was a false match.
1229  */
1231  {
1232  state->is_multibyte = false;
1233  state->is_multibyte_char_in_char = false;
1234  }
1235  else if (GetDatabaseEncoding() == PG_UTF8)
1236  {
1237  state->is_multibyte = true;
1238  state->is_multibyte_char_in_char = false;
1239  }
1240  else
1241  {
1242  state->is_multibyte = true;
1243  state->is_multibyte_char_in_char = true;
1244  }
1245 
1246  state->str1 = VARDATA_ANY(t1);
1247  state->str2 = VARDATA_ANY(t2);
1248  state->len1 = len1;
1249  state->len2 = len2;
1250  state->last_match = NULL;
1251  state->refpoint = state->str1;
1252  state->refpos = 0;
1253 
1254  /*
1255  * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1256  * notes we use the terminology that the "haystack" is the string to be
1257  * searched (t1) and the "needle" is the pattern being sought (t2).
1258  *
1259  * If the needle is empty or bigger than the haystack then there is no
1260  * point in wasting cycles initializing the table. We also choose not to
1261  * use B-M-H for needles of length 1, since the skip table can't possibly
1262  * save anything in that case.
1263  */
1264  if (len1 >= len2 && len2 > 1)
1265  {
1266  int searchlength = len1 - len2;
1267  int skiptablemask;
1268  int last;
1269  int i;
1270  const char *str2 = state->str2;
1271 
1272  /*
1273  * First we must determine how much of the skip table to use. The
1274  * declaration of TextPositionState allows up to 256 elements, but for
1275  * short search problems we don't really want to have to initialize so
1276  * many elements --- it would take too long in comparison to the
1277  * actual search time. So we choose a useful skip table size based on
1278  * the haystack length minus the needle length. The closer the needle
1279  * length is to the haystack length the less useful skipping becomes.
1280  *
1281  * Note: since we use bit-masking to select table elements, the skip
1282  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1283  */
1284  if (searchlength < 16)
1285  skiptablemask = 3;
1286  else if (searchlength < 64)
1287  skiptablemask = 7;
1288  else if (searchlength < 128)
1289  skiptablemask = 15;
1290  else if (searchlength < 512)
1291  skiptablemask = 31;
1292  else if (searchlength < 2048)
1293  skiptablemask = 63;
1294  else if (searchlength < 4096)
1295  skiptablemask = 127;
1296  else
1297  skiptablemask = 255;
1298  state->skiptablemask = skiptablemask;
1299 
1300  /*
1301  * Initialize the skip table. We set all elements to the needle
1302  * length, since this is the correct skip distance for any character
1303  * not found in the needle.
1304  */
1305  for (i = 0; i <= skiptablemask; i++)
1306  state->skiptable[i] = len2;
1307 
1308  /*
1309  * Now examine the needle. For each character except the last one,
1310  * set the corresponding table element to the appropriate skip
1311  * distance. Note that when two characters share the same skip table
1312  * entry, the one later in the needle must determine the skip
1313  * distance.
1314  */
1315  last = len2 - 1;
1316 
1317  for (i = 0; i < last; i++)
1318  state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1319  }
1320 }
1321 
1322 /*
1323  * Advance to the next match, starting from the end of the previous match
1324  * (or the beginning of the string, on first call). Returns true if a match
1325  * is found.
1326  *
1327  * Note that this refuses to match an empty-string needle. Most callers
1328  * will have handled that case specially and we'll never see it here.
1329  */
1330 static bool
1332 {
1333  int needle_len = state->len2;
1334  char *start_ptr;
1335  char *matchptr;
1336 
1337  if (needle_len <= 0)
1338  return false; /* result for empty pattern */
1339 
1340  /* Start from the point right after the previous match. */
1341  if (state->last_match)
1342  start_ptr = state->last_match + needle_len;
1343  else
1344  start_ptr = state->str1;
1345 
1346 retry:
1347  matchptr = text_position_next_internal(start_ptr, state);
1348 
1349  if (!matchptr)
1350  return false;
1351 
1352  /*
1353  * Found a match for the byte sequence. If this is a multibyte encoding,
1354  * where one character's byte sequence can appear inside a longer
1355  * multi-byte character, we need to verify that the match was at a
1356  * character boundary, not in the middle of a multi-byte character.
1357  */
1358  if (state->is_multibyte_char_in_char)
1359  {
1360  /* Walk one character at a time, until we reach the match. */
1361 
1362  /* the search should never move backwards. */
1363  Assert(state->refpoint <= matchptr);
1364 
1365  while (state->refpoint < matchptr)
1366  {
1367  /* step to next character. */
1368  state->refpoint += pg_mblen(state->refpoint);
1369  state->refpos++;
1370 
1371  /*
1372  * If we stepped over the match's start position, then it was a
1373  * false positive, where the byte sequence appeared in the middle
1374  * of a multi-byte character. Skip it, and continue the search at
1375  * the next character boundary.
1376  */
1377  if (state->refpoint > matchptr)
1378  {
1379  start_ptr = state->refpoint;
1380  goto retry;
1381  }
1382  }
1383  }
1384 
1385  state->last_match = matchptr;
1386  return true;
1387 }
1388 
1389 /*
1390  * Subroutine of text_position_next(). This searches for the raw byte
1391  * sequence, ignoring any multi-byte encoding issues. Returns the first
1392  * match starting at 'start_ptr', or NULL if no match is found.
1393  */
1394 static char *
1396 {
1397  int haystack_len = state->len1;
1398  int needle_len = state->len2;
1399  int skiptablemask = state->skiptablemask;
1400  const char *haystack = state->str1;
1401  const char *needle = state->str2;
1402  const char *haystack_end = &haystack[haystack_len];
1403  const char *hptr;
1404 
1405  Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1406 
1407  if (needle_len == 1)
1408  {
1409  /* No point in using B-M-H for a one-character needle */
1410  char nchar = *needle;
1411 
1412  hptr = start_ptr;
1413  while (hptr < haystack_end)
1414  {
1415  if (*hptr == nchar)
1416  return (char *) hptr;
1417  hptr++;
1418  }
1419  }
1420  else
1421  {
1422  const char *needle_last = &needle[needle_len - 1];
1423 
1424  /* Start at startpos plus the length of the needle */
1425  hptr = start_ptr + needle_len - 1;
1426  while (hptr < haystack_end)
1427  {
1428  /* Match the needle scanning *backward* */
1429  const char *nptr;
1430  const char *p;
1431 
1432  nptr = needle_last;
1433  p = hptr;
1434  while (*nptr == *p)
1435  {
1436  /* Matched it all? If so, return 1-based position */
1437  if (nptr == needle)
1438  return (char *) p;
1439  nptr--, p--;
1440  }
1441 
1442  /*
1443  * No match, so use the haystack char at hptr to decide how far to
1444  * advance. If the needle had any occurrence of that character
1445  * (or more precisely, one sharing the same skiptable entry)
1446  * before its last character, then we advance far enough to align
1447  * the last such needle character with that haystack position.
1448  * Otherwise we can advance by the whole needle length.
1449  */
1450  hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1451  }
1452  }
1453 
1454  return 0; /* not found */
1455 }
1456 
1457 /*
1458  * Return a pointer to the current match.
1459  *
1460  * The returned pointer points into the original haystack string.
1461  */
1462 static char *
1464 {
1465  return state->last_match;
1466 }
1467 
1468 /*
1469  * Return the offset of the current match.
1470  *
1471  * The offset is in characters, 1-based.
1472  */
1473 static int
1475 {
1476  if (!state->is_multibyte)
1477  return state->last_match - state->str1 + 1;
1478  else
1479  {
1480  /* Convert the byte position to char position. */
1481  while (state->refpoint < state->last_match)
1482  {
1483  state->refpoint += pg_mblen(state->refpoint);
1484  state->refpos++;
1485  }
1486  Assert(state->refpoint == state->last_match);
1487  return state->refpos + 1;
1488  }
1489 }
1490 
1491 /*
1492  * Reset search state to the initial state installed by text_position_setup.
1493  *
1494  * The next call to text_position_next will search from the beginning
1495  * of the string.
1496  */
1497 static void
1499 {
1500  state->last_match = NULL;
1501  state->refpoint = state->str1;
1502  state->refpos = 0;
1503 }
1504 
1505 static void
1507 {
1508  /* no cleanup needed */
1509 }
1510 
1511 
1512 static void
1514 {
1515  if (!OidIsValid(collid))
1516  {
1517  /*
1518  * This typically means that the parser could not resolve a conflict
1519  * of implicit collations, so report it that way.
1520  */
1521  ereport(ERROR,
1522  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1523  errmsg("could not determine which collation to use for string comparison"),
1524  errhint("Use the COLLATE clause to set the collation explicitly.")));
1525  }
1526 }
1527 
1528 /* varstr_cmp()
1529  * Comparison function for text strings with given lengths.
1530  * Includes locale support, but must copy strings to temporary memory
1531  * to allow null-termination for inputs to strcoll().
1532  * Returns an integer less than, equal to, or greater than zero, indicating
1533  * whether arg1 is less than, equal to, or greater than arg2.
1534  *
1535  * Note: many functions that depend on this are marked leakproof; therefore,
1536  * avoid reporting the actual contents of the input when throwing errors.
1537  * All errors herein should be things that can't happen except on corrupt
1538  * data, anyway; otherwise we will have trouble with indexing strings that
1539  * would cause them.
1540  */
1541 int
1542 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1543 {
1544  int result;
1545 
1546  check_collation_set(collid);
1547 
1548  /*
1549  * Unfortunately, there is no strncoll(), so in the non-C locale case we
1550  * have to do some memory copying. This turns out to be significantly
1551  * slower, so we optimize the case where LC_COLLATE is C. We also try to
1552  * optimize relatively-short strings by avoiding palloc/pfree overhead.
1553  */
1554  if (lc_collate_is_c(collid))
1555  {
1556  result = memcmp(arg1, arg2, Min(len1, len2));
1557  if ((result == 0) && (len1 != len2))
1558  result = (len1 < len2) ? -1 : 1;
1559  }
1560  else
1561  {
1562  char a1buf[TEXTBUFLEN];
1563  char a2buf[TEXTBUFLEN];
1564  char *a1p,
1565  *a2p;
1566  pg_locale_t mylocale = 0;
1567 
1568  if (collid != DEFAULT_COLLATION_OID)
1569  mylocale = pg_newlocale_from_collation(collid);
1570 
1571  /*
1572  * memcmp() can't tell us which of two unequal strings sorts first,
1573  * but it's a cheap way to tell if they're equal. Testing shows that
1574  * memcmp() followed by strcoll() is only trivially slower than
1575  * strcoll() by itself, so we don't lose much if this doesn't work out
1576  * very often, and if it does - for example, because there are many
1577  * equal strings in the input - then we win big by avoiding expensive
1578  * collation-aware comparisons.
1579  */
1580  if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1581  return 0;
1582 
1583 #ifdef WIN32
1584  /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1585  if (GetDatabaseEncoding() == PG_UTF8
1586  && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
1587  {
1588  int a1len;
1589  int a2len;
1590  int r;
1591 
1592  if (len1 >= TEXTBUFLEN / 2)
1593  {
1594  a1len = len1 * 2 + 2;
1595  a1p = palloc(a1len);
1596  }
1597  else
1598  {
1599  a1len = TEXTBUFLEN;
1600  a1p = a1buf;
1601  }
1602  if (len2 >= TEXTBUFLEN / 2)
1603  {
1604  a2len = len2 * 2 + 2;
1605  a2p = palloc(a2len);
1606  }
1607  else
1608  {
1609  a2len = TEXTBUFLEN;
1610  a2p = a2buf;
1611  }
1612 
1613  /* stupid Microsloth API does not work for zero-length input */
1614  if (len1 == 0)
1615  r = 0;
1616  else
1617  {
1618  r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1619  (LPWSTR) a1p, a1len / 2);
1620  if (!r)
1621  ereport(ERROR,
1622  (errmsg("could not convert string to UTF-16: error code %lu",
1623  GetLastError())));
1624  }
1625  ((LPWSTR) a1p)[r] = 0;
1626 
1627  if (len2 == 0)
1628  r = 0;
1629  else
1630  {
1631  r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1632  (LPWSTR) a2p, a2len / 2);
1633  if (!r)
1634  ereport(ERROR,
1635  (errmsg("could not convert string to UTF-16: error code %lu",
1636  GetLastError())));
1637  }
1638  ((LPWSTR) a2p)[r] = 0;
1639 
1640  errno = 0;
1641 #ifdef HAVE_LOCALE_T
1642  if (mylocale)
1643  result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
1644  else
1645 #endif
1646  result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1647  if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1648  * headers */
1649  ereport(ERROR,
1650  (errmsg("could not compare Unicode strings: %m")));
1651 
1652  /* Break tie if necessary. */
1653  if (result == 0 &&
1654  (!mylocale || mylocale->deterministic))
1655  {
1656  result = memcmp(arg1, arg2, Min(len1, len2));
1657  if ((result == 0) && (len1 != len2))
1658  result = (len1 < len2) ? -1 : 1;
1659  }
1660 
1661  if (a1p != a1buf)
1662  pfree(a1p);
1663  if (a2p != a2buf)
1664  pfree(a2p);
1665 
1666  return result;
1667  }
1668 #endif /* WIN32 */
1669 
1670  if (len1 >= TEXTBUFLEN)
1671  a1p = (char *) palloc(len1 + 1);
1672  else
1673  a1p = a1buf;
1674  if (len2 >= TEXTBUFLEN)
1675  a2p = (char *) palloc(len2 + 1);
1676  else
1677  a2p = a2buf;
1678 
1679  memcpy(a1p, arg1, len1);
1680  a1p[len1] = '\0';
1681  memcpy(a2p, arg2, len2);
1682  a2p[len2] = '\0';
1683 
1684  if (mylocale)
1685  {
1686  if (mylocale->provider == COLLPROVIDER_ICU)
1687  {
1688 #ifdef USE_ICU
1689 #ifdef HAVE_UCOL_STRCOLLUTF8
1690  if (GetDatabaseEncoding() == PG_UTF8)
1691  {
1692  UErrorCode status;
1693 
1694  status = U_ZERO_ERROR;
1695  result = ucol_strcollUTF8(mylocale->info.icu.ucol,
1696  arg1, len1,
1697  arg2, len2,
1698  &status);
1699  if (U_FAILURE(status))
1700  ereport(ERROR,
1701  (errmsg("collation failed: %s", u_errorName(status))));
1702  }
1703  else
1704 #endif
1705  {
1706  int32_t ulen1,
1707  ulen2;
1708  UChar *uchar1,
1709  *uchar2;
1710 
1711  ulen1 = icu_to_uchar(&uchar1, arg1, len1);
1712  ulen2 = icu_to_uchar(&uchar2, arg2, len2);
1713 
1714  result = ucol_strcoll(mylocale->info.icu.ucol,
1715  uchar1, ulen1,
1716  uchar2, ulen2);
1717 
1718  pfree(uchar1);
1719  pfree(uchar2);
1720  }
1721 #else /* not USE_ICU */
1722  /* shouldn't happen */
1723  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1724 #endif /* not USE_ICU */
1725  }
1726  else
1727  {
1728 #ifdef HAVE_LOCALE_T
1729  result = strcoll_l(a1p, a2p, mylocale->info.lt);
1730 #else
1731  /* shouldn't happen */
1732  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1733 #endif
1734  }
1735  }
1736  else
1737  result = strcoll(a1p, a2p);
1738 
1739  /* Break tie if necessary. */
1740  if (result == 0 &&
1741  (!mylocale || mylocale->deterministic))
1742  result = strcmp(a1p, a2p);
1743 
1744  if (a1p != a1buf)
1745  pfree(a1p);
1746  if (a2p != a2buf)
1747  pfree(a2p);
1748  }
1749 
1750  return result;
1751 }
1752 
1753 /* text_cmp()
1754  * Internal comparison function for text strings.
1755  * Returns -1, 0 or 1
1756  */
1757 static int
1758 text_cmp(text *arg1, text *arg2, Oid collid)
1759 {
1760  char *a1p,
1761  *a2p;
1762  int len1,
1763  len2;
1764 
1765  a1p = VARDATA_ANY(arg1);
1766  a2p = VARDATA_ANY(arg2);
1767 
1768  len1 = VARSIZE_ANY_EXHDR(arg1);
1769  len2 = VARSIZE_ANY_EXHDR(arg2);
1770 
1771  return varstr_cmp(a1p, len1, a2p, len2, collid);
1772 }
1773 
1774 /*
1775  * Comparison functions for text strings.
1776  *
1777  * Note: btree indexes need these routines not to leak memory; therefore,
1778  * be careful to free working copies of toasted datums. Most places don't
1779  * need to be so careful.
1780  */
1781 
1782 Datum
1784 {
1785  Oid collid = PG_GET_COLLATION();
1786  bool result;
1787 
1788  check_collation_set(collid);
1789 
1790  if (lc_collate_is_c(collid) ||
1791  collid == DEFAULT_COLLATION_OID ||
1792  pg_newlocale_from_collation(collid)->deterministic)
1793  {
1794  Datum arg1 = PG_GETARG_DATUM(0);
1795  Datum arg2 = PG_GETARG_DATUM(1);
1796  Size len1,
1797  len2;
1798 
1799  /*
1800  * Since we only care about equality or not-equality, we can avoid all
1801  * the expense of strcoll() here, and just do bitwise comparison. In
1802  * fact, we don't even have to do a bitwise comparison if we can show
1803  * the lengths of the strings are unequal; which might save us from
1804  * having to detoast one or both values.
1805  */
1806  len1 = toast_raw_datum_size(arg1);
1807  len2 = toast_raw_datum_size(arg2);
1808  if (len1 != len2)
1809  result = false;
1810  else
1811  {
1812  text *targ1 = DatumGetTextPP(arg1);
1813  text *targ2 = DatumGetTextPP(arg2);
1814 
1815  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1816  len1 - VARHDRSZ) == 0);
1817 
1818  PG_FREE_IF_COPY(targ1, 0);
1819  PG_FREE_IF_COPY(targ2, 1);
1820  }
1821  }
1822  else
1823  {
1824  text *arg1 = PG_GETARG_TEXT_PP(0);
1825  text *arg2 = PG_GETARG_TEXT_PP(1);
1826 
1827  result = (text_cmp(arg1, arg2, collid) == 0);
1828 
1829  PG_FREE_IF_COPY(arg1, 0);
1830  PG_FREE_IF_COPY(arg2, 1);
1831  }
1832 
1833  PG_RETURN_BOOL(result);
1834 }
1835 
1836 Datum
1838 {
1839  Oid collid = PG_GET_COLLATION();
1840  bool result;
1841 
1842  check_collation_set(collid);
1843 
1844  if (lc_collate_is_c(collid) ||
1845  collid == DEFAULT_COLLATION_OID ||
1846  pg_newlocale_from_collation(collid)->deterministic)
1847  {
1848  Datum arg1 = PG_GETARG_DATUM(0);
1849  Datum arg2 = PG_GETARG_DATUM(1);
1850  Size len1,
1851  len2;
1852 
1853  /* See comment in texteq() */
1854  len1 = toast_raw_datum_size(arg1);
1855  len2 = toast_raw_datum_size(arg2);
1856  if (len1 != len2)
1857  result = true;
1858  else
1859  {
1860  text *targ1 = DatumGetTextPP(arg1);
1861  text *targ2 = DatumGetTextPP(arg2);
1862 
1863  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1864  len1 - VARHDRSZ) != 0);
1865 
1866  PG_FREE_IF_COPY(targ1, 0);
1867  PG_FREE_IF_COPY(targ2, 1);
1868  }
1869  }
1870  else
1871  {
1872  text *arg1 = PG_GETARG_TEXT_PP(0);
1873  text *arg2 = PG_GETARG_TEXT_PP(1);
1874 
1875  result = (text_cmp(arg1, arg2, collid) != 0);
1876 
1877  PG_FREE_IF_COPY(arg1, 0);
1878  PG_FREE_IF_COPY(arg2, 1);
1879  }
1880 
1881  PG_RETURN_BOOL(result);
1882 }
1883 
1884 Datum
1886 {
1887  text *arg1 = PG_GETARG_TEXT_PP(0);
1888  text *arg2 = PG_GETARG_TEXT_PP(1);
1889  bool result;
1890 
1891  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1892 
1893  PG_FREE_IF_COPY(arg1, 0);
1894  PG_FREE_IF_COPY(arg2, 1);
1895 
1896  PG_RETURN_BOOL(result);
1897 }
1898 
1899 Datum
1901 {
1902  text *arg1 = PG_GETARG_TEXT_PP(0);
1903  text *arg2 = PG_GETARG_TEXT_PP(1);
1904  bool result;
1905 
1906  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1907 
1908  PG_FREE_IF_COPY(arg1, 0);
1909  PG_FREE_IF_COPY(arg2, 1);
1910 
1911  PG_RETURN_BOOL(result);
1912 }
1913 
1914 Datum
1916 {
1917  text *arg1 = PG_GETARG_TEXT_PP(0);
1918  text *arg2 = PG_GETARG_TEXT_PP(1);
1919  bool result;
1920 
1921  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1922 
1923  PG_FREE_IF_COPY(arg1, 0);
1924  PG_FREE_IF_COPY(arg2, 1);
1925 
1926  PG_RETURN_BOOL(result);
1927 }
1928 
1929 Datum
1931 {
1932  text *arg1 = PG_GETARG_TEXT_PP(0);
1933  text *arg2 = PG_GETARG_TEXT_PP(1);
1934  bool result;
1935 
1936  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1937 
1938  PG_FREE_IF_COPY(arg1, 0);
1939  PG_FREE_IF_COPY(arg2, 1);
1940 
1941  PG_RETURN_BOOL(result);
1942 }
1943 
1944 Datum
1946 {
1947  Datum arg1 = PG_GETARG_DATUM(0);
1948  Datum arg2 = PG_GETARG_DATUM(1);
1949  Oid collid = PG_GET_COLLATION();
1950  pg_locale_t mylocale = 0;
1951  bool result;
1952  Size len1,
1953  len2;
1954 
1955  check_collation_set(collid);
1956 
1957  if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1958  mylocale = pg_newlocale_from_collation(collid);
1959 
1960  if (mylocale && !mylocale->deterministic)
1961  ereport(ERROR,
1962  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1963  errmsg("nondeterministic collations are not supported for substring searches")));
1964 
1965  len1 = toast_raw_datum_size(arg1);
1966  len2 = toast_raw_datum_size(arg2);
1967  if (len2 > len1)
1968  result = false;
1969  else
1970  {
1971  text *targ1 = text_substring(arg1, 1, len2, false);
1972  text *targ2 = DatumGetTextPP(arg2);
1973 
1974  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1975  VARSIZE_ANY_EXHDR(targ2)) == 0);
1976 
1977  PG_FREE_IF_COPY(targ1, 0);
1978  PG_FREE_IF_COPY(targ2, 1);
1979  }
1980 
1981  PG_RETURN_BOOL(result);
1982 }
1983 
1984 Datum
1986 {
1987  text *arg1 = PG_GETARG_TEXT_PP(0);
1988  text *arg2 = PG_GETARG_TEXT_PP(1);
1989  int32 result;
1990 
1991  result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1992 
1993  PG_FREE_IF_COPY(arg1, 0);
1994  PG_FREE_IF_COPY(arg2, 1);
1995 
1996  PG_RETURN_INT32(result);
1997 }
1998 
1999 Datum
2001 {
2003  Oid collid = ssup->ssup_collation;
2004  MemoryContext oldcontext;
2005 
2006  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2007 
2008  /* Use generic string SortSupport */
2009  varstr_sortsupport(ssup, TEXTOID, collid);
2010 
2011  MemoryContextSwitchTo(oldcontext);
2012 
2013  PG_RETURN_VOID();
2014 }
2015 
2016 /*
2017  * Generic sortsupport interface for character type's operator classes.
2018  * Includes locale support, and support for BpChar semantics (i.e. removing
2019  * trailing spaces before comparison).
2020  *
2021  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
2022  * same representation. Callers that always use the C collation (e.g.
2023  * non-collatable type callers like bytea) may have NUL bytes in their strings;
2024  * this will not work with any other collation, though.
2025  */
2026 void
2028 {
2029  bool abbreviate = ssup->abbreviate;
2030  bool collate_c = false;
2031  VarStringSortSupport *sss;
2032  pg_locale_t locale = 0;
2033 
2034  check_collation_set(collid);
2035 
2036  /*
2037  * If possible, set ssup->comparator to a function which can be used to
2038  * directly compare two datums. If we can do this, we'll avoid the
2039  * overhead of a trip through the fmgr layer for every comparison, which
2040  * can be substantial.
2041  *
2042  * Most typically, we'll set the comparator to varlenafastcmp_locale,
2043  * which uses strcoll() to perform comparisons. We use that for the
2044  * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2045  * LC_COLLATE = C, we can make things quite a bit faster with
2046  * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2047  * memcmp() rather than strcoll().
2048  */
2049  if (lc_collate_is_c(collid))
2050  {
2051  if (typid == BPCHAROID)
2052  ssup->comparator = bpcharfastcmp_c;
2053  else if (typid == NAMEOID)
2054  {
2055  ssup->comparator = namefastcmp_c;
2056  /* Not supporting abbreviation with type NAME, for now */
2057  abbreviate = false;
2058  }
2059  else
2060  ssup->comparator = varstrfastcmp_c;
2061 
2062  collate_c = true;
2063  }
2064  else
2065  {
2066  /*
2067  * We need a collation-sensitive comparison. To make things faster,
2068  * we'll figure out the collation based on the locale id and cache the
2069  * result.
2070  */
2071  if (collid != DEFAULT_COLLATION_OID)
2072  locale = pg_newlocale_from_collation(collid);
2073 
2074  /*
2075  * There is a further exception on Windows. When the database
2076  * encoding is UTF-8 and we are not using the C collation, complex
2077  * hacks are required. We don't currently have a comparator that
2078  * handles that case, so we fall back on the slow method of having the
2079  * sort code invoke bttextcmp() (in the case of text) via the fmgr
2080  * trampoline. ICU locales work just the same on Windows, however.
2081  */
2082 #ifdef WIN32
2083  if (GetDatabaseEncoding() == PG_UTF8 &&
2084  !(locale && locale->provider == COLLPROVIDER_ICU))
2085  return;
2086 #endif
2087 
2088  /*
2089  * We use varlenafastcmp_locale except for type NAME.
2090  */
2091  if (typid == NAMEOID)
2092  {
2094  /* Not supporting abbreviation with type NAME, for now */
2095  abbreviate = false;
2096  }
2097  else
2099  }
2100 
2101  /*
2102  * Unfortunately, it seems that abbreviation for non-C collations is
2103  * broken on many common platforms; testing of multiple versions of glibc
2104  * reveals that, for many locales, strcoll() and strxfrm() do not return
2105  * consistent results, which is fatal to this optimization. While no
2106  * other libc other than Cygwin has so far been shown to have a problem,
2107  * we take the conservative course of action for right now and disable
2108  * this categorically. (Users who are certain this isn't a problem on
2109  * their system can define TRUST_STRXFRM.)
2110  *
2111  * Even apart from the risk of broken locales, it's possible that there
2112  * are platforms where the use of abbreviated keys should be disabled at
2113  * compile time. Having only 4 byte datums could make worst-case
2114  * performance drastically more likely, for example. Moreover, macOS's
2115  * strxfrm() implementation is known to not effectively concentrate a
2116  * significant amount of entropy from the original string in earlier
2117  * transformed blobs. It's possible that other supported platforms are
2118  * similarly encumbered. So, if we ever get past disabling this
2119  * categorically, we may still want or need to disable it for particular
2120  * platforms.
2121  */
2122 #ifndef TRUST_STRXFRM
2123  if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2124  abbreviate = false;
2125 #endif
2126 
2127  /*
2128  * If we're using abbreviated keys, or if we're using a locale-aware
2129  * comparison, we need to initialize a VarStringSortSupport object. Both
2130  * cases will make use of the temporary buffers we initialize here for
2131  * scratch space (and to detect requirement for BpChar semantics from
2132  * caller), and the abbreviation case requires additional state.
2133  */
2134  if (abbreviate || !collate_c)
2135  {
2136  sss = palloc(sizeof(VarStringSortSupport));
2137  sss->buf1 = palloc(TEXTBUFLEN);
2138  sss->buflen1 = TEXTBUFLEN;
2139  sss->buf2 = palloc(TEXTBUFLEN);
2140  sss->buflen2 = TEXTBUFLEN;
2141  /* Start with invalid values */
2142  sss->last_len1 = -1;
2143  sss->last_len2 = -1;
2144  /* Initialize */
2145  sss->last_returned = 0;
2146  sss->locale = locale;
2147 
2148  /*
2149  * To avoid somehow confusing a strxfrm() blob and an original string,
2150  * constantly keep track of the variety of data that buf1 and buf2
2151  * currently contain.
2152  *
2153  * Comparisons may be interleaved with conversion calls. Frequently,
2154  * conversions and comparisons are batched into two distinct phases,
2155  * but the correctness of caching cannot hinge upon this. For
2156  * comparison caching, buffer state is only trusted if cache_blob is
2157  * found set to false, whereas strxfrm() caching only trusts the state
2158  * when cache_blob is found set to true.
2159  *
2160  * Arbitrarily initialize cache_blob to true.
2161  */
2162  sss->cache_blob = true;
2163  sss->collate_c = collate_c;
2164  sss->typid = typid;
2165  ssup->ssup_extra = sss;
2166 
2167  /*
2168  * If possible, plan to use the abbreviated keys optimization. The
2169  * core code may switch back to authoritative comparator should
2170  * abbreviation be aborted.
2171  */
2172  if (abbreviate)
2173  {
2174  sss->prop_card = 0.20;
2175  initHyperLogLog(&sss->abbr_card, 10);
2176  initHyperLogLog(&sss->full_card, 10);
2177  ssup->abbrev_full_comparator = ssup->comparator;
2178  ssup->comparator = varstrcmp_abbrev;
2181  }
2182  }
2183 }
2184 
2185 /*
2186  * sortsupport comparison func (for C locale case)
2187  */
2188 static int
2190 {
2191  VarString *arg1 = DatumGetVarStringPP(x);
2192  VarString *arg2 = DatumGetVarStringPP(y);
2193  char *a1p,
2194  *a2p;
2195  int len1,
2196  len2,
2197  result;
2198 
2199  a1p = VARDATA_ANY(arg1);
2200  a2p = VARDATA_ANY(arg2);
2201 
2202  len1 = VARSIZE_ANY_EXHDR(arg1);
2203  len2 = VARSIZE_ANY_EXHDR(arg2);
2204 
2205  result = memcmp(a1p, a2p, Min(len1, len2));
2206  if ((result == 0) && (len1 != len2))
2207  result = (len1 < len2) ? -1 : 1;
2208 
2209  /* We can't afford to leak memory here. */
2210  if (PointerGetDatum(arg1) != x)
2211  pfree(arg1);
2212  if (PointerGetDatum(arg2) != y)
2213  pfree(arg2);
2214 
2215  return result;
2216 }
2217 
2218 /*
2219  * sortsupport comparison func (for BpChar C locale case)
2220  *
2221  * BpChar outsources its sortsupport to this module. Specialization for the
2222  * varstr_sortsupport BpChar case, modeled on
2223  * internal_bpchar_pattern_compare().
2224  */
2225 static int
2227 {
2228  BpChar *arg1 = DatumGetBpCharPP(x);
2229  BpChar *arg2 = DatumGetBpCharPP(y);
2230  char *a1p,
2231  *a2p;
2232  int len1,
2233  len2,
2234  result;
2235 
2236  a1p = VARDATA_ANY(arg1);
2237  a2p = VARDATA_ANY(arg2);
2238 
2239  len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2240  len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2241 
2242  result = memcmp(a1p, a2p, Min(len1, len2));
2243  if ((result == 0) && (len1 != len2))
2244  result = (len1 < len2) ? -1 : 1;
2245 
2246  /* We can't afford to leak memory here. */
2247  if (PointerGetDatum(arg1) != x)
2248  pfree(arg1);
2249  if (PointerGetDatum(arg2) != y)
2250  pfree(arg2);
2251 
2252  return result;
2253 }
2254 
2255 /*
2256  * sortsupport comparison func (for NAME C locale case)
2257  */
2258 static int
2260 {
2261  Name arg1 = DatumGetName(x);
2262  Name arg2 = DatumGetName(y);
2263 
2264  return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2265 }
2266 
2267 /*
2268  * sortsupport comparison func (for locale case with all varlena types)
2269  */
2270 static int
2272 {
2273  VarString *arg1 = DatumGetVarStringPP(x);
2274  VarString *arg2 = DatumGetVarStringPP(y);
2275  char *a1p,
2276  *a2p;
2277  int len1,
2278  len2,
2279  result;
2280 
2281  a1p = VARDATA_ANY(arg1);
2282  a2p = VARDATA_ANY(arg2);
2283 
2284  len1 = VARSIZE_ANY_EXHDR(arg1);
2285  len2 = VARSIZE_ANY_EXHDR(arg2);
2286 
2287  result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2288 
2289  /* We can't afford to leak memory here. */
2290  if (PointerGetDatum(arg1) != x)
2291  pfree(arg1);
2292  if (PointerGetDatum(arg2) != y)
2293  pfree(arg2);
2294 
2295  return result;
2296 }
2297 
2298 /*
2299  * sortsupport comparison func (for locale case with NAME type)
      *
      * Passes each NAME's string and its strlen() to varstrfastcmp_locale()
      * along with ssup, and returns that function's result.
      *
      * NOTE(review): the declarator line between "static int" and "{" is absent
      * from this listing -- confirm the signature against the original file.
2300  */
2301 static int
2303 {
2304  Name arg1 = DatumGetName(x);
2305  Name arg2 = DatumGetName(y);
2306 
2307  return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2308  NameStr(*arg2), strlen(NameStr(*arg2)),
2309  ssup);
2310 }
2311 
2312 /*
2313  * sortsupport comparison func for locale cases
      *
      * a1p/len1 and a2p/len2 are the two strings as pointer plus byte length;
      * the function copies them into sss->buf1/buf2 and NUL-terminates the
      * copies itself.  ssup supplies the memory context (ssup_cxt) used when
      * those buffers have to be enlarged.
      *
      * Returns 0 straight away for bytewise-identical inputs.  Otherwise returns
      * the collation result (ICU, strcoll_l() or strcoll()), falling back to
      * strcmp() on a collation tie unless the locale is nondeterministic.  The
      * result is remembered in sss->last_returned so that a repeat of the same
      * pair can be answered without collating again.
      *
      * NOTE(review): listing line 2318 is absent below; "sss", used throughout
      * the body, is presumably declared there -- confirm against the original
      * file.
2314  */
2315 static int
2316 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2317 {
2319  int result;
2320  bool arg1_match;
2321 
2322  /* Fast pre-check for equality, as discussed in varstr_cmp() */
2323  if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2324  {
2325  /*
2326  * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2327  * last_len2. Existing contents of buffers might still be used by
2328  * next call.
2329  *
2330  * It's fine to allow the comparison of BpChar padding bytes here,
2331  * even though that implies that the memcmp() will usually be
2332  * performed for BpChar callers (though multibyte characters could
2333  * still prevent that from occurring). The memcmp() is still very
2334  * cheap, and BpChar's funny semantics have us remove trailing spaces
2335  * (not limited to padding), so we need make no distinction between
2336  * padding space characters and "real" space characters.
2337  */
2338  return 0;
2339  }
2340 
2341  if (sss->typid == BPCHAROID)
2342  {
2343  /* Get true number of bytes, ignoring trailing spaces */
2344  len1 = bpchartruelen(a1p, len1);
2345  len2 = bpchartruelen(a2p, len2);
2346  }
2347 
      /*
       * Make sure each buffer can hold its string plus a terminating NUL.  The
       * new size is the larger of len + 1 and double the old size (the doubling
       * being capped at MaxAllocSize).  Old contents are discarded, not copied.
       */
2348  if (len1 >= sss->buflen1)
2349  {
2350  pfree(sss->buf1);
2351  sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2352  sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2353  }
2354  if (len2 >= sss->buflen2)
2355  {
2356  pfree(sss->buf2);
2357  sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2358  sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2359  }
2360 
2361  /*
2362  * We're likely to be asked to compare the same strings repeatedly, and
2363  * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2364  * comparisons, even though in general there is no reason to think that
2365  * that will work out (every string datum may be unique). Caching does
2366  * not slow things down measurably when it doesn't work out, and can speed
2367  * things up by rather a lot when it does. In part, this is because the
2368  * memcmp() compares data from cachelines that are needed in L1 cache even
2369  * when the last comparison's result cannot be reused.
2370  */
2371  arg1_match = true;
2372  if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2373  {
2374  arg1_match = false;
2375  memcpy(sss->buf1, a1p, len1);
2376  sss->buf1[len1] = '\0';
2377  sss->last_len1 = len1;
2378  }
2379 
2380  /*
2381  * If we're comparing the same two strings as last time, we can return the
2382  * same answer without calling strcoll() again. This is more likely than
2383  * it seems (at least with moderate to low cardinality sets), because
2384  * quicksort compares the same pivot against many values.
2385  */
2386  if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2387  {
2388  memcpy(sss->buf2, a2p, len2);
2389  sss->buf2[len2] = '\0';
2390  sss->last_len2 = len2;
2391  }
      /* The cached answer is used only while cache_blob is false; it is cleared at the bottom of this function. */
2392  else if (arg1_match && !sss->cache_blob)
2393  {
2394  /* Use result cached following last actual strcoll() call */
2395  return sss->last_returned;
2396  }
2397 
2398  if (sss->locale)
2399  {
2400  if (sss->locale->provider == COLLPROVIDER_ICU)
2401  {
2402 #ifdef USE_ICU
2403 #ifdef HAVE_UCOL_STRCOLLUTF8
2404  if (GetDatabaseEncoding() == PG_UTF8)
2405  {
2406  UErrorCode status;
2407 
      /* This path collates a1p/a2p directly, using explicit lengths. */
2408  status = U_ZERO_ERROR;
2409  result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
2410  a1p, len1,
2411  a2p, len2,
2412  &status);
2413  if (U_FAILURE(status))
2414  ereport(ERROR,
2415  (errmsg("collation failed: %s", u_errorName(status))));
2416  }
2417  else
2418 #endif
2419  {
2420  int32_t ulen1,
2421  ulen2;
2422  UChar *uchar1,
2423  *uchar2;
2424 
      /* Convert both inputs with icu_to_uchar(); the results are freed below. */
2425  ulen1 = icu_to_uchar(&uchar1, a1p, len1);
2426  ulen2 = icu_to_uchar(&uchar2, a2p, len2);
2427 
2428  result = ucol_strcoll(sss->locale->info.icu.ucol,
2429  uchar1, ulen1,
2430  uchar2, ulen2);
2431 
2432  pfree(uchar1);
2433  pfree(uchar2);
2434  }
2435 #else /* not USE_ICU */
2436  /* shouldn't happen */
2437  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2438 #endif /* not USE_ICU */
2439  }
2440  else
2441  {
2442 #ifdef HAVE_LOCALE_T
2443  result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
2444 #else
2445  /* shouldn't happen */
2446  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2447 #endif
2448  }
2449  }
2450  else
2451  result = strcoll(sss->buf1, sss->buf2);
2452 
2453  /* Break tie if necessary. */
      /* The strcmp() tie-break is skipped when a locale is set and it is not deterministic. */
2454  if (result == 0 &&
2455  (!sss->locale || sss->locale->deterministic))
2456  result = strcmp(sss->buf1, sss->buf2);
2457 
2458  /* Cache result, perhaps saving an expensive strcoll() call next time */
2459  sss->cache_blob = false;
2460  sss->last_returned = result;
2461  return result;
2462 }
2463 
2464 /*
2465  * Abbreviated key comparison func
      *
      * Compares x and y directly as values and returns 1, 0 or -1.
      *
      * NOTE(review): the declarator line between "static int" and "{" is absent
      * from this listing -- confirm the signature against the original file.
2466  */
2467 static int
2469 {
2470  /*
2471  * When 0 is returned, the core system will call varstrfastcmp_c()
2472  * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale(). Even a
2473  * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2474  * authoritatively, for the same reason that there is a strcoll()
2475  * tie-breaker call to strcmp() in varstr_cmp().
2476  */
2477  if (x > y)
2478  return 1;
2479  else if (x == y)
2480  return 0;
2481  else
2482  return -1;
2483 }
2484 
2485 /*
2486  * Conversion routine for sortsupport. Converts original to abbreviated key
2487  * representation. Our encoding strategy is simple -- pack the first 8 bytes
2488  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2489  * stored in reverse order), and treat it as an unsigned integer. When the "C"
2490  * locale is used, or in case of bytea, just memcpy() from original instead.
      *
      * Besides building the key, every call that does not reuse the cached blob
      * feeds a hash of the original string and a hash of the abbreviated key
      * into the sss->full_card and sss->abbr_card HyperLogLog states.
      *
      * NOTE(review): listing lines 2493 (declarator), 2495 and 2585 are absent
      * below.  Judging by their uses, "original" and "sss" are introduced on the
      * first two, and 2585 completes the condition guarding the icu_to_uchar()
      * call -- confirm against the original file.
2491  */
2492 static Datum
2494 {
2496  VarString *authoritative = DatumGetVarStringPP(original);
2497  char *authoritative_data = VARDATA_ANY(authoritative);
2498 
2499  /* working state */
2500  Datum res;
2501  char *pres;
2502  int len;
2503  uint32 hash;
2504 
2505  pres = (char *) &res;
2506  /* memset(), so any non-overwritten bytes are NUL */
2507  memset(pres, 0, sizeof(Datum));
2508  len = VARSIZE_ANY_EXHDR(authoritative);
2509 
2510  /* Get number of bytes, ignoring trailing spaces */
2511  if (sss->typid == BPCHAROID)
2512  len = bpchartruelen(authoritative_data, len);
2513 
2514  /*
2515  * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2516  * abbreviate keys. The full comparator for the C locale is always
2517  * memcmp(). It would be incorrect to allow bytea callers (callers that
2518  * always force the C collation -- bytea isn't a collatable type, but this
2519  * approach is convenient) to use strxfrm(). This is because bytea
2520  * strings may contain NUL bytes. Besides, this should be faster, too.
2521  *
2522  * More generally, it's okay that bytea callers can have NUL bytes in
2523  * strings because varstrcmp_abbrev() need not make a distinction between
2524  * terminating NUL bytes, and NUL bytes representing actual NULs in the
2525  * authoritative representation. Hopefully a comparison at or past one
2526  * abbreviated key's terminating NUL byte will resolve the comparison
2527  * without consulting the authoritative representation; specifically, some
2528  * later non-NUL byte in the longer string can resolve the comparison
2529  * against a subsequent terminating NUL in the shorter string. There will
2530  * usually be what is effectively a "length-wise" resolution there and
2531  * then.
2532  *
2533  * If that doesn't work out -- if all bytes in the longer string
2534  * positioned at or past the offset of the smaller string's (first)
2535  * terminating NUL are actually representative of NUL bytes in the
2536  * authoritative binary string (perhaps with some *terminating* NUL bytes
2537  * towards the end of the longer string iff it happens to still be small)
2538  * -- then an authoritative tie-breaker will happen, and do the right
2539  * thing: explicitly consider string length.
2540  */
2541  if (sss->collate_c)
2542  memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2543  else
2544  {
2545  Size bsize;
2546 #ifdef USE_ICU
2547  int32_t ulen = -1;
2548  UChar *uchar = NULL;
2549 #endif
2550 
2551  /*
2552  * We're not using the C collation, so fall back on strxfrm or ICU
2553  * analogs.
2554  */
2555 
2556  /* By convention, we use buffer 1 to store and NUL-terminate */
      /* Grow to the larger of len + 1 and double the old size (doubling capped at MaxAllocSize). */
2557  if (len >= sss->buflen1)
2558  {
2559  pfree(sss->buf1);
2560  sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2561  sss->buf1 = palloc(sss->buflen1);
2562  }
2563 
2564  /* Might be able to reuse strxfrm() blob from last call */
2565  if (sss->last_len1 == len && sss->cache_blob &&
2566  memcmp(sss->buf1, authoritative_data, len) == 0)
2567  {
2568  memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2569  /* No change affecting cardinality, so no hashing required */
2570  goto done;
2571  }
2572 
2573  memcpy(sss->buf1, authoritative_data, len);
2574 
2575  /*
2576  * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2577  * necessary for ICU, but doesn't hurt.
2578  */
2579  sss->buf1[len] = '\0';
2580  sss->last_len1 = len;
2581 
2582 #ifdef USE_ICU
2583  /* When using ICU and not UTF8, convert string to UChar. */
      /* NOTE(review): the final operand of this condition (listing line 2585) is absent from this listing. */
2584  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2586  ulen = icu_to_uchar(&uchar, sss->buf1, len);
2587 #endif
2588 
2589  /*
2590  * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2591  * and try again. Both of these functions have the result buffer
2592  * content undefined if the result did not fit, so we need to retry
2593  * until everything fits, even though we only need the first few bytes
2594  * in the end. When using ucol_nextSortKeyPart(), however, we only
2595  * ask for as many bytes as we actually need.
2596  */
2597  for (;;)
2598  {
2599 #ifdef USE_ICU
2600  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2601  {
2602  /*
2603  * When using UTF8, use the iteration interface so we only
2604  * need to produce as many bytes as we actually need.
2605  */
2606  if (GetDatabaseEncoding() == PG_UTF8)
2607  {
2608  UCharIterator iter;
2609  uint32_t state[2];
2610  UErrorCode status;
2611 
2612  uiter_setUTF8(&iter, sss->buf1, len);
2613  state[0] = state[1] = 0; /* won't need that again */
2614  status = U_ZERO_ERROR;
2615  bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2616  &iter,
2617  state,
2618  (uint8_t *) sss->buf2,
2619  Min(sizeof(Datum), sss->buflen2),
2620  &status);
2621  if (U_FAILURE(status))
2622  ereport(ERROR,
2623  (errmsg("sort key generation failed: %s",
2624  u_errorName(status))));
2625  }
2626  else
2627  bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2628  uchar, ulen,
2629  (uint8_t *) sss->buf2, sss->buflen2);
2630  }
2631  else
2632 #endif
2633 #ifdef HAVE_LOCALE_T
2634  if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2635  bsize = strxfrm_l(sss->buf2, sss->buf1,
2636  sss->buflen2, sss->locale->info.lt);
2637  else
2638 #endif
2639  bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2640 
      /* Remember the blob length; the loop ends once the blob fit into buf2. */
2641  sss->last_len2 = bsize;
2642  if (bsize < sss->buflen2)
2643  break;
2644 
2645  /*
2646  * Grow buffer and retry.
2647  */
2648  pfree(sss->buf2);
2649  sss->buflen2 = Max(bsize + 1,
2650  Min(sss->buflen2 * 2, MaxAllocSize));
2651  sss->buf2 = palloc(sss->buflen2);
2652  }
2653 
2654  /*
2655  * Every Datum byte is always compared. This is safe because the
2656  * strxfrm() blob is itself NUL terminated, leaving no danger of
2657  * misinterpreting any NUL bytes not intended to be interpreted as
2658  * logically representing termination.
2659  *
2660  * (Actually, even if there were NUL bytes in the blob it would be
2661  * okay. See remarks on bytea case above.)
2662  */
2663  memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2664 
2665 #ifdef USE_ICU
2666  if (uchar)
2667  pfree(uchar);
2668 #endif
2669  }
2670 
2671  /*
2672  * Maintain approximate cardinality of both abbreviated keys and original,
2673  * authoritative keys using HyperLogLog. Used as cheap insurance against
2674  * the worst case, where we do many string transformations for no saving
2675  * in full strcoll()-based comparisons. These statistics are used by
2676  * varstr_abbrev_abort().
2677  *
2678  * First, Hash key proper, or a significant fraction of it. Mix in length
2679  * in order to compensate for cases where differences are past
2680  * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2681  */
2682  hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2683  Min(len, PG_CACHE_LINE_SIZE)));
2684 
2685  if (len > PG_CACHE_LINE_SIZE)
2686  hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2687 
2688  addHyperLogLog(&sss->full_card, hash);
2689 
2690  /* Hash abbreviated key */
2691 #if SIZEOF_DATUM == 8
2692  {
2693  uint32 lohalf,
2694  hihalf;
2695 
      /* Fold the two 32-bit halves of the 8-byte Datum together before hashing. */
2696  lohalf = (uint32) res;
2697  hihalf = (uint32) (res >> 32);
2698  hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2699  }
2700 #else /* SIZEOF_DATUM != 8 */
2701  hash = DatumGetUInt32(hash_uint32((uint32) res));
2702 #endif
2703 
2704  addHyperLogLog(&sss->abbr_card, hash);
2705 
2706  /* Cache result, perhaps saving an expensive strxfrm() call next time */
2707  sss->cache_blob = true;
2708 done:
      /* Also reached by goto when the cached blob was reused, which skips the cardinality bookkeeping above. */
2709 
2710  /*
2711  * Byteswap on little-endian machines.
2712  *
2713  * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2714  * comparator) works correctly on all platforms. If we didn't do this,
2715  * the comparator would have to call memcmp() with a pair of pointers to
2716  * the first byte of each abbreviated key, which is slower.
2717  */
2718  res = DatumBigEndianToNative(res);
2719 
2720  /* Don't leak memory here */
2721  if (PointerGetDatum(authoritative) != original)
2722  pfree(authoritative);
2723 
2724  return res;
2725 }
2726 
2727 /*
2728  * Callback for estimating effectiveness of abbreviated key optimization, using
2729  * heuristic rules. Returns value indicating if the abbreviation optimization
2730  * should be aborted, based on its projected effectiveness.
      *
      * memtupcount is compared against the 100 and 10000 thresholds below and
      * is reported in trace output.  The decision compares the HyperLogLog
      * estimate for abbreviated keys against the estimate for full keys scaled
      * by sss->prop_card.  Returns true to abort abbreviation, false to keep
      * going; on the "keep going" path with memtupcount > 10000, prop_card is
      * multiplied by 0.65 as a side effect.
      *
      * NOTE(review): listing line 2735 is absent below; "sss" is presumably
      * declared there -- confirm against the original file.
2731  */
2732 static bool
2733 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2734 {
2736  double abbrev_distinct,
2737  key_distinct;
2738 
2739  Assert(ssup->abbreviate);
2740 
2741  /* Have a little patience */
2742  if (memtupcount < 100)
2743  return false;
2744 
2745  abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2746  key_distinct = estimateHyperLogLog(&sss->full_card);
2747 
2748  /*
2749  * Clamp cardinality estimates to at least one distinct value. While
2750  * NULLs are generally disregarded, if only NULL values were seen so far,
2751  * that might misrepresent costs if we failed to clamp.
2752  */
2753  if (abbrev_distinct <= 1.0)
2754  abbrev_distinct = 1.0;
2755 
2756  if (key_distinct <= 1.0)
2757  key_distinct = 1.0;
2758 
2759  /*
2760  * In the worst case all abbreviated keys are identical, while at the same
2761  * time there are differences within full key strings not captured in
2762  * abbreviations.
2763  */
2764 #ifdef TRACE_SORT
2765  if (trace_sort)
2766  {
2767  double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2768 
2769  elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2770  "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2771  memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2772  sss->prop_card);
2773  }
2774 #endif
2775 
2776  /*
2777  * If the number of distinct abbreviated keys approximately matches the
2778  * number of distinct authoritative original keys, that's reason enough to
2779  * proceed. We can win even with a very low cardinality set if most
2780  * tie-breakers only memcmp(). This is by far the most important
2781  * consideration.
2782  *
2783  * While comparisons that are resolved at the abbreviated key level are
2784  * considerably cheaper than tie-breakers resolved with memcmp(), both of
2785  * those two outcomes are so much cheaper than a full strcoll() once
2786  * sorting is underway that it doesn't seem worth it to weigh abbreviated
2787  * cardinality against the overall size of the set in order to more
2788  * accurately model costs. Assume that an abbreviated comparison, and an
2789  * abbreviated comparison with a cheap memcmp()-based authoritative
2790  * resolution are equivalent.
2791  */
2792  if (abbrev_distinct > key_distinct * sss->prop_card)
2793  {
2794  /*
2795  * When we have exceeded 10,000 tuples, decay required cardinality
2796  * aggressively for next call.
2797  *
2798  * This is useful because the number of comparisons required on
2799  * average increases at a linearithmic rate, and at roughly 10,000
2800  * tuples that factor will start to dominate over the linear costs of
2801  * string transformation (this is a conservative estimate). The decay
2802  * rate is chosen to be a little less aggressive than halving -- which
2803  * (since we're called at points at which memtupcount has doubled)
2804  * would never see the cost model actually abort past the first call
2805  * following a decay. This decay rate is mostly a precaution against
2806  * a sudden, violent swing in how well abbreviated cardinality tracks
2807  * full key cardinality. The decay also serves to prevent a marginal
2808  * case from being aborted too late, when too much has already been
2809  * invested in string transformation.
2810  *
2811  * It's possible for sets of several million distinct strings with
2812  * mere tens of thousands of distinct abbreviated keys to still
2813  * benefit very significantly. This will generally occur provided
2814  * each abbreviated key is a proxy for a roughly uniform number of the
2815  * set's full keys. If it isn't so, we hope to catch that early and
2816  * abort. If it isn't caught early, by the time the problem is
2817  * apparent it's probably not worth aborting.
2818  */
2819  if (memtupcount > 10000)
2820  sss->prop_card *= 0.65;
2821 
2822  return false;
2823  }
2824 
2825  /*
2826  * Abort abbreviation strategy.
2827  *
2828  * The worst case, where all abbreviated keys are identical while all
2829  * original strings differ will typically only see a regression of about
2830  * 10% in execution time for small to medium sized lists of strings.
2831  * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2832  * often expect very large improvements, particularly with sets of strings
2833  * of moderately high to high abbreviated cardinality. There is little to
2834  * lose but much to gain, which our strategy reflects.
2835  */
2836 #ifdef TRACE_SORT
2837  if (trace_sort)
2838  elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2839  "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2840  memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2841 #endif
2842 
2843  return true;
2844 }
2845 
2846 /*
2847  * Generic equalimage support function for character type's operator classes.
2848  * Disables the use of deduplication with nondeterministic collations.
      *
      * Returns true when the call's collation is "C" (per lc_collate_is_c()),
      * is DEFAULT_COLLATION_OID, or satisfies the third operand noted below;
      * false otherwise.
      *
      * NOTE(review): listing lines 2851 (declarator) and 2860 (the last operand
      * of the || chain) are absent from this listing -- confirm against the
      * original file.
2849  */
2850 Datum
2852 {
2853  /* Oid opcintype = PG_GETARG_OID(0); */
2854  Oid collid = PG_GET_COLLATION();
2855 
2856  check_collation_set(collid);
2857 
2858  if (lc_collate_is_c(collid) ||
2859  collid == DEFAULT_COLLATION_OID ||
2861  PG_RETURN_BOOL(true);
2862  else
2863  PG_RETURN_BOOL(false);
2864 }
2865 
/*
 * Returns whichever of the two text arguments compares greater under the
 * call's collation, using text_cmp(); the second argument is returned when
 * they compare equal.
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
2866 Datum
2868 {
2869  text *arg1 = PG_GETARG_TEXT_PP(0);
2870  text *arg2 = PG_GETARG_TEXT_PP(1);
2871  text *result;
2872 
2873  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2874 
2875  PG_RETURN_TEXT_P(result);
2876 }
2877 
/*
 * Returns whichever of the two text arguments compares smaller under the
 * call's collation, using text_cmp(); the second argument is returned when
 * they compare equal.
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
2878 Datum
2880 {
2881  text *arg1 = PG_GETARG_TEXT_PP(0);
2882  text *arg2 = PG_GETARG_TEXT_PP(1);
2883  text *result;
2884 
2885  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2886 
2887  PG_RETURN_TEXT_P(result);
2888 }
2889 
2890 
2891 /*
2892  * Cross-type comparison functions for types text and name.
2893  */
2894 
/*
 * Equality test between a NAME (argument 0) and a text (argument 1);
 * returns a boolean.
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
2895 Datum
2897 {
2898  Name arg1 = PG_GETARG_NAME(0);
2899  text *arg2 = PG_GETARG_TEXT_PP(1);
2900  size_t len1 = strlen(NameStr(*arg1));
2901  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2902  Oid collid = PG_GET_COLLATION();
2903  bool result;
2904 
2905  check_collation_set(collid);
2906 
      /* With the C collation a length check plus memcmp() decides equality; otherwise defer to varstr_cmp(). */
2907  if (collid == C_COLLATION_OID)
2908  result = (len1 == len2 &&
2909  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2910  else
2911  result = (varstr_cmp(NameStr(*arg1), len1,
2912  VARDATA_ANY(arg2), len2,
2913  collid) == 0);
2914 
2915  PG_FREE_IF_COPY(arg2, 1);
2916 
2917  PG_RETURN_BOOL(result);
2918 }
2919 
/*
 * Equality test between a text (argument 0) and a NAME (argument 1);
 * returns a boolean.
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
2920 Datum
2922 {
2923  text *arg1 = PG_GETARG_TEXT_PP(0);
2924  Name arg2 = PG_GETARG_NAME(1);
2925  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2926  size_t len2 = strlen(NameStr(*arg2));
2927  Oid collid = PG_GET_COLLATION();
2928  bool result;
2929 
2930  check_collation_set(collid);
2931 
      /* With the C collation a length check plus memcmp() decides equality; otherwise defer to varstr_cmp(). */
2932  if (collid == C_COLLATION_OID)
2933  result = (len1 == len2 &&
2934  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2935  else
2936  result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2937  NameStr(*arg2), len2,
2938  collid) == 0);
2939 
2940  PG_FREE_IF_COPY(arg1, 0);
2941 
2942  PG_RETURN_BOOL(result);
2943 }
2944 
/*
 * Inequality test between a NAME (argument 0) and a text (argument 1);
 * returns a boolean that is the negation of the equality test.
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
2945 Datum
2947 {
2948  Name arg1 = PG_GETARG_NAME(0);
2949  text *arg2 = PG_GETARG_TEXT_PP(1);
2950  size_t len1 = strlen(NameStr(*arg1));
2951  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2952  Oid collid = PG_GET_COLLATION();
2953  bool result;
2954 
2955  check_collation_set(collid);
2956 
      /* With the C collation a length check plus memcmp() decides; otherwise defer to varstr_cmp(). */
2957  if (collid == C_COLLATION_OID)
2958  result = !(len1 == len2 &&
2959  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2960  else
2961  result = !(varstr_cmp(NameStr(*arg1), len1,
2962  VARDATA_ANY(arg2), len2,
2963  collid) == 0);
2964 
2965  PG_FREE_IF_COPY(arg2, 1);
2966 
2967  PG_RETURN_BOOL(result);
2968 }
2969 
/*
 * Inequality test between a text (argument 0) and a NAME (argument 1);
 * returns a boolean that is the negation of the equality test.
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
2970 Datum
2972 {
2973  text *arg1 = PG_GETARG_TEXT_PP(0);
2974  Name arg2 = PG_GETARG_NAME(1);
2975  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2976  size_t len2 = strlen(NameStr(*arg2));
2977  Oid collid = PG_GET_COLLATION();
2978  bool result;
2979 
2980  check_collation_set(collid);
2981 
      /* With the C collation a length check plus memcmp() decides; otherwise defer to varstr_cmp(). */
2982  if (collid == C_COLLATION_OID)
2983  result = !(len1 == len2 &&
2984  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2985  else
2986  result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2987  NameStr(*arg2), len2,
2988  collid) == 0);
2989 
2990  PG_FREE_IF_COPY(arg1, 0);
2991 
2992  PG_RETURN_BOOL(result);
2993 }
2994 
/*
 * Three-way comparison of a NAME (argument 0) against a text (argument 1):
 * returns varstr_cmp()'s result, computed under the call's collation, as an
 * int32.
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
2995 Datum
2997 {
2998  Name arg1 = PG_GETARG_NAME(0);
2999  text *arg2 = PG_GETARG_TEXT_PP(1);
3000  int32 result;
3001 
3002  result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
3003  VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
3004  PG_GET_COLLATION());
3005 
3006  PG_FREE_IF_COPY(arg2, 1);
3007 
3008  PG_RETURN_INT32(result);
3009 }
3010 
/*
 * Three-way comparison of a text (argument 0) against a NAME (argument 1):
 * returns varstr_cmp()'s result, computed under the call's collation, as an
 * int32.
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
3011 Datum
3013 {
3014  text *arg1 = PG_GETARG_TEXT_PP(0);
3015  Name arg2 = PG_GETARG_NAME(1);
3016  int32 result;
3017 
3018  result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
3019  NameStr(*arg2), strlen(NameStr(*arg2)),
3020  PG_GET_COLLATION());
3021 
3022  PG_FREE_IF_COPY(arg1, 0);
3023 
3024  PG_RETURN_INT32(result);
3025 }
3026 
/*
 * CmpCall(cmpfunc) invokes cmpfunc through DirectFunctionCall2Coll() with
 * the current call's collation and its first two argument datums, and
 * converts the returned datum to an int32.  The macro is undefined again
 * after the definitions that follow.
 *
 * NOTE(review): each of the eight "Datum { }" definitions below has lost its
 * declarator line and its one-line body in this listing (for example listing
 * lines 3034 and 3036); presumably each one uses CmpCall -- confirm against
 * the original file.
 */
3027 #define CmpCall(cmpfunc) \
3028  DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
3029  PG_GET_COLLATION(), \
3030  PG_GETARG_DATUM(0), \
3031  PG_GETARG_DATUM(1)))
3032 
3033 Datum
3035 {
3037 }
3038 
3039 Datum
3041 {
3043 }
3044 
3045 Datum
3047 {
3049 }
3050 
3051 Datum
3053 {
3055 }
3056 
3057 Datum
3059 {
3061 }
3062 
3063 Datum
3065 {
3067 }
3068 
3069 Datum
3071 {
3073 }
3074 
3075 Datum
3077 {
3079 }
3080 
3081 #undef CmpCall
3082 
3083 
3084 /*
3085  * The following operators support character-by-character comparison
3086  * of text datums, to allow building indexes suitable for LIKE clauses.
3087  * Note that the regular texteq/textne comparison operators, and regular
3088  * support functions 1 and 2 with "C" collation are assumed to be
3089  * compatible with these!
3090  */
3091 
/*
 * Bytewise three-way comparison of arg1 and arg2: memcmp() over the shorter
 * of the two lengths, with the shorter string sorting first when that common
 * prefix is equal.  Returns memcmp()'s nonzero result, or -1, 1 or 0.
 *
 * NOTE(review): the declarator line between "static int" and "{" is absent
 * from this listing -- confirm the signature against the original file.
 */
3092 static int
3094 {
3095  int result;
3096  int len1,
3097  len2;
3098 
3099  len1 = VARSIZE_ANY_EXHDR(arg1);
3100  len2 = VARSIZE_ANY_EXHDR(arg2);
3101 
3102  result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3103  if (result != 0)
3104  return result;
3105  else if (len1 < len2)
3106  return -1;
3107  else if (len1 > len2)
3108  return 1;
3109  else
3110  return 0;
3111 }
3112 
3113 
/*
 * Returns true when the first text argument sorts before the second
 * according to internal_text_pattern_compare().
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
3114 Datum
3116 {
3117  text *arg1 = PG_GETARG_TEXT_PP(0);
3118  text *arg2 = PG_GETARG_TEXT_PP(1);
3119  int result;
3120 
3121  result = internal_text_pattern_compare(arg1, arg2);
3122 
3123  PG_FREE_IF_COPY(arg1, 0);
3124  PG_FREE_IF_COPY(arg2, 1);
3125 
3126  PG_RETURN_BOOL(result < 0);
3127 }
3128 
3129 
/*
 * Returns true when the first text argument sorts before or equal to the
 * second according to internal_text_pattern_compare().
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
3130 Datum
3132 {
3133  text *arg1 = PG_GETARG_TEXT_PP(0);
3134  text *arg2 = PG_GETARG_TEXT_PP(1);
3135  int result;
3136 
3137  result = internal_text_pattern_compare(arg1, arg2);
3138 
3139  PG_FREE_IF_COPY(arg1, 0);
3140  PG_FREE_IF_COPY(arg2, 1);
3141 
3142  PG_RETURN_BOOL(result <= 0);
3143 }
3144 
3145 
/*
 * Returns true when the first text argument sorts after or equal to the
 * second according to internal_text_pattern_compare().
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
3146 Datum
3148 {
3149  text *arg1 = PG_GETARG_TEXT_PP(0);
3150  text *arg2 = PG_GETARG_TEXT_PP(1);
3151  int result;
3152 
3153  result = internal_text_pattern_compare(arg1, arg2);
3154 
3155  PG_FREE_IF_COPY(arg1, 0);
3156  PG_FREE_IF_COPY(arg2, 1);
3157 
3158  PG_RETURN_BOOL(result >= 0);
3159 }
3160 
3161 
/*
 * Returns true when the first text argument sorts after the second
 * according to internal_text_pattern_compare().
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
3162 Datum
3164 {
3165  text *arg1 = PG_GETARG_TEXT_PP(0);
3166  text *arg2 = PG_GETARG_TEXT_PP(1);
3167  int result;
3168 
3169  result = internal_text_pattern_compare(arg1, arg2);
3170 
3171  PG_FREE_IF_COPY(arg1, 0);
3172  PG_FREE_IF_COPY(arg2, 1);
3173 
3174  PG_RETURN_BOOL(result > 0);
3175 }
3176 
3177 
/*
 * Returns the three-way result of internal_text_pattern_compare() for the
 * two text arguments as an int32.
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
3178 Datum
3180 {
3181  text *arg1 = PG_GETARG_TEXT_PP(0);
3182  text *arg2 = PG_GETARG_TEXT_PP(1);
3183  int result;
3184 
3185  result = internal_text_pattern_compare(arg1, arg2);
3186 
3187  PG_FREE_IF_COPY(arg1, 0);
3188  PG_FREE_IF_COPY(arg2, 1);
3189 
3190  PG_RETURN_INT32(result);
3191 }
3192 
3193 
/*
 * Sets up sort support by calling varstr_sortsupport() for TEXTOID with
 * C_COLLATION_OID, with ssup->ssup_cxt as the current memory context for the
 * duration of that call.  Returns void.
 *
 * NOTE(review): listing lines 3195 (declarator) and 3197 are absent below;
 * "ssup" is presumably declared on the latter -- confirm against the
 * original file.
 */
3194 Datum
3196 {
3198  MemoryContext oldcontext;
3199 
3200  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3201 
3202  /* Use generic string SortSupport, forcing "C" collation */
3203  varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3204 
      /* Restore the caller's memory context before returning. */
3205  MemoryContextSwitchTo(oldcontext);
3206 
3207  PG_RETURN_VOID();
3208 }
3209 
3210 
3211 /*-------------------------------------------------------------
3212  * byteaoctetlen
3213  *
3214  * get the number of bytes contained in an instance of type 'bytea'
      *
      * NOTE(review): listing lines 3218 (declarator) and 3223 (the statement
      * that produces the result from "str") are absent from this listing --
      * confirm against the original file.
3215  *-------------------------------------------------------------
3216  */
3217 Datum
3219 {
3220  Datum str = PG_GETARG_DATUM(0);
3221 
3222  /* We need not detoast the input at all */
3224 }
3225 
3226 /*
3227  * byteacat -
3228  * takes two bytea* and returns a bytea* that is the concatenation of
3229  * the two.
3230  *
3231  * Cloned from textcat and modified as required.
      *
      * NOTE(review): listing lines 3234 (declarator) and 3239 (the statement
      * that combines t1 and t2 and returns the result) are absent from this
      * listing -- confirm against the original file.
3232  */
3233 Datum
3235 {
3236  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3237  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3238 
3240 }
3241 
3242 /*
3243  * bytea_catenate
3244  * Guts of byteacat(), broken out so it can be used by other functions
3245  *
3246  * Arguments can be in short-header form, but not compressed or out-of-line
      *
      * Returns a newly palloc'd bytea whose size is set to
      * len1 + len2 + VARHDRSZ and whose data is t1's bytes followed by t2's.
      *
      * NOTE(review): the declarator line between "static bytea *" and "{" is
      * absent from this listing; the body names its inputs t1 and t2 -- confirm
      * the signature against the original file.
3247  */
3248 static bytea *
3250 {
3251  bytea *result;
3252  int len1,
3253  len2,
3254  len;
3255  char *ptr;
3256 
3257  len1 = VARSIZE_ANY_EXHDR(t1);
3258  len2 = VARSIZE_ANY_EXHDR(t2);
3259 
3260  /* paranoia ... probably should throw error instead? */
3261  if (len1 < 0)
3262  len1 = 0;
3263  if (len2 < 0)
3264  len2 = 0;
3265 
3266  len = len1 + len2 + VARHDRSZ;
3267  result = (bytea *) palloc(len);
3268 
3269  /* Set size of result string... */
3270  SET_VARSIZE(result, len);
3271 
3272  /* Fill data field of result string... */
      /* Inputs are read with VARDATA_ANY(); the result is written via VARDATA(). */
3273  ptr = VARDATA(result);
3274  if (len1 > 0)
3275  memcpy(ptr, VARDATA_ANY(t1), len1);
3276  if (len2 > 0)
3277  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3278 
3279  return result;
3280 }
3281 
/*
 * Converts the C string str_ to a bytea pointer by calling byteain through
 * DirectFunctionCall1().
 */
3282 #define PG_STR_GET_BYTEA(str_) \
3283  DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3284 
3285 /*
3286  * bytea_substr()
3287  * Return a substring starting at the specified position.
3288  * Cloned from text_substr and modified as required.
3289  *
3290  * Input:
3291  * - string
3292  * - starting position (is one-based)
3293  * - string length (optional)
3294  *
3295  * If the starting position is zero or less, then return from the start of the string
3296  * adjusting the length to be consistent with the "negative start" per SQL.
3297  * If the length is less than zero, an ERROR is thrown. If no third argument
3298  * (length) is provided, the length to the end of the string is assumed.
      *
      * NOTE(review): listing lines 3301 (declarator) and 3303 (the start of the
      * call whose trailing arguments appear below) are absent from this listing
      * -- confirm against the original file.
3299  */
3300 Datum
3302 {
3304  PG_GETARG_INT32(1),
3305  PG_GETARG_INT32(2),
3306  false));
3307 }
3308 
3309 /*
3310  * bytea_substr_no_len -
3311  * Wrapper to avoid opr_sanity failure due to
3312  * one function accepting a different number of args.
      *
      * Passes -1 as the length and true as the final flag of the call below.
      *
      * NOTE(review): listing lines 3315 (declarator) and 3317 (the start of the
      * call whose trailing arguments appear below) are absent from this listing
      * -- confirm against the original file.
3313  */
3314 Datum
3316 {
3318  PG_GETARG_INT32(1),
3319  -1,
3320  true));
3321 }
3322 
/*
 * Returns the slice of "str" selected by one-based start position S and
 * length L, or everything from S onward when length_not_specified is true
 * (L is then ignored).  A start below 1 is clamped to 1, with the end
 * position still computed from the original S.
 *
 * Raises ERRCODE_SUBSTRING_ERROR when a length is specified and L < 0.
 *
 * NOTE(review): listing line 3324 (function name and first parameter, which
 * the body calls "str") is absent from this listing -- confirm against the
 * original file.
 */
3323 static bytea *
3325  int S,
3326  int L,
3327  bool length_not_specified)
3328 {
3329  int32 S1; /* adjusted start position */
3330  int32 L1; /* adjusted substring length */
3331  int32 E; /* end position */
3332 
3333  /*
3334  * The logic here should generally match text_substring().
3335  */
3336  S1 = Max(S, 1);
3337 
3338  if (length_not_specified)
3339  {
3340  /*
3341  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3342  * end of the string if we pass it a negative value for length.
3343  */
3344  L1 = -1;
3345  }
3346  else if (L < 0)
3347  {
3348  /* SQL99 says to throw an error for E < S, i.e., negative length */
3349  ereport(ERROR,
3350  (errcode(ERRCODE_SUBSTRING_ERROR),
3351  errmsg("negative substring length not allowed")));
3352  L1 = -1; /* silence stupider compilers */
3353  }
      /* On success this stores S + L into E, which the final branch uses. */
3354  else if (pg_add_s32_overflow(S, L, &E))
3355  {
3356  /*
3357  * L could be large enough for S + L to overflow, in which case the
3358  * substring must run to end of string.
3359  */
3360  L1 = -1;
3361  }
3362  else
3363  {
3364  /*
3365  * A zero or negative value for the end position can happen if the
3366  * start was negative or one. SQL99 says to return a zero-length
3367  * string.
3368  */
3369  if (E < 1)
3370  return PG_STR_GET_BYTEA("");
3371 
      /* Length is measured from the clamped start S1, not from S. */
3372  L1 = E - S1;
3373  }
3374 
3375  /*
3376  * If the start position is past the end of the string, SQL99 says to
3377  * return a zero-length string -- DatumGetByteaPSlice() will do that for
3378  * us. We need only convert S1 to zero-based starting position.
3379  */
3380  return DatumGetByteaPSlice(str, S1 - 1, L1);
3381 }
3382 
3383 /*
3384  * byteaoverlay
3385  * Replace specified substring of first string with second
3386  *
3387  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3388  * This code is a direct implementation of what the standard says.
      *
      * Arguments 0 and 1 are the two bytea values, argument 2 the start
      * position and argument 3 the length; the work is done by bytea_overlay().
      *
      * NOTE(review): the declarator line between "Datum" and "{" is absent from
      * this listing -- confirm the function name against the original file.
3389  */
3390 Datum
3392 {
3393  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3394  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3395  int sp = PG_GETARG_INT32(2); /* substring start position */
3396  int sl = PG_GETARG_INT32(3); /* substring length */
3397 
3398  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3399 }
3400 
/*
 * Variant of the overlay entry point that takes no length argument: the
 * length passed to bytea_overlay() is the byte length of the second bytea.
 *
 * NOTE(review): the declarator line between "Datum" and "{" is absent from
 * this listing -- confirm the function name against the original file.
 */
3401 Datum
3403 {
3404  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3405  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3406  int sp = PG_GETARG_INT32(2); /* substring start position */
3407  int sl;
3408 
3409  sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3410  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3411 }
3412 
3413 static bytea *
3414 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3415 {
3416  bytea *result;
3417  bytea *s1;
3418  bytea *s2;
3419  int sp_pl_sl;
3420 
3421  /*
3422  * Check for possible integer-overflow cases. For negative sp, throw a
3423  * "substring length" error because that's what should be expected
3424  * according to the spec's definition of OVERLAY().
3425  */
3426  if (sp <= 0)
3427  ereport(ERROR,
3428  (errcode(ERRCODE_SUBSTRING_ERROR),
3429  errmsg("negative substring length not allowed")));
3430  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3431  ereport(ERROR,
3432  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3433  errmsg("integer out of range")));
3434 
3435  s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3436  s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3437  result = bytea_catenate(s1, t2);
3438  result = bytea_catenate(result, s2);
3439 
3440  return result;
3441 }
3442 
3443 /*
3444  * bit_count
3445  */
3446 Datum
3448 {
3449  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3450 
3452 }
3453 
3454 /*
3455  * byteapos -
3456  * Return the position of the specified substring.
3457  * Implements the SQL POSITION() function.
3458  * Cloned from textpos and modified as required.
3459  */
3460 Datum
3462 {
3463  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3464  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3465  int pos;
3466  int px,
3467  p;
3468  int len1,
3469  len2;
3470  char *p1,
3471  *p2;
3472 
3473  len1 = VARSIZE_ANY_EXHDR(t1);
3474  len2 = VARSIZE_ANY_EXHDR(t2);
3475 
3476  if (len2 <= 0)
3477  PG_RETURN_INT32(1); /* result for empty pattern */
3478 
3479  p1 = VARDATA_ANY(t1);
3480  p2 = VARDATA_ANY(t2);
3481 
3482  pos = 0;
3483  px = (len1 - len2);
3484  for (p = 0; p <= px; p++)
3485  {
3486  if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3487  {
3488  pos = p + 1;
3489  break;
3490  };
3491  p1++;
3492  };
3493 
3494  PG_RETURN_INT32(pos);
3495 }
3496 
3497 /*-------------------------------------------------------------
3498  * byteaGetByte
3499  *
3500  * this routine treats "bytea" as an array of bytes.
3501  * It returns the Nth byte (a number between 0 and 255).
3502  *-------------------------------------------------------------
3503  */
3504 Datum
3506 {
3507  bytea *v = PG_GETARG_BYTEA_PP(0);
3508  int32 n = PG_GETARG_INT32(1);
3509  int len;
3510  int byte;
3511 
3512  len = VARSIZE_ANY_EXHDR(v);
3513 
3514  if (n < 0 || n >= len)
3515  ereport(ERROR,
3516  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3517  errmsg("index %d out of valid range, 0..%d",
3518  n, len - 1)));
3519 
3520  byte = ((unsigned char *) VARDATA_ANY(v))[n];
3521 
3522  PG_RETURN_INT32(byte);
3523 }
3524 
3525 /*-------------------------------------------------------------
3526  * byteaGetBit
3527  *
3528  * This routine treats a "bytea" type like an array of bits.
3529  * It returns the value of the Nth bit (0 or 1).
3530  *
3531  *-------------------------------------------------------------
3532  */
3533 Datum
3535 {
3536  bytea *v = PG_GETARG_BYTEA_PP(0);
3537  int64 n = PG_GETARG_INT64(1);
3538  int byteNo,
3539  bitNo;
3540  int len;
3541  int byte;
3542 
3543  len = VARSIZE_ANY_EXHDR(v);
3544 
3545  if (n < 0 || n >= (int64) len * 8)
3546  ereport(ERROR,
3547  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3548  errmsg("index %lld out of valid range, 0..%lld",
3549  (long long) n, (long long) len * 8 - 1)));
3550 
3551  /* n/8 is now known < len, so safe to cast to int */
3552  byteNo = (int) (n / 8);
3553  bitNo = (int) (n % 8);
3554 
3555  byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3556 
3557  if (byte & (1 << bitNo))
3558  PG_RETURN_INT32(1);
3559  else
3560  PG_RETURN_INT32(0);
3561 }
3562 
3563 /*-------------------------------------------------------------
3564  * byteaSetByte
3565  *
3566  * Given an instance of type 'bytea' creates a new one with
3567  * the Nth byte set to the given value.
3568  *
3569  *-------------------------------------------------------------
3570  */
3571 Datum
3573 {
3574  bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3575  int32 n = PG_GETARG_INT32(1);
3576  int32 newByte = PG_GETARG_INT32(2);
3577  int len;
3578 
3579  len = VARSIZE(res) - VARHDRSZ;
3580 
3581  if (n < 0 || n >= len)
3582  ereport(ERROR,
3583  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3584  errmsg("index %d out of valid range, 0..%d",
3585  n, len - 1)));
3586 
3587  /*
3588  * Now set the byte.
3589  */
3590  ((unsigned char *) VARDATA(res))[n] = newByte;
3591 
3592  PG_RETURN_BYTEA_P(res);
3593 }
3594 
3595 /*-------------------------------------------------------------
3596  * byteaSetBit
3597  *
3598  * Given an instance of type 'bytea' creates a new one with
3599  * the Nth bit set to the given value.
3600  *
3601  *-------------------------------------------------------------
3602  */
3603 Datum
3605 {
3606  bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3607  int64 n = PG_GETARG_INT64(1);
3608  int32 newBit = PG_GETARG_INT32(2);
3609  int len;
3610  int oldByte,
3611  newByte;
3612  int byteNo,
3613  bitNo;
3614 
3615  len = VARSIZE(res) - VARHDRSZ;
3616 
3617  if (n < 0 || n >= (int64) len * 8)
3618  ereport(ERROR,
3619  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3620  errmsg("index %lld out of valid range, 0..%lld",
3621  (long long) n, (long long) len * 8 - 1)));
3622 
3623  /* n/8 is now known < len, so safe to cast to int */
3624  byteNo = (int) (n / 8);
3625  bitNo = (int) (n % 8);
3626 
3627  /*
3628  * sanity check!
3629  */
3630  if (newBit != 0 && newBit != 1)
3631  ereport(ERROR,
3632  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3633  errmsg("new bit must be 0 or 1")));
3634 
3635  /*
3636  * Update the byte.
3637  */
3638  oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3639 
3640  if (newBit == 0)
3641  newByte = oldByte & (~(1 << bitNo));
3642  else
3643  newByte = oldByte | (1 << bitNo);
3644 
3645  ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3646 
3647  PG_RETURN_BYTEA_P(res);
3648 }
3649 
3650 
3651 /* text_name()
3652  * Converts a text type to a Name type.
3653  */
3654 Datum
3656 {
3657  text *s = PG_GETARG_TEXT_PP(0);
3658  Name result;
3659  int len;
3660 
3661  len = VARSIZE_ANY_EXHDR(s);
3662 
3663  /* Truncate oversize input */
3664  if (len >= NAMEDATALEN)
3665  len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3666 
3667  /* We use palloc0 here to ensure result is zero-padded */
3668  result = (Name) palloc0(NAMEDATALEN);
3669  memcpy(NameStr(*result), VARDATA_ANY(s), len);
3670 
3671  PG_RETURN_NAME(result);
3672 }
3673 
3674 /* name_text()
3675  * Converts a Name type to a text type.
3676  */
3677 Datum
3679 {
3680  Name s = PG_GETARG_NAME(0);
3681 
3683 }
3684 
3685 
3686 /*
3687  * textToQualifiedNameList - convert a text object to list of names
3688  *
3689  * This implements the input parsing needed by nextval() and other
3690  * functions that take a text parameter representing a qualified name.
3691  * We split the name at dots, downcase if not double-quoted, and
3692  * truncate names if they're too long.
3693  */
3694 List *
3696 {
3697  char *rawname;
3698  List *result = NIL;
3699  List *namelist;
3700  ListCell *l;
3701 
3702  /* Convert to C string (handles possible detoasting). */
3703  /* Note we rely on being able to modify rawname below. */
3704  rawname = text_to_cstring(textval);
3705 
3706  if (!SplitIdentifierString(rawname, '.', &namelist))
3707  ereport(ERROR,
3708  (errcode(ERRCODE_INVALID_NAME),
3709  errmsg("invalid name syntax")));
3710 
3711  if (namelist == NIL)
3712  ereport(ERROR,
3713  (errcode(ERRCODE_INVALID_NAME),
3714  errmsg("invalid name syntax")));
3715 
3716  foreach(l, namelist)
3717  {
3718  char *curname = (char *) lfirst(l);
3719 
3720  result = lappend(result, makeString(pstrdup(curname)));
3721  }
3722 
3723  pfree(rawname);
3724  list_free(namelist);
3725 
3726  return result;
3727 }
3728 
3729 /*
3730  * SplitIdentifierString --- parse a string containing identifiers
3731  *
3732  * This is the guts of textToQualifiedNameList, and is exported for use in
3733  * other situations such as parsing GUC variables. In the GUC case, it's
3734  * important to avoid memory leaks, so the API is designed to minimize the
3735  * amount of stuff that needs to be allocated and freed.
3736  *
3737  * Inputs:
3738  * rawstring: the input string; must be overwritable! On return, it's
3739  * been modified to contain the separated identifiers.
3740  * separator: the separator punctuation expected between identifiers
3741  * (typically '.' or ','). Whitespace may also appear around
3742  * identifiers.
3743  * Outputs:
3744  * namelist: filled with a palloc'd list of pointers to identifiers within
3745  * rawstring. Caller should list_free() this even on error return.
3746  *
3747  * Returns true if okay, false if there is a syntax error in the string.
3748  *
3749  * Note that an empty string is considered okay here, though not in
3750  * textToQualifiedNameList.
3751  */
3752 bool
3753 SplitIdentifierString(char *rawstring, char separator,
3754  List **namelist)
3755 {
3756  char *nextp = rawstring;
3757  bool done = false;
3758 
3759  *namelist = NIL;
3760 
3761  while (scanner_isspace(*nextp))
3762  nextp++; /* skip leading whitespace */
3763 
3764  if (*nextp == '\0')
3765  return true; /* allow empty string */
3766 
3767  /* At the top of the loop, we are at start of a new identifier. */
3768  do
3769  {
3770  char *curname;
3771  char *endp;
3772 
3773  if (*nextp == '"')
3774  {
3775  /* Quoted name --- collapse quote-quote pairs, no downcasing */
3776  curname = nextp + 1;
3777  for (;;)
3778  {
3779  endp = strchr(nextp + 1, '"');
3780  if (endp == NULL)
3781  return false; /* mismatched quotes */
3782  if (endp[1] != '"')
3783  break; /* found end of quoted name */
3784  /* Collapse adjacent quotes into one quote, and look again */
3785  memmove(endp, endp + 1, strlen(endp));
3786  nextp = endp;
3787  }
3788  /* endp now points at the terminating quote */
3789  nextp = endp + 1;
3790  }
3791  else
3792  {
3793  /* Unquoted name --- extends to separator or whitespace */
3794  char *downname;
3795  int len;
3796 
3797  curname = nextp;
3798  while (*nextp && *nextp != separator &&
3799  !scanner_isspace(*nextp))
3800  nextp++;
3801  endp = nextp;
3802  if (curname == nextp)
3803  return false; /* empty unquoted name not allowed */
3804 
3805  /*
3806  * Downcase the identifier, using same code as main lexer does.
3807  *
3808  * XXX because we want to overwrite the input in-place, we cannot
3809  * support a downcasing transformation that increases the string
3810  * length. This is not a problem given the current implementation
3811  * of downcase_truncate_identifier, but we'll probably have to do
3812  * something about this someday.
3813  */
3814  len = endp - curname;
3815  downname = downcase_truncate_identifier(curname, len, false);
3816  Assert(strlen(downname) <= len);
3817  strncpy(curname, downname, len); /* strncpy is required here */
3818  pfree(downname);
3819  }
3820 
3821  while (scanner_isspace(*nextp))
3822  nextp++; /* skip trailing whitespace */
3823 
3824  if (*nextp == separator)
3825  {
3826  nextp++;
3827  while (scanner_isspace(*nextp))
3828  nextp++; /* skip leading whitespace for next */
3829  /* we expect another name, so done remains false */
3830  }
3831  else if (*nextp == '\0')
3832  done = true;
3833  else
3834  return false; /* invalid syntax */
3835 
3836  /* Now safe to overwrite separator with a null */
3837  *endp = '\0';
3838 
3839  /* Truncate name if it's overlength */
3840  truncate_identifier(curname, strlen(curname), false);
3841 
3842  /*
3843  * Finished isolating current name --- add it to list
3844  */
3845  *namelist = lappend(*namelist, curname);
3846 
3847  /* Loop back if we didn't reach end of string */
3848  } while (!done);
3849 
3850  return true;
3851 }
3852 
3853 
3854 /*
3855  * SplitDirectoriesString --- parse a string containing file/directory names
3856  *
3857  * This works fine on file names too; the function name is historical.
3858  *
3859  * This is similar to SplitIdentifierString, except that the parsing
3860  * rules are meant to handle pathnames instead of identifiers: there is
3861  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3862  * and we apply canonicalize_path() to each extracted string. Because of the
3863  * last, the returned strings are separately palloc'd rather than being
3864  * pointers into rawstring --- but we still scribble on rawstring.
3865  *
3866  * Inputs:
3867  * rawstring: the input string; must be modifiable!
3868  * separator: the separator punctuation expected between directories
3869  * (typically ',' or ';'). Whitespace may also appear around
3870  * directories.
3871  * Outputs:
3872  * namelist: filled with a palloc'd list of directory names.
3873  * Caller should list_free_deep() this even on error return.
3874  *
3875  * Returns true if okay, false if there is a syntax error in the string.
3876  *
3877  * Note that an empty string is considered okay here.
3878  */
3879 bool
3880 SplitDirectoriesString(char *rawstring, char separator,
3881  List **namelist)
3882 {
3883  char *nextp = rawstring;
3884  bool done = false;
3885 
3886  *namelist = NIL;
3887 
3888  while (scanner_isspace(*nextp))
3889  nextp++; /* skip leading whitespace */
3890 
3891  if (*nextp == '\0')
3892  return true; /* allow empty string */
3893 
3894  /* At the top of the loop, we are at start of a new directory. */
3895  do
3896  {
3897  char *curname;
3898  char *endp;
3899 
3900  if (*nextp == '"')
3901  {
3902  /* Quoted name --- collapse quote-quote pairs */
3903  curname = nextp + 1;
3904  for (;;)
3905  {
3906  endp = strchr(nextp + 1, '"');
3907  if (endp == NULL)
3908  return false; /* mismatched quotes */
3909  if (endp[1] != '"')
3910  break; /* found end of quoted name */
3911  /* Collapse adjacent quotes into one quote, and look again */
3912  memmove(endp, endp + 1, strlen(endp));
3913  nextp = endp;
3914  }
3915  /* endp now points at the terminating quote */
3916  nextp = endp + 1;
3917  }
3918  else
3919  {
3920  /* Unquoted name --- extends to separator or end of string */
3921  curname = endp = nextp;
3922  while (*nextp && *nextp != separator)
3923  {
3924  /* trailing whitespace should not be included in name */
3925  if (!scanner_isspace(*nextp))
3926  endp = nextp + 1;
3927  nextp++;
3928  }
3929  if (curname == endp)
3930  return false; /* empty unquoted name not allowed */
3931  }
3932 
3933  while (scanner_isspace(*nextp))
3934  nextp++; /* skip trailing whitespace */
3935 
3936  if (*nextp == separator)
3937  {
3938  nextp++;
3939  while (scanner_isspace(*nextp))
3940  nextp++; /* skip leading whitespace for next */
3941  /* we expect another name, so done remains false */
3942  }
3943  else if (*nextp == '\0')
3944  done = true;
3945  else
3946  return false; /* invalid syntax */
3947 
3948  /* Now safe to overwrite separator with a null */
3949  *endp = '\0';
3950 
3951  /* Truncate path if it's overlength */
3952  if (strlen(curname) >= MAXPGPATH)
3953  curname[MAXPGPATH - 1] = '\0';
3954 
3955  /*
3956  * Finished isolating current name --- add it to list
3957  */
3958  curname = pstrdup(curname);
3959  canonicalize_path(curname);
3960  *namelist = lappend(*namelist, curname);
3961 
3962  /* Loop back if we didn't reach end of string */
3963  } while (!done);
3964 
3965  return true;
3966 }
3967 
3968 
3969 /*
3970  * SplitGUCList --- parse a string containing identifiers or file names
3971  *
3972  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3973  * presuming whether the elements will be taken as identifiers or file names.
3974  * We assume the input has already been through flatten_set_variable_args(),
3975  * so that we need never downcase (if appropriate, that was done already).
3976  * Nor do we ever truncate, since we don't know the correct max length.
3977  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3978  * because any embedded whitespace should have led to double-quoting).
3979  * Otherwise the API is identical to SplitIdentifierString.
3980  *
3981  * XXX it's annoying to have so many copies of this string-splitting logic.
3982  * However, it's not clear that having one function with a bunch of option
3983  * flags would be much better.
3984  *
3985  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3986  * Be sure to update that if you have to change this.
3987  *
3988  * Inputs:
3989  * rawstring: the input string; must be overwritable! On return, it's
3990  * been modified to contain the separated identifiers.
3991  * separator: the separator punctuation expected between identifiers
3992  * (typically '.' or ','). Whitespace may also appear around
3993  * identifiers.
3994  * Outputs:
3995  * namelist: filled with a palloc'd list of pointers to identifiers within
3996  * rawstring. Caller should list_free() this even on error return.
3997  *
3998  * Returns true if okay, false if there is a syntax error in the string.
3999  */
4000 bool
4001 SplitGUCList(char *rawstring, char separator,
4002  List **namelist)
4003 {
4004  char *nextp = rawstring;
4005  bool done = false;
4006 
4007  *namelist = NIL;
4008 
4009  while (scanner_isspace(*nextp))
4010  nextp++; /* skip leading whitespace */
4011 
4012  if (*nextp == '\0')
4013  return true; /* allow empty string */
4014 
4015  /* At the top of the loop, we are at start of a new identifier. */
4016  do
4017  {
4018  char *curname;
4019  char *endp;
4020 
4021  if (*nextp == '"')
4022  {
4023  /* Quoted name --- collapse quote-quote pairs */
4024  curname = nextp + 1;
4025  for (;;)
4026  {
4027  endp = strchr(nextp + 1, '"');
4028  if (endp == NULL)
4029  return false; /* mismatched quotes */
4030  if (endp[1] != '"')
4031  break; /* found end of quoted name */
4032  /* Collapse adjacent quotes into one quote, and look again */
4033  memmove(endp, endp + 1, strlen(endp));
4034  nextp = endp;
4035  }
4036  /* endp now points at the terminating quote */
4037  nextp = endp + 1;
4038  }
4039  else
4040  {
4041  /* Unquoted name --- extends to separator or whitespace */
4042  curname = nextp;
4043  while (*nextp && *nextp != separator &&
4044  !scanner_isspace(*nextp))
4045  nextp++;
4046  endp = nextp;
4047  if (curname == nextp)
4048  return false; /* empty unquoted name not allowed */
4049  }
4050 
4051  while (scanner_isspace(*nextp))
4052  nextp++; /* skip trailing whitespace */
4053 
4054  if (*nextp == separator)
4055  {
4056  nextp++;
4057  while (scanner_isspace(*nextp))
4058  nextp++; /* skip leading whitespace for next */
4059  /* we expect another name, so done remains false */
4060  }
4061  else if (*nextp == '\0')
4062  done = true;
4063  else
4064  return false; /* invalid syntax */
4065 
4066  /* Now safe to overwrite separator with a null */
4067  *endp = '\0';
4068 
4069  /*
4070  * Finished isolating current name --- add it to list
4071  */
4072  *namelist = lappend(*namelist, curname);
4073 
4074  /* Loop back if we didn't reach end of string */
4075  } while (!done);
4076 
4077  return true;
4078 }
4079 
4080 
4081 /*****************************************************************************
4082  * Comparison Functions used for bytea
4083  *
4084  * Note: btree indexes need these routines not to leak memory; therefore,
4085  * be careful to free working copies of toasted datums. Most places don't
4086  * need to be so careful.
4087  *****************************************************************************/
4088 
4089 Datum
4091 {
4092  Datum arg1 = PG_GETARG_DATUM(0);
4093  Datum arg2 = PG_GETARG_DATUM(1);
4094  bool result;
4095  Size len1,
4096  len2;
4097 
4098  /*
4099  * We can use a fast path for unequal lengths, which might save us from
4100  * having to detoast one or both values.
4101  */
4102  len1 = toast_raw_datum_size(arg1);
4103  len2 = toast_raw_datum_size(arg2);
4104  if (len1 != len2)
4105  result = false;
4106  else
4107  {
4108  bytea *barg1 = DatumGetByteaPP(arg1);
4109  bytea *barg2 = DatumGetByteaPP(arg2);
4110 
4111  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4112  len1 - VARHDRSZ) == 0);
4113 
4114  PG_FREE_IF_COPY(barg1, 0);
4115  PG_FREE_IF_COPY(barg2, 1);
4116  }
4117 
4118  PG_RETURN_BOOL(result);
4119 }
4120 
4121 Datum
4123 {
4124  Datum arg1 = PG_GETARG_DATUM(0);
4125  Datum arg2 = PG_GETARG_DATUM(1);
4126  bool result;
4127  Size len1,
4128  len2;
4129 
4130  /*
4131  * We can use a fast path for unequal lengths, which might save us from
4132  * having to detoast one or both values.
4133  */
4134  len1 = toast_raw_datum_size(arg1);
4135  len2 = toast_raw_datum_size(arg2);
4136  if (len1 != len2)
4137  result = true;
4138  else
4139  {
4140  bytea *barg1 = DatumGetByteaPP(arg1);
4141  bytea *barg2 = DatumGetByteaPP(arg2);
4142 
4143  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4144  len1 - VARHDRSZ) != 0);
4145 
4146  PG_FREE_IF_COPY(barg1, 0);
4147  PG_FREE_IF_COPY(barg2, 1);
4148  }
4149 
4150  PG_RETURN_BOOL(result);
4151 }
4152 
4153 Datum
4155 {
4156  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4157  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4158  int len1,
4159  len2;
4160  int cmp;
4161 
4162  len1 = VARSIZE_ANY_EXHDR(arg1);
4163  len2 = VARSIZE_ANY_EXHDR(arg2);
4164 
4165  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4166 
4167  PG_FREE_IF_COPY(arg1, 0);
4168  PG_FREE_IF_COPY(arg2, 1);
4169 
4170  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4171 }
4172 
4173 Datum
4175 {
4176  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4177  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4178  int len1,
4179  len2;
4180  int cmp;
4181 
4182  len1 = VARSIZE_ANY_EXHDR(arg1);
4183  len2 = VARSIZE_ANY_EXHDR(arg2);
4184 
4185  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4186 
4187  PG_FREE_IF_COPY(arg1, 0);
4188  PG_FREE_IF_COPY(arg2, 1);
4189 
4190  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4191 }
4192 
4193 Datum
4195 {
4196  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4197  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4198  int len1,
4199  len2;
4200  int cmp;
4201 
4202  len1 = VARSIZE_ANY_EXHDR(arg1);
4203  len2 = VARSIZE_ANY_EXHDR(arg2);
4204 
4205  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4206 
4207  PG_FREE_IF_COPY(arg1, 0);
4208  PG_FREE_IF_COPY(arg2, 1);
4209 
4210  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4211 }
4212 
4213 Datum
4215 {
4216  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4217  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4218  int len1,
4219  len2;
4220  int cmp;
4221 
4222  len1 = VARSIZE_ANY_EXHDR(arg1);
4223  len2 = VARSIZE_ANY_EXHDR(arg2);
4224 
4225  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4226 
4227  PG_FREE_IF_COPY(arg1, 0);
4228  PG_FREE_IF_COPY(arg2, 1);
4229 
4230  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4231 }
4232 
4233 Datum
4235 {
4236  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4237  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4238  int len1,
4239  len2;
4240  int cmp;
4241 
4242  len1 = VARSIZE_ANY_EXHDR(arg1);
4243  len2 = VARSIZE_ANY_EXHDR(arg2);
4244 
4245  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4246  if ((cmp == 0) && (len1 != len2))
4247  cmp = (len1 < len2) ? -1 : 1;
4248 
4249  PG_FREE_IF_COPY(arg1, 0);
4250  PG_FREE_IF_COPY(arg2, 1);
4251 
4252  PG_RETURN_INT32(cmp);
4253 }
4254 
4255 Datum
4257 {
4259  MemoryContext oldcontext;
4260 
4261  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4262 
4263  /* Use generic string SortSupport, forcing "C" collation */
4264  varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4265 
4266  MemoryContextSwitchTo(oldcontext);
4267 
4268  PG_RETURN_VOID();
4269 }
4270 
4271 /*
4272  * appendStringInfoText
4273  *
4274  * Append a text to str.
4275  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4276  */
4277 static void
4279 {
4281 }
4282 
4283 /*
4284  * replace_text
4285  * replace all occurrences of 'old_sub_str' in 'orig_str'
4286  * with 'new_sub_str' to form 'new_str'
4287  *
4288  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4289  * otherwise returns 'new_str'
4290  */
4291 Datum
4293 {
4294  text *src_text = PG_GETARG_TEXT_PP(0);
4295  text *from_sub_text = PG_GETARG_TEXT_PP(1);
4296  text *to_sub_text = PG_GETARG_TEXT_PP(2);
4297  int src_text_len;
4298  int from_sub_text_len;
4300  text *ret_text;
4301  int chunk_len;
4302  char *curr_ptr;
4303  char *start_ptr;
4305  bool found;
4306 
4307  src_text_len = VARSIZE_ANY_EXHDR(src_text);
4308  from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4309 
4310  /* Return unmodified source string if empty source or pattern */
4311  if (src_text_len < 1 || from_sub_text_len < 1)
4312  {
4313  PG_RETURN_TEXT_P(src_text);
4314  }
4315 
4316  text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4317 
4318  found = text_position_next(&state);
4319 
4320  /* When the from_sub_text is not found, there is nothing to do. */
4321  if (!found)
4322  {
4323  text_position_cleanup(&state);
4324  PG_RETURN_TEXT_P(src_text);
4325  }
4326  curr_ptr = text_position_get_match_ptr(&state);
4327  start_ptr = VARDATA_ANY(src_text);
4328 
4329  initStringInfo(&str);
4330 
4331  do
4332  {
4334 
4335  /* copy the data skipped over by last text_position_next() */
4336  chunk_len = curr_ptr - start_ptr;
4337  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4338 
4339  appendStringInfoText(&str, to_sub_text);
4340 
4341  start_ptr = curr_ptr + from_sub_text_len;
4342 
4343  found = text_position_next(&state);
4344  if (found)
4345  curr_ptr = text_position_get_match_ptr(&state);
4346  }
4347  while (found);
4348 
4349  /* copy trailing data */
4350  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4351  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4352 
4353  text_position_cleanup(&state);
4354 
4355  ret_text = cstring_to_text_with_len(str.data, str.len);
4356  pfree(str.data);
4357 
4358  PG_RETURN_TEXT_P(ret_text);
4359 }
4360 
4361 /*
4362  * check_replace_text_has_escape_char
4363  *
4364  * check whether replace_text contains escape char.
4365  */
4366 static bool
4368 {
4369  const char *p = VARDATA_ANY(replace_text);
4370  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4371 
4373  {
4374  for (; p < p_end; p++)
4375  {
4376  if (*p == '\\')
4377  return true;
4378  }
4379  }
4380  else
4381  {
4382  for (; p < p_end; p += pg_mblen(p))
4383  {
4384  if (*p == '\\')
4385  return true;
4386  }
4387  }
4388 
4389  return false;
4390 }
4391 
4392 /*
4393  * appendStringInfoRegexpSubstr
4394  *
4395  * Append replace_text to str, substituting regexp back references for
4396  * \n escapes. start_ptr is the start of the match in the source string,
4397  * at logical character position data_pos.
4398  */
4399 static void
4401  regmatch_t *pmatch,
4402  char *start_ptr, int data_pos)
4403 {
4404  const char *p = VARDATA_ANY(replace_text);
4405  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4406  int eml = pg_database_encoding_max_length();
4407 
4408  for (;;)
4409  {
4410  const char *chunk_start = p;
4411  int so;
4412  int eo;
4413 
4414  /* Find next escape char. */
4415  if (eml == 1)
4416  {
4417  for (; p < p_end && *p != '\\'; p++)
4418  /* nothing */ ;
4419  }
4420  else
4421  {
4422  for (; p < p_end && *p != '\\'; p += pg_mblen(p))
4423  /* nothing */ ;
4424  }
4425 
4426  /* Copy the text we just scanned over, if any. */
4427  if (p > chunk_start)
4428  appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4429 
4430  /* Done if at end of string, else advance over escape char. */
4431  if (p >= p_end)
4432  break;
4433  p++;
4434 
4435  if (p >= p_end)
4436  {
4437  /* Escape at very end of input. Treat same as unexpected char */
4438  appendStringInfoChar(str, '\\');
4439  break;
4440  }
4441 
4442  if (*p >= '1' && *p <= '9')
4443  {
4444  /* Use the back reference of regexp. */
4445  int idx = *p - '0';
4446 
4447  so = pmatch[idx].rm_so;
4448  eo = pmatch[idx].rm_eo;
4449  p++;
4450  }
4451  else if (*p == '&')
4452  {
4453  /* Use the entire matched string. */
4454  so = pmatch[0].rm_so;
4455  eo = pmatch[0].rm_eo;
4456  p++;
4457  }
4458  else if (*p == '\\')
4459  {
4460  /* \\ means transfer one \ to output. */
4461  appendStringInfoChar(str, '\\');
4462  p++;
4463  continue;
4464  }
4465  else
4466  {
4467  /*
4468  * If escape char is not followed by any expected char, just treat
4469  * it as ordinary data to copy. (XXX would it be better to throw
4470  * an error?)
4471  */
4472  appendStringInfoChar(str, '\\');
4473  continue;
4474  }
4475 
4476  if (so != -1 && eo != -1)
4477  {
4478  /*
4479  * Copy the text that is back reference of regexp. Note so and eo
4480  * are counted in characters not bytes.
4481  */
4482  char *chunk_start;
4483  int chunk_len;
4484 
4485  Assert(so >= data_pos);
4486  chunk_start = start_ptr;
4487  chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4488  chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4489  appendBinaryStringInfo(str, chunk_start, chunk_len);
4490  }
4491  }
4492 }
4493 
4494 #define REGEXP_REPLACE_BACKREF_CNT 10
4495 
4496 /*
4497  * replace_text_regexp
4498  *
4499  * replace text that matches to regexp in src_text to replace_text.
4500  *
4501  * Note: to avoid having to include regex.h in builtins.h, we declare
4502  * the regexp argument as void *, but really it's regex_t *.
4503  */
4504 text *
4505 replace_text_regexp(text *src_text, void *regexp,
4506  text *replace_text, bool glob)
4507 {
4508  text *ret_text;
4509  regex_t *re = (regex_t *) regexp;
4510  int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4513  pg_wchar *data;
4514  size_t data_len;
4515  int search_start;
4516  int data_pos;
4517  char *start_ptr;
4518  bool have_escape;
4519 
4520  initStringInfo(&buf);
4521 
4522  /* Convert data string to wide characters. */
4523  data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4524  data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4525 
4526  /* Check whether replace_text has escape char. */
4527  have_escape = check_replace_text_has_escape_char(replace_text);
4528 
4529  /* start_ptr points to the data_pos'th character of src_text */
4530  start_ptr = (char *) VARDATA_ANY(src_text);
4531  data_pos = 0;
4532 
4533  search_start = 0;
4534  while (search_start <= data_len)
4535  {
4536  int regexec_result;
4537 
4539 
4540  regexec_result = pg_regexec(re,
4541  data,
4542  data_len,
4543  search_start,
4544  NULL, /* no details */
4546  pmatch,
4547  0);
4548 
4549  if (regexec_result == REG_NOMATCH)
4550  break;
4551 
4552  if (regexec_result != REG_OKAY)
4553  {
4554  char errMsg[100];
4555 
4557  pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4558  ereport(ERROR,
4559  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4560  errmsg("regular expression failed: %s", errMsg)));
4561  }
4562 
4563  /*
4564  * Copy the text to the left of the match position. Note we are given
4565  * character not byte indexes.
4566  */
4567  if (pmatch[0].rm_so - data_pos > 0)
4568  {
4569  int chunk_len;
4570 
4571  chunk_len = charlen_to_bytelen(start_ptr,
4572  pmatch[0].rm_so - data_pos);
4573  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4574 
4575  /*
4576  * Advance start_ptr over that text, to avoid multiple rescans of
4577  * it if the replace_text contains multiple back-references.
4578  */
4579  start_ptr += chunk_len;
4580  data_pos = pmatch[0].rm_so;
4581  }
4582 
4583  /*
4584  * Copy the replace_text. Process back references when the
4585  * replace_text has escape characters.
4586  */
4587  if (have_escape)
4588  appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4589  start_ptr, data_pos);
4590  else
4591  appendStringInfoText(&buf, replace_text);
4592 
4593  /* Advance start_ptr and data_pos over the matched text. */
4594  start_ptr += charlen_to_bytelen(start_ptr,
4595  pmatch[0].rm_eo - data_pos);
4596  data_pos = pmatch[0].rm_eo;
4597 
4598  /*
4599  * When global option is off, replace the first instance only.
4600  */
4601  if (!glob)
4602  break;
4603 
4604  /*
4605  * Advance search position. Normally we start the next search at the
4606  * end of the previous match; but if the match was of zero length, we
4607  * have to advance by one character, or we'd just find the same match
4608  * again.
4609  */
4610  search_start = data_pos;
4611  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4612  search_start++;
4613  }
4614 
4615  /*
4616  * Copy the text to the right of the last match.
4617  */
4618  if (data_pos < data_len)
4619  {
4620  int chunk_len;
4621 
4622  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4623  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4624  }
4625 
4626  ret_text = cstring_to_text_with_len(buf.data, buf.len);
4627  pfree(buf.data);
4628  pfree(data);
4629 
4630  return ret_text;
4631 }
4632 
4633 /*
4634  * split_part
4635  * parse input string based on provided field separator
4636  * return N'th item (1 based, negative counts from end)
4637  */
4638 Datum
4640 {
4641  text *inputstring = PG_GETARG_TEXT_PP(0);
4642  text *fldsep = PG_GETARG_TEXT_PP(1);
4643  int fldnum = PG_GETARG_INT32(2);
4644  int inputstring_len;
4645  int fldsep_len;
4647  char *start_ptr;
4648  char *end_ptr;
4649  text *result_text;
4650  bool found;
4651 
4652  /* field number is 1 based */
4653  if (fldnum == 0)
4654  ereport(ERROR,
4655  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4656  errmsg("field position must not be zero")));
4657 
4658  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4659  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4660 
4661  /* return empty string for empty input string */
4662  if (inputstring_len < 1)
4664 
4665  /* handle empty field separator */
4666  if (fldsep_len < 1)
4667  {
4668  /* if first or last field, return input string, else empty string */
4669  if (fldnum == 1 || fldnum == -1)
4670  PG_RETURN_TEXT_P(inputstring);
4671  else
4673  }
4674 
4675  /* find the first field separator */
4676  text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4677 
4678  found = text_position_next(&state);
4679 
4680  /* special case if fldsep not found at all */
4681  if (!found)
4682  {
4683  text_position_cleanup(&state);
4684  /* if first or last field, return input string, else empty string */
4685  if (fldnum == 1 || fldnum == -1)
4686  PG_RETURN_TEXT_P(inputstring);
4687  else
4689  }
4690 
4691  /*
4692  * take care of a negative field number (i.e. count from the right) by
4693  * converting to a positive field number; we need total number of fields
4694  */
4695  if (fldnum < 0)
4696  {
4697  /* we found a fldsep, so there are at least two fields */
4698  int numfields = 2;
4699 
4700  while (text_position_next(&state))
4701  numfields++;
4702 
4703  /* special case of last field does not require an extra pass */
4704  if (fldnum == -1)
4705  {
4706  start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4707  end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4708  text_position_cleanup(&state);
4710  end_ptr - start_ptr));
4711  }
4712 
4713  /* else, convert fldnum to positive notation */
4714  fldnum += numfields + 1;
4715 
4716  /* if nonexistent field, return empty string */
4717  if (fldnum <= 0)
4718  {
4719  text_position_cleanup(&state);
4721  }
4722 
4723  /* reset to pointing at first match, but now with positive fldnum */
4724  text_position_reset(&state);
4725  found = text_position_next(&state);
4726  Assert(found);
4727  }
4728 
4729  /* identify bounds of first field */
4730  start_ptr = VARDATA_ANY(inputstring);
4731  end_ptr = text_position_get_match_ptr(&state);
4732 
4733  while (found && --fldnum > 0)
4734  {
4735  /* identify bounds of next field */
4736  start_ptr = end_ptr + fldsep_len;
4737  found = text_position_next(&state);
4738  if (found)
4739  end_ptr = text_position_get_match_ptr(&state);
4740  }
4741 
4742  text_position_cleanup(&state);
4743 
4744  if (fldnum > 0)
4745  {
4746  /* N'th field separator not found */
4747  /* if last field requested, return it, else empty string */
4748  if (fldnum == 1)
4749  {
4750  int last_len = start_ptr - VARDATA_ANY(inputstring);
4751 
4752  result_text = cstring_to_text_with_len(start_ptr,
4753  inputstring_len - last_len);
4754  }
4755  else
4756  result_text = cstring_to_text("");
4757  }
4758  else
4759  {
4760  /* non-last field requested */
4761  result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4762  }
4763 
4764  PG_RETURN_TEXT_P(result_text);
4765 }
4766 
4767 /*
4768  * Convenience function to return true when two text params are equal.
4769  */
4770 static bool
4771 text_isequal(text *txt1, text *txt2, Oid collid)
4772 {
4774  collid,
4775  PointerGetDatum(txt1),
4776  PointerGetDatum(txt2)));
4777 }
4778 
4779 /*
4780  * text_to_array
4781  * parse input string and return text array of elements,
4782  * based on provided field separator
4783  */
4784 Datum
4786 {
4787  SplitTextOutputData tstate;
4788 
4789  /* For array output, tstate should start as all zeroes */
4790  memset(&tstate, 0, sizeof(tstate));
4791 
4792  if (!split_text(fcinfo, &tstate))
4793  PG_RETURN_NULL();
4794 
4795  if (tstate.astate == NULL)
4797 
4800 }
4801 
4802 /*
4803  * text_to_array_null
4804  * parse input string and return text array of elements,
4805  * based on provided field separator and null string
4806  *
4807  * This is a separate entry point only to prevent the regression tests from
4808  * complaining about different argument sets for the same internal function.
4809  */
4810 Datum
4812 {
4813  return text_to_array(fcinfo);
4814 }
4815 
4816 /*
4817  * text_to_table
4818  * parse input string and return table of elements,
4819  * based on provided field separator
4820  */
4821 Datum
4823 {
4824  ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4825  SplitTextOutputData tstate;
4826  MemoryContext old_cxt;
4827 
4828  /* check to see if caller supports us returning a tuplestore */
4829  if (rsi == NULL || !IsA(rsi, ReturnSetInfo))
4830  ereport(ERROR,
4831  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4832  errmsg("set-valued function called in context that cannot accept a set")));
4833  if (!(rsi->allowedModes & SFRM_Materialize))
4834  ereport(ERROR,
4835  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4836  errmsg("materialize mode required, but it is not allowed in this context")));
4837 
4838  /* OK, prepare tuplestore in per-query memory */
4840 
4841  tstate.astate = NULL;
4842  tstate.tupdesc = CreateTupleDescCopy(rsi->expectedDesc);
4843  tstate.tupstore = tuplestore_begin_heap(true, false, work_mem);
4844 
4845  MemoryContextSwitchTo(old_cxt);
4846 
4847  (void) split_text(fcinfo, &tstate);
4848 
4849  tuplestore_donestoring(tstate.tupstore);
4850 
4852  rsi->setResult = tstate.tupstore;
4853  rsi->setDesc = tstate.tupdesc;
4854 
4855  return (Datum) 0;
4856 }
4857 
4858 /*
4859  * text_to_table_null
4860  * parse input string and return table of elements,
4861  * based on provided field separator and null string
4862  *
4863  * This is a separate entry point only to prevent the regression tests from
4864  * complaining about different argument sets for the same internal function.
4865  */
4866 Datum
4868 {
4869  return text_to_table(fcinfo);
4870 }
4871 
4872 /*
4873  * Common code for text_to_array, text_to_array_null, text_to_table
4874  * and text_to_table_null functions.
4875  *
4876  * These are not strict so we have to test for null inputs explicitly.
4877  * Returns false if result is to be null, else returns true.
4878  *
4879  * Note that if the result is valid but empty (zero elements), we return
4880  * without changing *tstate --- caller must handle that case, too.
4881  */
4882 static bool
4884 {
4885  text *inputstring;
4886  text *fldsep;
4887  text *null_string;
4888  Oid collation = PG_GET_COLLATION();
4889  int inputstring_len;
4890  int fldsep_len;
4891  char *start_ptr;
4892  text *result_text;
4893 
4894  /* when input string is NULL, then result is NULL too */
4895  if (PG_ARGISNULL(0))
4896  return false;
4897 
4898  inputstring = PG_GETARG_TEXT_PP(0);
4899 
4900  /* fldsep can be NULL */
4901  if (!PG_ARGISNULL(1))
4902  fldsep = PG_GETARG_TEXT_PP(1);
4903  else
4904  fldsep = NULL;
4905 
4906  /* null_string can be NULL or omitted */
4907  if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4908  null_string = PG_GETARG_TEXT_PP(2);
4909  else
4910  null_string = NULL;
4911 
4912  if (fldsep != NULL)
4913  {
4914  /*
4915  * Normal case with non-null fldsep. Use the text_position machinery
4916  * to search for occurrences of fldsep.
4917  */
4919 
4920  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4921  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4922 
4923  /* return empty set for empty input string */
4924  if (inputstring_len < 1)
4925  return true;
4926 
4927  /* empty field separator: return input string as a one-element set */
4928  if (fldsep_len < 1)
4929  {
4930  split_text_accum_result(tstate, inputstring,
4931  null_string, collation);
4932  return true;
4933  }
4934 
4935  text_position_setup(inputstring, fldsep, collation, &state);
4936 
4937  start_ptr = VARDATA_ANY(inputstring);
4938 
4939  for (;;)
4940  {
4941  bool found;
4942  char *end_ptr;
4943  int chunk_len;
4944 
4946 
4947  found = text_position_next(&state);
4948  if (!found)
4949  {
4950  /* fetch last field */
4951  chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4952  end_ptr = NULL; /* not used, but some compilers complain */
4953  }
4954  else
4955  {
4956  /* fetch non-last field */
4957  end_ptr = text_position_get_match_ptr(&state);
4958  chunk_len = end_ptr - start_ptr;
4959  }
4960 
4961  /* build a temp text datum to pass to split_text_accum_result */
4962  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4963 
4964  /* stash away this field */
4965  split_text_accum_result(tstate, result_text,
4966  null_string, collation);
4967 
4968  pfree(result_text);
4969 
4970  if (!found)
4971  break;
4972 
4973  start_ptr = end_ptr + fldsep_len;
4974  }
4975 
4976  text_position_cleanup(&state);
4977  }
4978  else
4979  {
4980  /*
4981  * When fldsep is NULL, each character in the input string becomes a
4982  * separate element in the result set. The separator is effectively
4983  * the space between characters.
4984  */
4985  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4986 
4987  start_ptr = VARDATA_ANY(inputstring);
4988 
4989  while (inputstring_len > 0)
4990  {
4991  int chunk_len = pg_mblen(start_ptr);
4992 
4994 
4995  /* build a temp text datum to pass to split_text_accum_result */
4996  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4997 
4998  /* stash away this field */
4999  split_text_accum_result(tstate, result_text,
5000  null_string, collation);
5001 
5002  pfree(result_text);
5003 
5004  start_ptr += chunk_len;
5005  inputstring_len -= chunk_len;
5006  }
5007  }
5008 
5009  return true;
5010 }
5011 
5012 /*
5013  * Add text item to result set (table or array).
5014  *
5015  * This is also responsible for checking to see if the item matches
5016  * the null_string, in which case we should emit NULL instead.
5017  */
5018 static void
5020  text *field_value,
5021  text *null_string,
5022  Oid collation)
5023 {
5024  bool is_null = false;
5025 
5026  if (null_string && text_isequal(field_value, null_string, collation))
5027  is_null = true;
5028 
5029  if (tstate->tupstore)
5030  {
5031  Datum values[1];
5032  bool nulls[1];
5033 
5034  values[0] = PointerGetDatum(field_value);
5035  nulls[0] = is_null;
5036 
5038  tstate->tupdesc,
5039  values,
5040  nulls);
5041  }
5042  else
5043  {
5044  tstate->astate = accumArrayResult(tstate->astate,
5045  PointerGetDatum(field_value),
5046  is_null,
5047  TEXTOID,
5049  }
5050 }
5051 
5052 /*
5053  * array_to_text
5054  * concatenate Cstring representation of input array elements
5055  * using provided field separator
5056  */
5057 Datum
5059 {
5061  char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5062 
5063  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
5064 }
5065 
5066 /*
5067  * array_to_text_null
5068  * concatenate Cstring representation of input array elements
5069  * using provided field separator and null string
5070  *
5071  * This version is not strict so we have to test for null inputs explicitly.
5072  */
5073 Datum
5075 {
5076  ArrayType *v;
5077  char *fldsep;
5078  char *null_string;
5079 
5080  /* returns NULL when first or second parameter is NULL */
5081  if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
5082  PG_RETURN_NULL();
5083 
5084  v = PG_GETARG_ARRAYTYPE_P(0);
5085  fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5086 
5087  /* NULL null string is passed through as a null pointer */
5088  if (!PG_ARGISNULL(2))
5089  null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5090  else
5091  null_string = NULL;
5092 
5093  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5094 }
5095 
5096 /*
5097  * common code for array_to_text and array_to_text_null functions
5098  */
5099 static text *
5101  const char *fldsep, const char *null_string)
5102 {
5103  text *result;
5104  int nitems,
5105  *dims,
5106  ndims;
5107  Oid element_type;
5108  int typlen;
5109  bool typbyval;
5110  char typalign;
5112  bool printed = false;
5113  char *p;
5114  bits8 *bitmap;
5115  int bitmask;
5116  int i;
5117  ArrayMetaState *my_extra;
5118 
5119  ndims = ARR_NDIM(v);
5120  dims = ARR_DIMS(v);
5121  nitems = ArrayGetNItems(ndims, dims);
5122 
5123  /* if there are no elements, return an empty string */
5124  if (nitems == 0)
5125  return cstring_to_text_with_len("", 0);
5126 
5127  element_type = ARR_ELEMTYPE(v);
5128  initStringInfo(&buf);
5129 
5130  /*
5131  * We arrange to look up info about element type, including its output
5132  * conversion proc, only once per series of calls, assuming the element
5133  * type doesn't change underneath us.
5134  */
5135  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5136  if (my_extra == NULL)
5137  {
5138  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5139  sizeof(ArrayMetaState));
5140  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5141  my_extra->element_type = ~element_type;
5142  }
5143 
5144  if (my_extra->element_type != element_type)
5145  {
5146  /*
5147  * Get info about element type, including its output conversion proc
5148  */
5149  get_type_io_data(element_type, IOFunc_output,
5150  &my_extra->typlen, &my_extra->typbyval,
5151  &my_extra->typalign, &my_extra->typdelim,
5152  &my_extra->typioparam, &my_extra->typiofunc);
5153  fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5154  fcinfo->flinfo->fn_mcxt);
5155  my_extra->element_type = element_type;
5156  }
5157  typlen = my_extra->typlen;
5158  typbyval = my_extra->typbyval;
5159  typalign = my_extra->typalign;
5160 
5161  p = ARR_DATA_PTR(v);
5162  bitmap = ARR_NULLBITMAP(v);
5163  bitmask = 1;
5164 
5165  for (i = 0; i < nitems; i++)
5166  {
5167  Datum itemvalue;
5168  char *value;
5169 
5170  /* Get source element, checking for NULL */
5171  if (bitmap && (*bitmap & bitmask) == 0)
5172  {
5173  /* if null_string is NULL, we just ignore null elements */
5174  if (null_string != NULL)
5175  {
5176  if (printed)
5177  appendStringInfo(&buf, "%s%s", fldsep, null_string);
5178  else
5179  appendStringInfoString(&buf, null_string);
5180  printed = true;
5181  }
5182  }
5183  else
5184  {
5185  itemvalue = fetch_att(p, typbyval, typlen);
5186 
5187  value = OutputFunctionCall(&my_extra->proc, itemvalue);
5188 
5189  if (printed)
5190  appendStringInfo(&buf, "%s%s", fldsep, value);
5191  else
5192  appendStringInfoString(&buf, value);
5193  printed = true;
5194 
5195  p = att_addlength_pointer(p, typlen, p);
5196  p = (char *) att_align_nominal(p, typalign);
5197  }
5198 
5199  /* advance bitmap pointer if any */
5200  if (bitmap)
5201  {
5202  bitmask <<= 1;
5203  if (bitmask == 0x100)
5204  {
5205  bitmap++;
5206  bitmask = 1;
5207  }
5208  }
5209  }
5210 
5211  result = cstring_to_text_with_len(buf.data, buf.len);
5212  pfree(buf.data);
5213 
5214  return result;
5215 }
5216 
5217 #define HEXBASE 16
5218 /*
5219  * Convert an int32 to a string containing a base 16 (hex) representation of
5220  * the number.
5221  */
5222 Datum
5224 {
5226  char *ptr;
5227  const char *digits = "0123456789abcdef";
5228  char buf[32]; /* bigger than needed, but reasonable */
5229 
5230  ptr = buf + sizeof(buf) - 1;
5231  *ptr = '\0';
5232 
5233  do
5234  {
5235  *--ptr = digits[value % HEXBASE];
5236  value /= HEXBASE;
5237  } while (ptr > buf && value);
5238 
5240 }
5241 
5242 /*
5243  * Convert an int64 to a string containing a base 16 (hex) representation of
5244  * the number.
5245  */
5246 Datum
5248 {
5249  uint64 value = (uint64) PG_GETARG_INT64(0);
5250  char *ptr;
5251  const char *digits = "0123456789abcdef";
5252  char buf[32]; /* bigger than needed, but reasonable */
5253 
5254  ptr = buf + sizeof(buf) - 1;
5255  *ptr = '\0';
5256 
5257  do
5258  {
5259  *--ptr = digits[value % HEXBASE];
5260  value /= HEXBASE;
5261  } while (ptr > buf && value);
5262 
5264 }
5265 
5266 /*
5267  * Return the size of a datum, possibly compressed
5268  *
5269  * Works on any data type
5270  */
5271 Datum
5273 {
5275  int32 result;
5276  int typlen;
5277 
5278  /* On first call, get the input type's typlen, and save at *fn_extra */
5279  if (fcinfo->flinfo->fn_extra == NULL)
5280  {
5281  /* Lookup the datatype of the supplied argument */
5282  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5283 
5284  typlen = get_typlen(argtypeid);
5285  if (typlen == 0) /* should not happen */
5286  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5287 
5288  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5289  sizeof(int));
5290  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5291  }
5292  else
5293  typlen = *((int *) fcinfo->flinfo->fn_extra);
5294 
5295  if (typlen == -1)
5296  {
5297  /* varlena type, possibly toasted */
5298  result = toast_datum_size(value);
5299  }
5300  else if (typlen == -2)
5301  {
5302  /* cstring */
5303  result = strlen(DatumGetCString(value)) + 1;
5304  }
5305  else
5306  {
5307  /* ordinary fixed-width type */
5308  result = typlen;
5309  }
5310 
5311  PG_RETURN_INT32(result);
5312 }
5313 
5314 /*
5315  * Return the compression method stored in the compressed attribute. Return
5316  * NULL for non varlena type or uncompressed data.
5317  */
5318 Datum
5320 {
5321  int typlen;
5322  char *result;
5323  ToastCompressionId cmid;
5324 
5325  /* On first call, get the input type's typlen, and save at *fn_extra */
5326  if (fcinfo->flinfo->fn_extra == NULL)
5327  {
5328  /* Lookup the datatype of the supplied argument */
5329  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5330 
5331  typlen = get_typlen(argtypeid);
5332  if (typlen == 0) /* should not happen */
5333  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5334 
5335  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5336  sizeof(int));
5337  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5338  }
5339  else
5340  typlen = *((int *) fcinfo->flinfo->fn_extra);
5341 
5342  if (typlen != -1)
5343  PG_RETURN_NULL();
5344 
5345  /* get the compression method id stored in the compressed varlena */
5346  cmid = toast_get_compression_id((struct varlena *)
5348  if (cmid == TOAST_INVALID_COMPRESSION_ID)
5349  PG_RETURN_NULL();
5350 
5351  /* convert compression method id to compression method name */
5352  switch (cmid)
5353  {
5355  result = "pglz";
5356  break;
5358  result = "lz4";
5359  break;
5360  default:
5361  elog(ERROR, "invalid compression method id %d", cmid);
5362  }
5363 
5365 }
5366 
5367 /*
5368  * string_agg - Concatenates values and returns string.
5369  *
5370  * Syntax: string_agg(value text, delimiter text) RETURNS text
5371  *
5372  * Note: Any NULL values are ignored. The first-call delimiter isn't
5373  * actually used at all, and on subsequent calls the delimiter precedes
5374  * the associated value.
5375  */
5376 
5377 /* subroutine to initialize state */
5378 static StringInfo
5380 {
5381  StringInfo state;
5382  MemoryContext aggcontext;
5383  MemoryContext oldcontext;
5384 
5385  if (!AggCheckCallContext(fcinfo, &aggcontext))
5386  {
5387  /* cannot be called directly because of internal-type argument */
5388  elog(ERROR, "string_agg_transfn called in non-aggregate context");
5389  }
5390 
5391  /*
5392  * Create state in aggregate context. It'll stay there across subsequent
5393  * calls.
5394  */
5395  oldcontext = MemoryContextSwitchTo(aggcontext);
5396  state = makeStringInfo();
5397  MemoryContextSwitchTo(oldcontext);
5398 
5399  return state;
5400 }
5401 
5402 Datum
5404 {
5405  StringInfo state;
5406 
5407  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5408 
5409  /* Append the value unless null. */
5410  if (!PG_ARGISNULL(1))
5411  {
5412  /* On the first time through, we ignore the delimiter. */
5413  if (state == NULL)
5414  state = makeStringAggState(fcinfo);
5415  else if (!PG_ARGISNULL(2))
5416  appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
5417 
5418  appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
5419  }
5420 
5421  /*
5422  * The transition type for string_agg() is declared to be "internal",
5423  * which is a pass-by-value type the same size as a pointer.
5424  */
5425  PG_RETURN_POINTER(state);
5426 }
5427 
5428 Datum
5430 {
5431  StringInfo state;
5432 
5433  /* cannot be called directly because of internal-type argument */
5434  Assert(AggCheckCallContext(fcinfo, NULL));
5435 
5436  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5437 
5438  if (state != NULL)
5440  else
5441  PG_RETURN_NULL();
5442 }
5443 
5444 /*
5445  * Prepare cache with fmgr info for the output functions of the datatypes of
5446  * the arguments of a concat-like function, beginning with argument "argidx".
5447  * (Arguments before that will have corresponding slots in the resulting
5448  * FmgrInfo array, but we don't fill those slots.)
5449  */
5450 static FmgrInfo *
5452 {
5453  FmgrInfo *foutcache;
5454  int i;
5455 
5456  /* We keep the info in fn_mcxt so it survives across calls */
5457  foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5458  PG_NARGS() * sizeof(FmgrInfo));
5459 
5460  for (i = argidx; i < PG_NARGS(); i++)
5461  {
5462  Oid valtype;
5463  Oid typOutput;
5464  bool typIsVarlena;
5465 
5466  valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5467  if (!OidIsValid(valtype))
5468  elog(ERROR, "could not determine data type of concat() input");
5469 
5470  getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5471  fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5472  }
5473 
5474  fcinfo->flinfo->fn_extra = foutcache;
5475 
5476  return foutcache;
5477 }
5478 
5479 /*
5480  * Implementation of both concat() and concat_ws().
5481  *
5482  * sepstr is the separator string to place between values.
5483  * argidx identifies the first argument to concatenate (counting from zero);
5484  * note that this must be constant across any one series of calls.
5485  *
5486  * Returns NULL if result should be NULL, else text value.
5487  */
5488 static text *
5489 concat_internal(const char *sepstr, int argidx,
5490  FunctionCallInfo fcinfo)
5491 {
5492  text *result;
5494  FmgrInfo *foutcache;
5495  bool first_arg = true;
5496  int i;
5497 
5498  /*
5499  * concat(VARIADIC some-array) is essentially equivalent to
5500  * array_to_text(), ie concat the array elements with the given separator.
5501  * So we just pass the case off to that code.
5502  */
5503  if (get_fn_expr_variadic(fcinfo->flinfo))
5504  {
5505  ArrayType *arr;
5506 
5507  /* Should have just the one argument */
5508  Assert(argidx == PG_NARGS() - 1);
5509 
5510  /* concat(VARIADIC NULL) is defined as NULL */
5511  if (PG_ARGISNULL(argidx))
5512  return NULL;
5513 
5514  /*
5515  * Non-null argument had better be an array. We assume that any call
5516  * context that could let get_fn_expr_variadic return true will have
5517  * checked that a VARIADIC-labeled parameter actually is an array. So
5518  * it should be okay to just Assert that it's an array rather than
5519  * doing a full-fledged error check.
5520  */
5522 
5523  /* OK, safe to fetch the array value */
5524  arr = PG_GETARG_ARRAYTYPE_P(argidx);
5525 
5526  /*
5527  * And serialize the array. We tell array_to_text to ignore null
5528  * elements, which matches the behavior of the loop below.
5529  */
5530  return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5531  }
5532 
5533  /* Normal case without explicit VARIADIC marker */
5534  initStringInfo(&str);
5535 
5536  /* Get output function info, building it if first time through */
5537  foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5538  if (foutcache == NULL)
5539  foutcache = build_concat_foutcache(fcinfo, argidx);
5540 
5541  for (i = argidx; i < PG_NARGS(); i++)
5542  {
5543  if (!PG_ARGISNULL(i))
5544  {
5546 
5547  /* add separator if appropriate */
5548  if (first_arg)
5549  first_arg = false;
5550  else
5551  appendStringInfoString(&str, sepstr);
5552 
5553  /* call the appropriate type output function, append the result */
5555  OutputFunctionCall(&foutcache[i], value));
5556  }
5557  }
5558 
5559  result = cstring_to_text_with_len(str.data, str.len);
5560  pfree(str.data);
5561 
5562  return result;
5563 }
5564 
5565 /*
5566  * Concatenate all arguments. NULL arguments are ignored.
5567  */
5568 Datum
5570 {
5571  text *result;
5572 
5573  result = concat_internal("", 0, fcinfo);
5574  if (result == NULL)
5575  PG_RETURN_NULL();
5576  PG_RETURN_TEXT_P(result);
5577 }
5578 
5579 /*
5580  * Concatenate all but first argument value with separators. The first
5581  * parameter is used as the separator. NULL arguments are ignored.
5582  */
5583 Datum
5585 {
5586  char *sep;
5587  text *result;
5588 
5589  /* return NULL when separator is NULL */
5590  if (PG_ARGISNULL(0))
5591  PG_RETURN_NULL();
5593 
5594  result = concat_internal(sep, 1, fcinfo);
5595  if (result == NULL)
5596  PG_RETURN_NULL();
5597  PG_RETURN_TEXT_P(result);
5598 }
5599 
5600 /*
5601  * Return first n characters in the string. When n is negative,
5602  * return all but last |n| characters.
5603  */
5604 Datum
5606 {
5607  int n = PG_GETARG_INT32(1);
5608 
5609  if (n < 0)
5610  {
5611  text *str = PG_GETARG_TEXT_PP(0);
5612  const char *p = VARDATA_ANY(str);
5613  int len = VARSIZE_ANY_EXHDR(str);
5614  int rlen;
5615 
5616  n = pg_mbstrlen_with_len(p, len) + n;
5617  rlen = pg_mbcharcliplen(p, len, n);
5619  }
5620  else
5622 }
5623 
5624 /*
5625  * Return last n characters in the string. When n is negative,
5626  * return all but first |n| characters.
5627  */
5628 Datum
5630 {
5631  text *str = PG_GETARG_TEXT_PP(0);
5632  const char *p = VARDATA_ANY(str);
5633  int len = VARSIZE_ANY_EXHDR(str);
5634  int n = PG_GETARG_INT32(1);
5635  int off;
5636 
5637  if (n < 0)
5638  n = -n;
5639  else
5640  n = pg_mbstrlen_with_len(p, len) - n;
5641  off = pg_mbcharcliplen(p, len, n);
5642 
5643  PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5644 }
5645 
5646 /*
5647  * Return reversed string
5648  */
5649 Datum
5651 {
5652  text *str = PG_GETARG_TEXT_PP(0);
5653  const char *p = VARDATA_ANY(str);
5654  int len = VARSIZE_ANY_EXHDR(str);
5655  const char *endp = p + len;
5656  text *result;
5657  char *dst;
5658 
5659  result = palloc(len + VARHDRSZ);
5660  dst = (char *) VARDATA(result) + len;
5661  SET_VARSIZE(result, len + VARHDRSZ);
5662 
5664  {
5665  /* multibyte version */
5666  while (p < endp)
5667  {
5668  int sz;
5669 
5670  sz = pg_mblen(p);
5671  dst -= sz;
5672  memcpy(dst, p, sz);
5673  p += sz;
5674  }
5675  }
5676  else
5677  {
5678  /* single byte version */
5679  while (p < endp)
5680  *(--dst) = *p++;
5681  }
5682 
5683  PG_RETURN_TEXT_P(result);
5684 }
5685 
5686 
/*
 * Support macros for text_format()
 */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * Advance "ptr" by one byte.  If that reaches or passes "end_ptr", the
 * format string ended in the middle of a conversion specifier, so report an
 * error.  Note that "ptr" is modified in place.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5700 
5701 /*
5702  * Returns a formatted string
5703  */
5704 Datum
5706 {
5707  text *fmt;
5709  const char *cp;
5710  const char *start_ptr;
5711  const char *end_ptr;
5712  text *result;
5713  int arg;
5714  bool funcvariadic;
5715  int nargs;
5716  Datum *elements = NULL;
5717  bool *nulls = NULL;
5718  Oid element_type = InvalidOid;
5719  Oid prev_type = InvalidOid;
5720  Oid prev_width_type = InvalidOid;
5721  FmgrInfo typoutputfinfo;
5722  FmgrInfo typoutputinfo_width;
5723 
5724  /* When format string is null, immediately return null */
5725  if (PG_ARGISNULL(0))
5726  PG_RETURN_NULL();
5727 
5728  /* If argument is marked VARIADIC, expand array into elements */
5729  if (get_fn_expr_variadic(fcinfo->flinfo))
5730  {
5731  ArrayType *arr;
5732  int16 elmlen;
5733  bool elmbyval;
5734  char elmalign;
5735  int nitems;
5736 
5737  /* Should have just the one argument */
5738  Assert(PG_NARGS() == 2);
5739 
5740  /* If argument is NULL, we treat it as zero-length array */
5741  if (PG_ARGISNULL(1))
5742  nitems = 0;
5743  else
5744  {
5745  /*
5746  * Non-null argument had better be an array. We assume that any
5747  * call context that could let get_fn_expr_variadic return true
5748  * will have checked that a VARIADIC-labeled parameter actually is
5749  * an array. So it should be okay to just Assert that it's an
5750  * array rather than doing a full-fledged error check.
5751  */
5753 
5754  /* OK, safe to fetch the array value */
5755  arr = PG_GETARG_ARRAYTYPE_P(1);
5756 
5757  /* Get info about array element type */
5758  element_type = ARR_ELEMTYPE(arr);
5759  get_typlenbyvalalign(element_type,
5760  &elmlen, &elmbyval, &elmalign);
5761 
5762  /* Extract all array elements */
5763  deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5764  &elements, &nulls, &nitems);
5765  }
5766 
5767  nargs = nitems + 1;
5768  funcvariadic = true;
5769  }
5770  else
5771  {
5772  /* Non-variadic case, we'll process the arguments individually */
5773  nargs = PG_NARGS();
5774  funcvariadic = false;
5775  }
5776 
5777  /* Setup for main loop. */
5778  fmt = PG_GETARG_TEXT_PP(0);
5779  start_ptr = VARDATA_ANY(fmt);
5780  end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5781  initStringInfo(&str);
5782  arg = 1; /* next argument position to print */
5783 
5784  /* Scan format string, looking for conversion specifiers. */
5785  for (cp = start_ptr; cp < end_ptr; cp++)
5786  {
5787  int argpos;
5788  int widthpos;
5789  int flags;
5790  int width;
5791  Datum value;
5792  bool isNull;
5793  Oid typid;
5794 
5795  /*
5796  * If it's not the start of a conversion specifier, just copy it to
5797  * the output buffer.
5798  */
5799  if (*cp != '%')
5800  {
5801  appendStringInfoCharMacro(&str, *cp);
5802  continue;
5803  }
5804 
5805  ADVANCE_PARSE_POINTER(cp, end_ptr);
5806 
5807  /* Easy case: %% outputs a single % */
5808  if (*cp == '%')
5809  {
5810  appendStringInfoCharMacro(&str, *cp);
5811  continue;
5812  }
5813 
5814  /* Parse the optional portions of the format specifier */
5815  cp = text_format_parse_format(cp, end_ptr,
5816  &argpos, &widthpos,
5817  &flags, &width);
5818 
5819  /*
5820  * Next we should see the main conversion specifier. Whether or not
5821  * an argument position was present, it's known that at least one
5822  * character remains in the string at this point. Experience suggests
5823  * that it's worth checking that that character is one of the expected
5824  * ones before we try to fetch arguments, so as to produce the least
5825  * confusing response to a mis-formatted specifier.
5826  */
5827  if (strchr("sIL", *cp) == NULL)
5828  ereport(ERROR,
5829  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5830  errmsg("unrecognized format() type specifier \"%.*s\"",
5831  pg_mblen(cp), cp),
5832  errhint("For a single \"%%\" use \"%%%%\".")));
5833 
5834  /* If indirect width was specified, get its value */
5835  if (widthpos >= 0)
5836  {
5837  /* Collect the specified or next argument position */
5838  if (widthpos > 0)
5839  arg = widthpos;
5840  if (arg >= nargs)
5841  ereport(ERROR,
5842  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5843  errmsg("too few arguments for format()")));
5844 
5845  /* Get the value and type of the selected argument */
5846  if (!funcvariadic)
5847  {
5848  value = PG_GETARG_DATUM(arg);
5849  isNull = PG_ARGISNULL(arg);
5850  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5851  }
5852  else
5853  {
5854  value = elements[arg - 1];
5855  isNull = nulls[arg - 1];
5856  typid = element_type;
5857  }
5858  if (!OidIsValid(typid))
5859  elog(ERROR, "could not determine data type of format() input");
5860 
5861  arg++;
5862 
5863  /* We can treat NULL width the same as zero */
5864  if (isNull)
5865  width = 0;
5866  else if (typid == INT4OID)
5867  width = DatumGetInt32(value);
5868  else if (typid == INT2OID)
5869  width = DatumGetInt16(value);
5870  else
5871  {
5872  /* For less-usual datatypes, convert to text then to int */
5873  char *str;
5874 
5875  if (typid != prev_width_type)
5876  {
5877  Oid typoutputfunc;
5878  bool typIsVarlena;
5879 
5880  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5881  fmgr_info(typoutputfunc, &typoutputinfo_width);
5882  prev_width_type = typid;
5883  }
5884 
5885  str = OutputFunctionCall(&typoutputinfo_width, value);
5886 
5887  /* pg_strtoint32 will complain about bad data or overflow */
5888  width = pg_strtoint32(str);
5889 
5890  pfree(str);
5891  }
5892  }
5893 
5894  /* Collect the specified or next argument position */
5895  if (argpos > 0)
5896  arg = argpos;
5897  if (arg >= nargs)
5898  ereport(ERROR,
5899  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5900  errmsg("too few arguments for format()")));
5901 
5902  /* Get the value and type of the selected argument */
5903  if (!funcvariadic)
5904  {
5905  value = PG_GETARG_DATUM(arg);
5906  isNull = PG_ARGISNULL(arg);
5907  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5908  }
5909  else
5910  {
5911  value = elements[arg - 1];
5912  isNull = nulls[arg - 1];
5913  typid = element_type;
5914  }
5915  if (!OidIsValid(typid))
5916  elog(ERROR, "could not determine data type of format() input");
5917 
5918  arg++;
5919 
5920  /*
5921  * Get the appropriate typOutput function, reusing previous one if
5922  * same type as previous argument. That's particularly useful in the
5923  * variadic-array case, but often saves work even for ordinary calls.
5924  */
5925  if (typid != prev_type)
5926  {
5927  Oid typoutputfunc;
5928  bool typIsVarlena;
5929 
5930  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5931  fmgr_info(typoutputfunc, &typoutputfinfo);
5932  prev_type = typid;
5933  }
5934 
5935  /*
5936  * And now we can format the value.
5937  */
5938  switch (*cp)
5939  {
5940  case 's':
5941  case 'I':
5942  case 'L':
5943  text_format_string_conversion(&str, *cp, &typoutputfinfo,
5944  value, isNull,
5945  flags, width);
5946  break;
5947  default:
5948  /* should not get here, because of previous check */
5949  ereport(ERROR,
5950  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5951  errmsg("unrecognized format() type specifier \"%.*s\"",
5952  pg_mblen(cp), cp),
5953  errhint("For a single \"%%\" use \"%%%%\".")));
5954  break;
5955  }
5956  }
5957 
5958  /* Don't need deconstruct_array results anymore. */
5959  if (elements != NULL)
5960  pfree(elements);
5961  if (nulls != NULL)
5962  pfree(nulls);
5963 
5964  /* Generate results. */
5965  result = cstring_to_text_with_len(str.data, str.len);
5966  pfree(str.data);
5967 
5968  PG_RETURN_TEXT_P(result);
5969 }
5970 
5971 /*
5972  * Parse contiguous digits as a decimal number.
5973  *
5974  * Returns true if some digits could be parsed.
5975  * The value is returned into *value, and *ptr is advanced to the next
5976  * character to be parsed.
5977  *
5978  * Note parsing invariant: at least one character is known available before
5979  * string end (end_ptr) at entry, and this is still true at exit.
5980  */
5981 static bool
5982 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5983 {
5984  bool found = false;
5985  const char *cp = *ptr;
5986  int val = 0;
5987 
5988  while (*cp >= '0' && *cp <= '9')
5989  {
5990  int8 digit = (*cp - '0');
5991 
5992  if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5993  unlikely(pg_add_s32_overflow(val, digit, &val)))
5994  ereport(ERROR,
5995  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5996  errmsg("number is out of range")));
5997  ADVANCE_PARSE_POINTER(cp, end_ptr);
5998  found = true;
5999  }
6000 
6001  *ptr = cp;
6002  *value = val;
6003 
6004  return found;
6005 }
6006 
6007 /*
6008  * Parse a format specifier (generally following the SUS printf spec).
6009  *
6010  * We have already advanced over the initial '%', and we are looking for
6011  * [argpos][flags][width]type (but the type character is not consumed here).
6012  *
6013  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
6014  * Output parameters:
6015  * argpos: argument position for value to be printed. -1 means unspecified.
6016  * widthpos: argument position for width. Zero means the argument position
6017  * was unspecified (ie, take the next arg) and -1 means no width
6018  * argument (width was omitted or specified as a constant).
6019  * flags: bitmask of flags.
6020  * width: directly-specified width value. Zero means the width was omitted
6021  * (note it's not necessary to distinguish this case from an explicit
6022  * zero width value).
6023  *
6024  * The function result is the next character position to be parsed, ie, the
6025  * location where the type character is/should be.
6026  *
6027  * Note parsing invariant: at least one character is known available before
6028  * string end (end_ptr) at entry, and this is still true at exit.
6029  */
6030 static const char *
6031 text_format_parse_format(const char *start_ptr, const char *end_ptr,
6032  int *argpos, int *widthpos,
6033  int *flags, int *width)
6034 {
6035  const char *cp = start_ptr;
6036  int n;
6037 
6038  /* set defaults for output parameters */
6039  *argpos = -1;
6040  *widthpos = -1;
6041  *flags = 0;
6042  *width = 0;
6043 
6044  /* try to identify first number */
6045  if (text_format_parse_digits(&cp, end_ptr, &n))
6046  {
6047  if (*cp != '$')
6048  {
6049  /* Must be just a width and a type, so we're done */
6050  *width = n;
6051  return cp;
6052  }
6053  /* The number was argument position */
6054  *argpos = n;
6055  /* Explicit 0 for argument index is immediately refused */
6056  if (n == 0)
6057  ereport(ERROR,
6058  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6059  errmsg("format specifies argument 0, but arguments are numbered from 1")));
6060  ADVANCE_PARSE_POINTER(cp, end_ptr);
6061  }
6062 
6063  /* Handle flags (only minus is supported now) */
6064  while (*cp == '-')
6065  {
6066  *flags |= TEXT_FORMAT_FLAG_MINUS;
6067  ADVANCE_PARSE_POINTER(cp, end_ptr);
6068  }
6069 
6070  if (*cp == '*')
6071  {
6072  /* Handle indirect width */
6073  ADVANCE_PARSE_POINTER(cp, end_ptr);
6074  if (text_format_parse_digits(&cp, end_ptr, &n))
6075  {
6076  /* number in this position must be closed by $ */
6077  if (*cp != '$')
6078  ereport(ERROR,
6079  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6080  errmsg("width argument position must be ended by \"$\"")));
6081  /* The number was width argument position */
6082  *widthpos = n;
6083  /* Explicit 0 for argument index is immediately refused */
6084  if (n == 0)
6085  ereport(ERROR,
6086  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6087  errmsg("format specifies argument 0, but arguments are numbered from 1")));
6088  ADVANCE_PARSE_POINTER(cp, end_ptr);
6089  }
6090  else
6091  *widthpos = 0; /* width's argument position is unspecified */
6092  }
6093  else
6094  {
6095  /* Check for direct width specification */
6096  if (text_format_parse_digits(&cp, end_ptr, &n))
6097  *width = n;
6098  }
6099 
6100  /* cp should now be pointing at type character */
6101  return cp;
6102 }
6103 
6104 /*
6105  * Format a %s, %I, or %L conversion
6106  */
6107 static void
6109  FmgrInfo *typOutputInfo,
6110  Datum value, bool isNull,
6111  int flags, int width)
6112 {
6113  char *str;
6114 
6115  /* Handle NULL arguments before trying to stringify the value. */
6116  if (isNull)
6117  {
6118  if (conversion == 's')
6119  text_format_append_string(buf, "", flags, width);
6120  else if (conversion == 'L')
6121  text_format_append_string(buf, "NULL", flags, width);
6122  else if (conversion == 'I')
6123  ereport(ERROR,
6124  (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6125  errmsg("null values cannot be formatted as an SQL identifier")));
6126  return;
6127  }
6128 
6129  /* Stringify. */
6130  str = OutputFunctionCall(typOutputInfo, value);
6131 
6132  /* Escape. */
6133  if (conversion == 'I')
6134  {
6135  /* quote_identifier may or may not allocate a new string. */
6136  text_format_append_string(buf, quote_identifier(str), flags, width);
6137  }
6138  else if (conversion == 'L')
6139  {
6140  char *qstr = quote_literal_cstr(str);
6141 
6142  text_format_append_string(buf, qstr, flags, width);
6143  /* quote_literal_cstr() always allocates a new string */
6144  pfree(qstr);
6145  }
6146  else
6147  text_format_append_string(buf, str, flags, width);
6148 
6149  /* Cleanup. */
6150  pfree(str);
6151 }
6152 
6153 /*
6154  * Append str to buf, padding as directed by flags/width
6155  */
6156 static void
6158  int flags, int width)
6159 {
6160  bool align_to_left = false;
6161  int len;
6162 
6163  /* fast path for typical easy case */
6164  if (width == 0)
6165  {
6166  appendStringInfoString(buf, str);
6167  return;
6168  }
6169 
6170  if (width < 0)
6171  {
6172  /* Negative width: implicit '-' flag, then take absolute value */
6173  align_to_left = true;
6174  /* -INT_MIN is undefined */
6175  if (width <= INT_MIN)
6176  ereport(ERROR,
6177  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6178  errmsg("number is out of range")));
6179  width = -width;
6180  }
6181  else if (flags & TEXT_FORMAT_FLAG_MINUS)
6182  align_to_left = true;
6183 
6184  len = pg_mbstrlen(str);
6185  if (align_to_left)
6186  {
6187  /* left justify */
6188  appendStringInfoString(buf, str);
6189  if (len < width)
6190  appendStringInfoSpaces(buf, width - len);
6191  }
6192  else
6193  {
6194  /* right justify */
6195  if (len < width)
6196  appendStringInfoSpaces(buf, width - len);
6197  appendStringInfoString(buf, str);
6198  }
6199 }
6200 
6201 /*
6202  * text_format_nv - nonvariadic wrapper for text_format function.
6203  *
6204  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6205  * which checks that all built-in functions that share the implementing C
6206  * function take the same number of arguments.
6207  */
6208 Datum
6210 {
6211  return text_format(fcinfo);
6212 }
6213 
/*
 * Helper function for Levenshtein distance functions.
 *
 * Returns true if the first "len" bytes of s1 and s2 are identical,
 * comparing from the last byte back towards the first.  A length of zero
 * or less compares nothing and yields true.
 *
 * Faster than memcmp(), for this use case.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			remaining;

	for (remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
6229 
6230 /* Expand each Levenshtein distance variant */
6231 #include "levenshtein.c"
6232 #define LEVENSHTEIN_LESS_EQUAL
6233 #include "levenshtein.c"
6234 
6235 
6236 /*
6237  * Unicode support
6238  */
6239 
6241 unicode_norm_form_from_string(const char *formstr)
6242 {
6243  UnicodeNormalizationForm form = -1;
6244 
6245  /*
6246  * Might as well check this while we're here.
6247  */
6248  if (GetDatabaseEncoding() != PG_UTF8)
6249  ereport(ERROR,
6250  (errcode(ERRCODE_SYNTAX_ERROR),
6251  errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6252 
6253  if (pg_strcasecmp(formstr, "NFC") == 0)
6254  form = UNICODE_NFC;
6255  e