PostgreSQL Source Code  git master
varlena.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  * Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/detoast.h"
21 #include "catalog/pg_collation.h"
22 #include "catalog/pg_type.h"
23 #include "common/hashfn.h"
24 #include "common/hex.h"
25 #include "common/int.h"
26 #include "common/unicode_norm.h"
27 #include "lib/hyperloglog.h"
28 #include "libpq/pqformat.h"
29 #include "miscadmin.h"
30 #include "nodes/execnodes.h"
31 #include "parser/scansup.h"
32 #include "port/pg_bswap.h"
33 #include "regex/regex.h"
34 #include "utils/builtins.h"
35 #include "utils/bytea.h"
36 #include "utils/lsyscache.h"
37 #include "utils/memutils.h"
38 #include "utils/pg_locale.h"
39 #include "utils/sortsupport.h"
40 #include "utils/varlena.h"
41 
42 
43 /* GUC variable */
45 
46 typedef struct varlena unknown;
47 typedef struct varlena VarString;
48 
49 /*
50  * State for text_position_* functions.
51  */
52 typedef struct
53 {
54  bool is_multibyte; /* T if multibyte encoding */
55  bool is_multibyte_char_in_char; /* need to check char boundaries? */
56 
57  char *str1; /* haystack string */
58  char *str2; /* needle string */
59  int len1; /* string lengths in bytes */
60  int len2;
61 
62  /* Skip table for Boyer-Moore-Horspool search algorithm: */
63  int skiptablemask; /* mask for ANDing with skiptable subscripts */
64  int skiptable[256]; /* skip distance for given mismatched char */
65 
66  char *last_match; /* pointer to last match in 'str1' */
67 
68  /*
69  * Sometimes we need to convert the byte position of a match to a
70  * character position. These store the last position that was converted,
71  * so that on the next call, we can continue from that point, rather than
72  * count characters from the very beginning.
73  */
74  char *refpoint; /* pointer within original haystack string */
75  int refpos; /* 0-based character offset of the same point */
77 
78 typedef struct
79 {
80  char *buf1; /* 1st string, or abbreviation original string
81  * buf */
82  char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
83  int buflen1;
84  int buflen2;
85  int last_len1; /* Length of last buf1 string/strxfrm() input */
86  int last_len2; /* Length of last buf2 string/strxfrm() blob */
87  int last_returned; /* Last comparison result (cache) */
88  bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
89  bool collate_c;
90  Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
91  hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
92  hyperLogLogState full_card; /* Full key cardinality state */
93  double prop_card; /* Required cardinality proportion */
96 
97 /*
98  * Output data for split_text(): we output either to an array or a table.
99  * tupstore and tupdesc must be set up in advance to output to a table.
100  */
101 typedef struct
102 {
107 
108 /*
109  * This should be large enough that most strings will fit, but small enough
110  * that we feel comfortable putting it on the stack
111  */
112 #define TEXTBUFLEN 1024
113 
114 #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X))
115 #define DatumGetUnknownPCopy(X) ((unknown *) PG_DETOAST_DATUM_COPY(X))
116 #define PG_GETARG_UNKNOWN_P(n) DatumGetUnknownP(PG_GETARG_DATUM(n))
117 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
118 #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x)
119 
120 #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
121 #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
122 
123 static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
124 static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
125 static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
126 static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
127 static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
128 static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
129 static int varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
130 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
131 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
132 static int32 text_length(Datum str);
133 static text *text_catenate(text *t1, text *t2);
134 static text *text_substring(Datum str,
135  int32 start,
136  int32 length,
137  bool length_not_specified);
138 static text *text_overlay(text *t1, text *t2, int sp, int sl);
139 static int text_position(text *t1, text *t2, Oid collid);
140 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
142 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
146 static void check_collation_set(Oid collid);
147 static int text_cmp(text *arg1, text *arg2, Oid collid);
148 static bytea *bytea_catenate(bytea *t1, bytea *t2);
150  int S,
151  int L,
152  bool length_not_specified);
153 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
154 static void appendStringInfoText(StringInfo str, const text *t);
155 static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
156 static void split_text_accum_result(SplitTextOutputData *tstate,
157  text *field_value,
158  text *null_string,
159  Oid collation);
161  const char *fldsep, const char *null_string);
163 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
164  int *value);
165 static const char *text_format_parse_format(const char *start_ptr,
166  const char *end_ptr,
167  int *argpos, int *widthpos,
168  int *flags, int *width);
169 static void text_format_string_conversion(StringInfo buf, char conversion,
170  FmgrInfo *typOutputInfo,
171  Datum value, bool isNull,
172  int flags, int width);
173 static void text_format_append_string(StringInfo buf, const char *str,
174  int flags, int width);
175 
176 
177 /*****************************************************************************
178  * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
179  *****************************************************************************/
180 
181 /*
182  * cstring_to_text
183  *
184  * Create a text value from a null-terminated C string.
185  *
186  * The new text value is freshly palloc'd with a full-size VARHDR.
187  */
188 text *
189 cstring_to_text(const char *s)
190 {
191  return cstring_to_text_with_len(s, strlen(s));
192 }
193 
194 /*
195  * cstring_to_text_with_len
196  *
197  * Same as cstring_to_text except the caller specifies the string length;
198 * the string need not be null-terminated.
199  */
200 text *
201 cstring_to_text_with_len(const char *s, int len)
202 {
203  text *result = (text *) palloc(len + VARHDRSZ);
204 
205  SET_VARSIZE(result, len + VARHDRSZ);
206  memcpy(VARDATA(result), s, len);
207 
208  return result;
209 }
210 
211 /*
212  * text_to_cstring
213  *
214  * Create a palloc'd, null-terminated C string from a text value.
215  *
216  * We support being passed a compressed or toasted text value.
217  * This is a bit bogus since such values shouldn't really be referred to as
218  * "text *", but it seems useful for robustness. If we didn't handle that
219  * case here, we'd need another routine that did, anyway.
220  */
221 char *
223 {
224  /* must cast away the const, unfortunately */
225  text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
226  int len = VARSIZE_ANY_EXHDR(tunpacked);
227  char *result;
228 
229  result = (char *) palloc(len + 1);
230  memcpy(result, VARDATA_ANY(tunpacked), len);
231  result[len] = '\0';
232 
233  if (tunpacked != t)
234  pfree(tunpacked);
235 
236  return result;
237 }
238 
239 /*
240  * text_to_cstring_buffer
241  *
242  * Copy a text value into a caller-supplied buffer of size dst_len.
243  *
244  * The text string is truncated if necessary to fit. The result is
245  * guaranteed null-terminated (unless dst_len == 0).
246  *
247  * We support being passed a compressed or toasted text value.
248  * This is a bit bogus since such values shouldn't really be referred to as
249  * "text *", but it seems useful for robustness. If we didn't handle that
250  * case here, we'd need another routine that did, anyway.
251  */
252 void
253 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
254 {
255  /* must cast away the const, unfortunately */
256  text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
257  size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
258 
259  if (dst_len > 0)
260  {
261  dst_len--;
262  if (dst_len >= src_len)
263  dst_len = src_len;
264  else /* ensure truncation is encoding-safe */
265  dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
266  memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
267  dst[dst_len] = '\0';
268  }
269 
270  if (srcunpacked != src)
271  pfree(srcunpacked);
272 }
273 
274 
275 /*****************************************************************************
276  * USER I/O ROUTINES *
277  *****************************************************************************/
278 
279 
280 #define VAL(CH) ((CH) - '0')
281 #define DIG(VAL) ((VAL) + '0')
282 
283 /*
284  * byteain - converts from printable representation of byte array
285  *
286  * Non-printable characters must be passed as '\nnn' (octal) and are
287  * converted to internal form. '\' must be passed as '\\'.
288  * ereport(ERROR, ...) if bad form.
289  *
290  * BUGS:
291  * The input is scanned twice.
292  * The error checking of input is minimal.
293  */
294 Datum
296 {
297  char *inputText = PG_GETARG_CSTRING(0);
298  char *tp;
299  char *rp;
300  int bc;
301  bytea *result;
302 
303  /* Recognize hex input */
304  if (inputText[0] == '\\' && inputText[1] == 'x')
305  {
306  size_t len = strlen(inputText);
307  uint64 dstlen = pg_hex_dec_len(len - 2);
308 
309  bc = dstlen + VARHDRSZ; /* maximum possible length */
310  result = palloc(bc);
311 
312  bc = pg_hex_decode(inputText + 2, len - 2, VARDATA(result), dstlen);
313  SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
314 
315  PG_RETURN_BYTEA_P(result);
316  }
317 
318  /* Else, it's the traditional escaped style */
319  for (bc = 0, tp = inputText; *tp != '\0'; bc++)
320  {
321  if (tp[0] != '\\')
322  tp++;
323  else if ((tp[0] == '\\') &&
324  (tp[1] >= '0' && tp[1] <= '3') &&
325  (tp[2] >= '0' && tp[2] <= '7') &&
326  (tp[3] >= '0' && tp[3] <= '7'))
327  tp += 4;
328  else if ((tp[0] == '\\') &&
329  (tp[1] == '\\'))
330  tp += 2;
331  else
332  {
333  /*
334  * one backslash, not followed by another or ### valid octal
335  */
336  ereport(ERROR,
337  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
338  errmsg("invalid input syntax for type %s", "bytea")));
339  }
340  }
341 
342  bc += VARHDRSZ;
343 
344  result = (bytea *) palloc(bc);
345  SET_VARSIZE(result, bc);
346 
347  tp = inputText;
348  rp = VARDATA(result);
349  while (*tp != '\0')
350  {
351  if (tp[0] != '\\')
352  *rp++ = *tp++;
353  else if ((tp[0] == '\\') &&
354  (tp[1] >= '0' && tp[1] <= '3') &&
355  (tp[2] >= '0' && tp[2] <= '7') &&
356  (tp[3] >= '0' && tp[3] <= '7'))
357  {
358  bc = VAL(tp[1]);
359  bc <<= 3;
360  bc += VAL(tp[2]);
361  bc <<= 3;
362  *rp++ = bc + VAL(tp[3]);
363 
364  tp += 4;
365  }
366  else if ((tp[0] == '\\') &&
367  (tp[1] == '\\'))
368  {
369  *rp++ = '\\';
370  tp += 2;
371  }
372  else
373  {
374  /*
375  * We should never get here. The first pass should not allow it.
376  */
377  ereport(ERROR,
378  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
379  errmsg("invalid input syntax for type %s", "bytea")));
380  }
381  }
382 
383  PG_RETURN_BYTEA_P(result);
384 }
385 
386 /*
387  * byteaout - converts to printable representation of byte array
388  *
389  * In the traditional escaped format, non-printable characters are
390  * printed as '\nnn' (octal) and '\' as '\\'.
391  */
392 Datum
394 {
395  bytea *vlena = PG_GETARG_BYTEA_PP(0);
396  char *result;
397  char *rp;
398 
400  {
401  uint64 dstlen = pg_hex_enc_len(VARSIZE_ANY_EXHDR(vlena));
402 
403  /* Print hex format */
404  rp = result = palloc(dstlen + 2 + 1);
405  *rp++ = '\\';
406  *rp++ = 'x';
407 
408  rp += pg_hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp,
409  dstlen);
410  }
411  else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
412  {
413  /* Print traditional escaped format */
414  char *vp;
415  uint64 len;
416  int i;
417 
418  len = 1; /* empty string has 1 char */
419  vp = VARDATA_ANY(vlena);
420  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
421  {
422  if (*vp == '\\')
423  len += 2;
424  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
425  len += 4;
426  else
427  len++;
428  }
429 
430  /*
431  * In principle len can't overflow uint32 if the input fit in 1GB, but
432  * for safety let's check rather than relying on palloc's internal
433  * check.
434  */
435  if (len > MaxAllocSize)
436  ereport(ERROR,
437  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
438  errmsg_internal("result of bytea output conversion is too large")));
439  rp = result = (char *) palloc(len);
440 
441  vp = VARDATA_ANY(vlena);
442  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
443  {
444  if (*vp == '\\')
445  {
446  *rp++ = '\\';
447  *rp++ = '\\';
448  }
449  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
450  {
451  int val; /* holds unprintable chars */
452 
453  val = *vp;
454  rp[0] = '\\';
455  rp[3] = DIG(val & 07);
456  val >>= 3;
457  rp[2] = DIG(val & 07);
458  val >>= 3;
459  rp[1] = DIG(val & 03);
460  rp += 4;
461  }
462  else
463  *rp++ = *vp;
464  }
465  }
466  else
467  {
468  elog(ERROR, "unrecognized bytea_output setting: %d",
469  bytea_output);
470  rp = result = NULL; /* keep compiler quiet */
471  }
472  *rp = '\0';
473  PG_RETURN_CSTRING(result);
474 }
475 
476 /*
477  * bytearecv - converts external binary format to bytea
478  */
479 Datum
481 {
483  bytea *result;
484  int nbytes;
485 
486  nbytes = buf->len - buf->cursor;
487  result = (bytea *) palloc(nbytes + VARHDRSZ);
488  SET_VARSIZE(result, nbytes + VARHDRSZ);
489  pq_copymsgbytes(buf, VARDATA(result), nbytes);
490  PG_RETURN_BYTEA_P(result);
491 }
492 
493 /*
494  * byteasend - converts bytea to binary format
495  *
496  * This is a special case: just copy the input...
497  */
498 Datum
500 {
501  bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
502 
503  PG_RETURN_BYTEA_P(vlena);
504 }
505 
506 Datum
508 {
510 
511  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
512 
513  /* Append the value unless null. */
514  if (!PG_ARGISNULL(1))
515  {
517 
518  /* On the first time through, we ignore the delimiter. */
519  if (state == NULL)
520  state = makeStringAggState(fcinfo);
521  else if (!PG_ARGISNULL(2))
522  {
523  bytea *delim = PG_GETARG_BYTEA_PP(2);
524 
526  }
527 
529  }
530 
531  /*
532  * The transition type for string_agg() is declared to be "internal",
533  * which is a pass-by-value type the same size as a pointer.
534  */
535  PG_RETURN_POINTER(state);
536 }
537 
538 Datum
540 {
542 
543  /* cannot be called directly because of internal-type argument */
544  Assert(AggCheckCallContext(fcinfo, NULL));
545 
546  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
547 
548  if (state != NULL)
549  {
550  bytea *result;
551 
552  result = (bytea *) palloc(state->len + VARHDRSZ);
553  SET_VARSIZE(result, state->len + VARHDRSZ);
554  memcpy(VARDATA(result), state->data, state->len);
555  PG_RETURN_BYTEA_P(result);
556  }
557  else
558  PG_RETURN_NULL();
559 }
560 
561 /*
562  * textin - converts "..." to internal representation
563  */
564 Datum
566 {
567  char *inputText = PG_GETARG_CSTRING(0);
568 
569  PG_RETURN_TEXT_P(cstring_to_text(inputText));
570 }
571 
572 /*
573  * textout - converts internal representation to "..."
574  */
575 Datum
577 {
578  Datum txt = PG_GETARG_DATUM(0);
579 
581 }
582 
583 /*
584  * textrecv - converts external binary format to text
585  */
586 Datum
588 {
590  text *result;
591  char *str;
592  int nbytes;
593 
594  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
595 
596  result = cstring_to_text_with_len(str, nbytes);
597  pfree(str);
598  PG_RETURN_TEXT_P(result);
599 }
600 
601 /*
602  * textsend - converts text to binary format
603  */
604 Datum
606 {
607  text *t = PG_GETARG_TEXT_PP(0);
609 
610  pq_begintypsend(&buf);
613 }
614 
615 
616 /*
617  * unknownin - converts "..." to internal representation
618  */
619 Datum
621 {
622  char *str = PG_GETARG_CSTRING(0);
623 
624  /* representation is same as cstring */
626 }
627 
628 /*
629  * unknownout - converts internal representation to "..."
630  */
631 Datum
633 {
634  /* representation is same as cstring */
635  char *str = PG_GETARG_CSTRING(0);
636 
638 }
639 
640 /*
641  * unknownrecv - converts external binary format to unknown
642  */
643 Datum
645 {
647  char *str;
648  int nbytes;
649 
650  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
651  /* representation is same as cstring */
652  PG_RETURN_CSTRING(str);
653 }
654 
655 /*
656  * unknownsend - converts unknown to binary format
657  */
658 Datum
660 {
661  /* representation is same as cstring */
662  char *str = PG_GETARG_CSTRING(0);
664 
665  pq_begintypsend(&buf);
666  pq_sendtext(&buf, str, strlen(str));
668 }
669 
670 
671 /* ========== PUBLIC ROUTINES ========== */
672 
673 /*
674  * textlen -
675  * returns the logical length of a text*
676  * (which is less than the VARSIZE of the text*)
677  */
678 Datum
680 {
682 
683  /* try to avoid decompressing argument */
685 }
686 
687 /*
688  * text_length -
689  * Does the real work for textlen()
690  *
691  * This is broken out so it can be called directly by other string processing
692  * functions. Note that the argument is passed as a Datum, to indicate that
693  * it may still be in compressed form. We can avoid decompressing it at all
694  * in some cases.
695  */
696 static int32
698 {
699  /* fastpath when max encoding length is one */
702  else
703  {
704  text *t = DatumGetTextPP(str);
705 
707  VARSIZE_ANY_EXHDR(t)));
708  }
709 }
710 
711 /*
712  * textoctetlen -
713  * returns the physical length of a text*
714  * (which is less than the VARSIZE of the text*)
715  */
716 Datum
718 {
720 
721  /* We need not detoast the input at all */
723 }
724 
725 /*
726  * textcat -
727  * takes two text* and returns a text* that is the concatenation of
728  * the two.
729  *
730  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
731  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
732  * Allocate space for output in all cases.
733  * XXX - thomas 1997-07-10
734  */
735 Datum
737 {
738  text *t1 = PG_GETARG_TEXT_PP(0);
739  text *t2 = PG_GETARG_TEXT_PP(1);
740 
742 }
743 
744 /*
745  * text_catenate
746  * Guts of textcat(), broken out so it can be used by other functions
747  *
748  * Arguments can be in short-header form, but not compressed or out-of-line
749  */
750 static text *
752 {
753  text *result;
754  int len1,
755  len2,
756  len;
757  char *ptr;
758 
759  len1 = VARSIZE_ANY_EXHDR(t1);
760  len2 = VARSIZE_ANY_EXHDR(t2);
761 
762  /* paranoia ... probably should throw error instead? */
763  if (len1 < 0)
764  len1 = 0;
765  if (len2 < 0)
766  len2 = 0;
767 
768  len = len1 + len2 + VARHDRSZ;
769  result = (text *) palloc(len);
770 
771  /* Set size of result string... */
772  SET_VARSIZE(result, len);
773 
774  /* Fill data field of result string... */
775  ptr = VARDATA(result);
776  if (len1 > 0)
777  memcpy(ptr, VARDATA_ANY(t1), len1);
778  if (len2 > 0)
779  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
780 
781  return result;
782 }
783 
784 /*
785  * charlen_to_bytelen()
786  * Compute the number of bytes occupied by n characters starting at *p
787  *
788  * It is caller's responsibility that there actually are n characters;
789  * the string need not be null-terminated.
790  */
791 static int
792 charlen_to_bytelen(const char *p, int n)
793 {
795  {
796  /* Optimization for single-byte encodings */
797  return n;
798  }
799  else
800  {
801  const char *s;
802 
803  for (s = p; n > 0; n--)
804  s += pg_mblen(s);
805 
806  return s - p;
807  }
808 }
809 
810 /*
811  * text_substr()
812  * Return a substring starting at the specified position.
813  * - thomas 1997-12-31
814  *
815  * Input:
816  * - string
817  * - starting position (is one-based)
818  * - string length
819  *
820  * If the starting position is zero or less, then return from the start of the string
821  * adjusting the length to be consistent with the "negative start" per SQL.
822  * If the length is less than zero, return the remaining string.
823  *
824  * Added multibyte support.
825  * - Tatsuo Ishii 1998-4-21
826  * Changed behavior if starting position is less than one to conform to SQL behavior.
827  * Formerly returned the entire string; now returns a portion.
828  * - Thomas Lockhart 1998-12-10
829  * Now uses faster TOAST-slicing interface
830  * - John Gray 2002-02-22
831  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
832  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
833  * error; if E < 1, return '', not entire string). Fixed MB related bug when
834  * S > LC and < LC + 4 sometimes garbage characters are returned.
835  * - Joe Conway 2002-08-10
836  */
837 Datum
839 {
841  PG_GETARG_INT32(1),
842  PG_GETARG_INT32(2),
843  false));
844 }
845 
846 /*
847  * text_substr_no_len -
848  * Wrapper to avoid opr_sanity failure due to
849  * one function accepting a different number of args.
850  */
851 Datum
853 {
855  PG_GETARG_INT32(1),
856  -1, true));
857 }
858 
859 /*
860  * text_substring -
861  * Does the real work for text_substr() and text_substr_no_len()
862  *
863  * This is broken out so it can be called directly by other string processing
864  * functions. Note that the argument is passed as a Datum, to indicate that
865  * it may still be in compressed/toasted form. We can avoid detoasting all
866  * of it in some cases.
867  *
868  * The result is always a freshly palloc'd datum.
869  */
870 static text *
871 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
872 {
874  int32 S = start; /* start position */
875  int32 S1; /* adjusted start position */
876  int32 L1; /* adjusted substring length */
877  int32 E; /* end position */
878 
879  /*
880  * SQL99 says S can be zero or negative, but we still must fetch from the
881  * start of the string.
882  */
883  S1 = Max(S, 1);
884 
885  /* life is easy if the encoding max length is 1 */
886  if (eml == 1)
887  {
888  if (length_not_specified) /* special case - get length to end of
889  * string */
890  L1 = -1;
891  else if (length < 0)
892  {
893  /* SQL99 says to throw an error for E < S, i.e., negative length */
894  ereport(ERROR,
895  (errcode(ERRCODE_SUBSTRING_ERROR),
896  errmsg("negative substring length not allowed")));
897  L1 = -1; /* silence stupider compilers */
898  }
899  else if (pg_add_s32_overflow(S, length, &E))
900  {
901  /*
902  * L could be large enough for S + L to overflow, in which case
903  * the substring must run to end of string.
904  */
905  L1 = -1;
906  }
907  else
908  {
909  /*
910  * A zero or negative value for the end position can happen if the
911  * start was negative or one. SQL99 says to return a zero-length
912  * string.
913  */
914  if (E < 1)
915  return cstring_to_text("");
916 
917  L1 = E - S1;
918  }
919 
920  /*
921  * If the start position is past the end of the string, SQL99 says to
922  * return a zero-length string -- DatumGetTextPSlice() will do that
923  * for us. We need only convert S1 to zero-based starting position.
924  */
925  return DatumGetTextPSlice(str, S1 - 1, L1);
926  }
927  else if (eml > 1)
928  {
929  /*
930  * When encoding max length is > 1, we can't get LC without
931  * detoasting, so we'll grab a conservatively large slice now and go
932  * back later to do the right thing
933  */
934  int32 slice_start;
935  int32 slice_size;
936  int32 slice_strlen;
937  text *slice;
938  int32 E1;
939  int32 i;
940  char *p;
941  char *s;
942  text *ret;
943 
944  /*
945  * We need to start at position zero because there is no way to know
946  * in advance which byte offset corresponds to the supplied start
947  * position.
948  */
949  slice_start = 0;
950 
951  if (length_not_specified) /* special case - get length to end of
952  * string */
953  slice_size = L1 = -1;
954  else if (length < 0)
955  {
956  /* SQL99 says to throw an error for E < S, i.e., negative length */
957  ereport(ERROR,
958  (errcode(ERRCODE_SUBSTRING_ERROR),
959  errmsg("negative substring length not allowed")));
960  slice_size = L1 = -1; /* silence stupider compilers */
961  }
962  else if (pg_add_s32_overflow(S, length, &E))
963  {
964  /*
965  * L could be large enough for S + L to overflow, in which case
966  * the substring must run to end of string.
967  */
968  slice_size = L1 = -1;
969  }
970  else
971  {
972  /*
973  * A zero or negative value for the end position can happen if the
974  * start was negative or one. SQL99 says to return a zero-length
975  * string.
976  */
977  if (E < 1)
978  return cstring_to_text("");
979 
980  /*
981  * if E is past the end of the string, the tuple toaster will
982  * truncate the length for us
983  */
984  L1 = E - S1;
985 
986  /*
987  * Total slice size in bytes can't be any longer than the start
988  * position plus substring length times the encoding max length.
989  * If that overflows, we can just use -1.
990  */
991  if (pg_mul_s32_overflow(E, eml, &slice_size))
992  slice_size = -1;
993  }
994 
995  /*
996  * If we're working with an untoasted source, no need to do an extra
997  * copying step.
998  */
1001  slice = DatumGetTextPSlice(str, slice_start, slice_size);
1002  else
1003  slice = (text *) DatumGetPointer(str);
1004 
1005  /* see if we got back an empty string */
1006  if (VARSIZE_ANY_EXHDR(slice) == 0)
1007  {
1008  if (slice != (text *) DatumGetPointer(str))
1009  pfree(slice);
1010  return cstring_to_text("");
1011  }
1012 
1013  /* Now we can get the actual length of the slice in MB characters */
1014  slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1015  VARSIZE_ANY_EXHDR(slice));
1016 
1017  /*
1018  * Check that the start position wasn't > slice_strlen. If so, SQL99
1019  * says to return a zero-length string.
1020  */
1021  if (S1 > slice_strlen)
1022  {
1023  if (slice != (text *) DatumGetPointer(str))
1024  pfree(slice);
1025  return cstring_to_text("");
1026  }
1027 
1028  /*
1029  * Adjust L1 and E1 now that we know the slice string length. Again
1030  * remember that S1 is one based, and slice_start is zero based.
1031  */
1032  if (L1 > -1)
1033  E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1034  else
1035  E1 = slice_start + 1 + slice_strlen;
1036 
1037  /*
1038  * Find the start position in the slice; remember S1 is not zero based
1039  */
1040  p = VARDATA_ANY(slice);
1041  for (i = 0; i < S1 - 1; i++)
1042  p += pg_mblen(p);
1043 
1044  /* hang onto a pointer to our start position */
1045  s = p;
1046 
1047  /*
1048  * Count the actual bytes used by the substring of the requested
1049  * length.
1050  */
1051  for (i = S1; i < E1; i++)
1052  p += pg_mblen(p);
1053 
1054  ret = (text *) palloc(VARHDRSZ + (p - s));
1055  SET_VARSIZE(ret, VARHDRSZ + (p - s));
1056  memcpy(VARDATA(ret), s, (p - s));
1057 
1058  if (slice != (text *) DatumGetPointer(str))
1059  pfree(slice);
1060 
1061  return ret;
1062  }
1063  else
1064  elog(ERROR, "invalid backend encoding: encoding max length < 1");
1065 
1066  /* not reached: suppress compiler warning */
1067  return NULL;
1068 }
1069 
1070 /*
1071  * textoverlay
1072  * Replace specified substring of first string with second
1073  *
1074  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1075  * This code is a direct implementation of what the standard says.
1076  */
1077 Datum
1079 {
1080  text *t1 = PG_GETARG_TEXT_PP(0);
1081  text *t2 = PG_GETARG_TEXT_PP(1);
1082  int sp = PG_GETARG_INT32(2); /* substring start position */
1083  int sl = PG_GETARG_INT32(3); /* substring length */
1084 
1085  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1086 }
1087 
1088 Datum
1090 {
1091  text *t1 = PG_GETARG_TEXT_PP(0);
1092  text *t2 = PG_GETARG_TEXT_PP(1);
1093  int sp = PG_GETARG_INT32(2); /* substring start position */
1094  int sl;
1095 
1096  sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1097  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1098 }
1099 
1100 static text *
1101 text_overlay(text *t1, text *t2, int sp, int sl)
1102 {
1103  text *result;
1104  text *s1;
1105  text *s2;
1106  int sp_pl_sl;
1107 
1108  /*
1109  * Check for possible integer-overflow cases. For negative sp, throw a
1110  * "substring length" error because that's what should be expected
1111  * according to the spec's definition of OVERLAY().
1112  */
1113  if (sp <= 0)
1114  ereport(ERROR,
1115  (errcode(ERRCODE_SUBSTRING_ERROR),
1116  errmsg("negative substring length not allowed")));
1117  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1118  ereport(ERROR,
1119  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1120  errmsg("integer out of range")));
1121 
1122  s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1123  s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1124  result = text_catenate(s1, t2);
1125  result = text_catenate(result, s2);
1126 
1127  return result;
1128 }
1129 
1130 /*
1131  * textpos -
1132  * Return the position of the specified substring.
1133  * Implements the SQL POSITION() function.
1134  * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1135  * - thomas 1997-07-27
1136  */
1137 Datum
1139 {
1140  text *str = PG_GETARG_TEXT_PP(0);
1141  text *search_str = PG_GETARG_TEXT_PP(1);
1142 
1143  PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1144 }
1145 
1146 /*
1147  * text_position -
1148  * Does the real work for textpos()
1149  *
1150  * Inputs:
1151  * t1 - string to be searched
1152  * t2 - pattern to match within t1
1153  * Result:
1154  * Character index of the first matched char, starting from 1,
1155  * or 0 if no match.
1156  *
1157  * This is broken out so it can be called directly by other string processing
1158  * functions.
1159  */
1160 static int
1161 text_position(text *t1, text *t2, Oid collid)
1162 {
1164  int result;
1165 
1166  /* Empty needle always matches at position 1 */
1167  if (VARSIZE_ANY_EXHDR(t2) < 1)
1168  return 1;
1169 
1170  /* Otherwise, can't match if haystack is shorter than needle */
1171  if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1172  return 0;
1173 
1174  text_position_setup(t1, t2, collid, &state);
1175  if (!text_position_next(&state))
1176  result = 0;
1177  else
1178  result = text_position_get_match_pos(&state);
1179  text_position_cleanup(&state);
1180  return result;
1181 }
1182 
1183 
1184 /*
1185  * text_position_setup, text_position_next, text_position_cleanup -
1186  * Component steps of text_position()
1187  *
1188  * These are broken out so that a string can be efficiently searched for
1189  * multiple occurrences of the same pattern. text_position_next may be
1190  * called multiple times, and it advances to the next match on each call.
1191  * text_position_get_match_ptr() and text_position_get_match_pos() return
1192  * a pointer or 1-based character position of the last match, respectively.
1193  *
1194  * The "state" variable is normally just a local variable in the caller.
1195  *
1196  * NOTE: text_position_next skips over the matched portion. For example,
1197  * searching for "xx" in "xxx" returns only one match, not two.
1198  */
1199 
1200 static void
1202 {
1203  int len1 = VARSIZE_ANY_EXHDR(t1);
1204  int len2 = VARSIZE_ANY_EXHDR(t2);
1205  pg_locale_t mylocale = 0;
1206 
1207  check_collation_set(collid);
1208 
1209  if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1210  mylocale = pg_newlocale_from_collation(collid);
1211 
1212  if (mylocale && !mylocale->deterministic)
1213  ereport(ERROR,
1214  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1215  errmsg("nondeterministic collations are not supported for substring searches")));
1216 
1217  Assert(len1 > 0);
1218  Assert(len2 > 0);
1219 
1220  /*
1221  * Even with a multi-byte encoding, we perform the search using the raw
1222  * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1223  * because in UTF-8 the byte sequence of one character cannot contain
1224  * another character. For other multi-byte encodings, we do the search
1225  * initially as a simple byte search, ignoring multibyte issues, but
1226  * verify afterwards that the match we found is at a character boundary,
1227  * and continue the search if it was a false match.
1228  */
1230  {
1231  state->is_multibyte = false;
1232  state->is_multibyte_char_in_char = false;
1233  }
1234  else if (GetDatabaseEncoding() == PG_UTF8)
1235  {
1236  state->is_multibyte = true;
1237  state->is_multibyte_char_in_char = false;
1238  }
1239  else
1240  {
1241  state->is_multibyte = true;
1242  state->is_multibyte_char_in_char = true;
1243  }
1244 
1245  state->str1 = VARDATA_ANY(t1);
1246  state->str2 = VARDATA_ANY(t2);
1247  state->len1 = len1;
1248  state->len2 = len2;
1249  state->last_match = NULL;
1250  state->refpoint = state->str1;
1251  state->refpos = 0;
1252 
1253  /*
1254  * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1255  * notes we use the terminology that the "haystack" is the string to be
1256  * searched (t1) and the "needle" is the pattern being sought (t2).
1257  *
1258  * If the needle is empty or bigger than the haystack then there is no
1259  * point in wasting cycles initializing the table. We also choose not to
1260  * use B-M-H for needles of length 1, since the skip table can't possibly
1261  * save anything in that case.
1262  */
1263  if (len1 >= len2 && len2 > 1)
1264  {
1265  int searchlength = len1 - len2;
1266  int skiptablemask;
1267  int last;
1268  int i;
1269  const char *str2 = state->str2;
1270 
1271  /*
1272  * First we must determine how much of the skip table to use. The
1273  * declaration of TextPositionState allows up to 256 elements, but for
1274  * short search problems we don't really want to have to initialize so
1275  * many elements --- it would take too long in comparison to the
1276  * actual search time. So we choose a useful skip table size based on
1277  * the haystack length minus the needle length. The closer the needle
1278  * length is to the haystack length the less useful skipping becomes.
1279  *
1280  * Note: since we use bit-masking to select table elements, the skip
1281  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1282  */
1283  if (searchlength < 16)
1284  skiptablemask = 3;
1285  else if (searchlength < 64)
1286  skiptablemask = 7;
1287  else if (searchlength < 128)
1288  skiptablemask = 15;
1289  else if (searchlength < 512)
1290  skiptablemask = 31;
1291  else if (searchlength < 2048)
1292  skiptablemask = 63;
1293  else if (searchlength < 4096)
1294  skiptablemask = 127;
1295  else
1296  skiptablemask = 255;
1297  state->skiptablemask = skiptablemask;
1298 
1299  /*
1300  * Initialize the skip table. We set all elements to the needle
1301  * length, since this is the correct skip distance for any character
1302  * not found in the needle.
1303  */
1304  for (i = 0; i <= skiptablemask; i++)
1305  state->skiptable[i] = len2;
1306 
1307  /*
1308  * Now examine the needle. For each character except the last one,
1309  * set the corresponding table element to the appropriate skip
1310  * distance. Note that when two characters share the same skip table
1311  * entry, the one later in the needle must determine the skip
1312  * distance.
1313  */
1314  last = len2 - 1;
1315 
1316  for (i = 0; i < last; i++)
1317  state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1318  }
1319 }
1320 
1321 /*
1322  * Advance to the next match, starting from the end of the previous match
1323  * (or the beginning of the string, on first call). Returns true if a match
1324  * is found.
1325  *
1326  * Note that this refuses to match an empty-string needle. Most callers
1327  * will have handled that case specially and we'll never see it here.
1328  */
1329 static bool
1331 {
1332  int needle_len = state->len2;
1333  char *start_ptr;
1334  char *matchptr;
1335 
1336  if (needle_len <= 0)
1337  return false; /* result for empty pattern */
1338 
1339  /* Start from the point right after the previous match. */
1340  if (state->last_match)
1341  start_ptr = state->last_match + needle_len;
1342  else
1343  start_ptr = state->str1;
1344 
1345 retry:
1346  matchptr = text_position_next_internal(start_ptr, state);
1347 
1348  if (!matchptr)
1349  return false;
1350 
1351  /*
1352  * Found a match for the byte sequence. If this is a multibyte encoding,
1353  * where one character's byte sequence can appear inside a longer
1354  * multi-byte character, we need to verify that the match was at a
1355  * character boundary, not in the middle of a multi-byte character.
1356  */
1357  if (state->is_multibyte_char_in_char)
1358  {
1359  /* Walk one character at a time, until we reach the match. */
1360 
1361  /* the search should never move backwards. */
1362  Assert(state->refpoint <= matchptr);
1363 
1364  while (state->refpoint < matchptr)
1365  {
1366  /* step to next character. */
1367  state->refpoint += pg_mblen(state->refpoint);
1368  state->refpos++;
1369 
1370  /*
1371  * If we stepped over the match's start position, then it was a
1372  * false positive, where the byte sequence appeared in the middle
1373  * of a multi-byte character. Skip it, and continue the search at
1374  * the next character boundary.
1375  */
1376  if (state->refpoint > matchptr)
1377  {
1378  start_ptr = state->refpoint;
1379  goto retry;
1380  }
1381  }
1382  }
1383 
1384  state->last_match = matchptr;
1385  return true;
1386 }
1387 
1388 /*
1389  * Subroutine of text_position_next(). This searches for the raw byte
1390  * sequence, ignoring any multi-byte encoding issues. Returns the first
1391  * match starting at 'start_ptr', or NULL if no match is found.
1392  */
1393 static char *
1395 {
1396  int haystack_len = state->len1;
1397  int needle_len = state->len2;
1398  int skiptablemask = state->skiptablemask;
1399  const char *haystack = state->str1;
1400  const char *needle = state->str2;
1401  const char *haystack_end = &haystack[haystack_len];
1402  const char *hptr;
1403 
1404  Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1405 
1406  if (needle_len == 1)
1407  {
1408  /* No point in using B-M-H for a one-character needle */
1409  char nchar = *needle;
1410 
1411  hptr = start_ptr;
1412  while (hptr < haystack_end)
1413  {
1414  if (*hptr == nchar)
1415  return (char *) hptr;
1416  hptr++;
1417  }
1418  }
1419  else
1420  {
1421  const char *needle_last = &needle[needle_len - 1];
1422 
1423  /* Start at startpos plus the length of the needle */
1424  hptr = start_ptr + needle_len - 1;
1425  while (hptr < haystack_end)
1426  {
1427  /* Match the needle scanning *backward* */
1428  const char *nptr;
1429  const char *p;
1430 
1431  nptr = needle_last;
1432  p = hptr;
1433  while (*nptr == *p)
1434  {
1435  /* Matched it all? If so, return 1-based position */
1436  if (nptr == needle)
1437  return (char *) p;
1438  nptr--, p--;
1439  }
1440 
1441  /*
1442  * No match, so use the haystack char at hptr to decide how far to
1443  * advance. If the needle had any occurrence of that character
1444  * (or more precisely, one sharing the same skiptable entry)
1445  * before its last character, then we advance far enough to align
1446  * the last such needle character with that haystack position.
1447  * Otherwise we can advance by the whole needle length.
1448  */
1449  hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1450  }
1451  }
1452 
1453  return 0; /* not found */
1454 }
1455 
1456 /*
1457  * Return a pointer to the current match.
1458  *
1459  * The returned pointer points into the original haystack string.
1460  */
1461 static char *
1463 {
1464  return state->last_match;
1465 }
1466 
1467 /*
1468  * Return the offset of the current match.
1469  *
1470  * The offset is in characters, 1-based.
1471  */
1472 static int
1474 {
1475  if (!state->is_multibyte)
1476  return state->last_match - state->str1 + 1;
1477  else
1478  {
1479  /* Convert the byte position to char position. */
1480  while (state->refpoint < state->last_match)
1481  {
1482  state->refpoint += pg_mblen(state->refpoint);
1483  state->refpos++;
1484  }
1485  Assert(state->refpoint == state->last_match);
1486  return state->refpos + 1;
1487  }
1488 }
1489 
1490 /*
1491  * Reset search state to the initial state installed by text_position_setup.
1492  *
1493  * The next call to text_position_next will search from the beginning
1494  * of the string.
1495  */
1496 static void
1498 {
1499  state->last_match = NULL;
1500  state->refpoint = state->str1;
1501  state->refpos = 0;
1502 }
1503 
1504 static void
1506 {
1507  /* no cleanup needed */
1508 }
1509 
1510 
1511 static void
1513 {
1514  if (!OidIsValid(collid))
1515  {
1516  /*
1517  * This typically means that the parser could not resolve a conflict
1518  * of implicit collations, so report it that way.
1519  */
1520  ereport(ERROR,
1521  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1522  errmsg("could not determine which collation to use for string comparison"),
1523  errhint("Use the COLLATE clause to set the collation explicitly.")));
1524  }
1525 }
1526 
1527 /* varstr_cmp()
1528  * Comparison function for text strings with given lengths.
1529  * Includes locale support, but must copy strings to temporary memory
1530  * to allow null-termination for inputs to strcoll().
1531  * Returns an integer less than, equal to, or greater than zero, indicating
1532  * whether arg1 is less than, equal to, or greater than arg2.
1533  *
1534  * Note: many functions that depend on this are marked leakproof; therefore,
1535  * avoid reporting the actual contents of the input when throwing errors.
1536  * All errors herein should be things that can't happen except on corrupt
1537  * data, anyway; otherwise we will have trouble with indexing strings that
1538  * would cause them.
1539  */
1540 int
1541 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1542 {
1543  int result;
1544 
1545  check_collation_set(collid);
1546 
1547  /*
1548  * Unfortunately, there is no strncoll(), so in the non-C locale case we
1549  * have to do some memory copying. This turns out to be significantly
1550  * slower, so we optimize the case where LC_COLLATE is C. We also try to
1551  * optimize relatively-short strings by avoiding palloc/pfree overhead.
1552  */
1553  if (lc_collate_is_c(collid))
1554  {
1555  result = memcmp(arg1, arg2, Min(len1, len2));
1556  if ((result == 0) && (len1 != len2))
1557  result = (len1 < len2) ? -1 : 1;
1558  }
1559  else
1560  {
1561  char a1buf[TEXTBUFLEN];
1562  char a2buf[TEXTBUFLEN];
1563  char *a1p,
1564  *a2p;
1565  pg_locale_t mylocale = 0;
1566 
1567  if (collid != DEFAULT_COLLATION_OID)
1568  mylocale = pg_newlocale_from_collation(collid);
1569 
1570  /*
1571  * memcmp() can't tell us which of two unequal strings sorts first,
1572  * but it's a cheap way to tell if they're equal. Testing shows that
1573  * memcmp() followed by strcoll() is only trivially slower than
1574  * strcoll() by itself, so we don't lose much if this doesn't work out
1575  * very often, and if it does - for example, because there are many
1576  * equal strings in the input - then we win big by avoiding expensive
1577  * collation-aware comparisons.
1578  */
1579  if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1580  return 0;
1581 
1582 #ifdef WIN32
1583  /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1584  if (GetDatabaseEncoding() == PG_UTF8
1585  && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
1586  {
1587  int a1len;
1588  int a2len;
1589  int r;
1590 
1591  if (len1 >= TEXTBUFLEN / 2)
1592  {
1593  a1len = len1 * 2 + 2;
1594  a1p = palloc(a1len);
1595  }
1596  else
1597  {
1598  a1len = TEXTBUFLEN;
1599  a1p = a1buf;
1600  }
1601  if (len2 >= TEXTBUFLEN / 2)
1602  {
1603  a2len = len2 * 2 + 2;
1604  a2p = palloc(a2len);
1605  }
1606  else
1607  {
1608  a2len = TEXTBUFLEN;
1609  a2p = a2buf;
1610  }
1611 
1612  /* stupid Microsloth API does not work for zero-length input */
1613  if (len1 == 0)
1614  r = 0;
1615  else
1616  {
1617  r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1618  (LPWSTR) a1p, a1len / 2);
1619  if (!r)
1620  ereport(ERROR,
1621  (errmsg("could not convert string to UTF-16: error code %lu",
1622  GetLastError())));
1623  }
1624  ((LPWSTR) a1p)[r] = 0;
1625 
1626  if (len2 == 0)
1627  r = 0;
1628  else
1629  {
1630  r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1631  (LPWSTR) a2p, a2len / 2);
1632  if (!r)
1633  ereport(ERROR,
1634  (errmsg("could not convert string to UTF-16: error code %lu",
1635  GetLastError())));
1636  }
1637  ((LPWSTR) a2p)[r] = 0;
1638 
1639  errno = 0;
1640 #ifdef HAVE_LOCALE_T
1641  if (mylocale)
1642  result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
1643  else
1644 #endif
1645  result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1646  if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1647  * headers */
1648  ereport(ERROR,
1649  (errmsg("could not compare Unicode strings: %m")));
1650 
1651  /* Break tie if necessary. */
1652  if (result == 0 &&
1653  (!mylocale || mylocale->deterministic))
1654  {
1655  result = memcmp(arg1, arg2, Min(len1, len2));
1656  if ((result == 0) && (len1 != len2))
1657  result = (len1 < len2) ? -1 : 1;
1658  }
1659 
1660  if (a1p != a1buf)
1661  pfree(a1p);
1662  if (a2p != a2buf)
1663  pfree(a2p);
1664 
1665  return result;
1666  }
1667 #endif /* WIN32 */
1668 
1669  if (len1 >= TEXTBUFLEN)
1670  a1p = (char *) palloc(len1 + 1);
1671  else
1672  a1p = a1buf;
1673  if (len2 >= TEXTBUFLEN)
1674  a2p = (char *) palloc(len2 + 1);
1675  else
1676  a2p = a2buf;
1677 
1678  memcpy(a1p, arg1, len1);
1679  a1p[len1] = '\0';
1680  memcpy(a2p, arg2, len2);
1681  a2p[len2] = '\0';
1682 
1683  if (mylocale)
1684  {
1685  if (mylocale->provider == COLLPROVIDER_ICU)
1686  {
1687 #ifdef USE_ICU
1688 #ifdef HAVE_UCOL_STRCOLLUTF8
1689  if (GetDatabaseEncoding() == PG_UTF8)
1690  {
1691  UErrorCode status;
1692 
1693  status = U_ZERO_ERROR;
1694  result = ucol_strcollUTF8(mylocale->info.icu.ucol,
1695  arg1, len1,
1696  arg2, len2,
1697  &status);
1698  if (U_FAILURE(status))
1699  ereport(ERROR,
1700  (errmsg("collation failed: %s", u_errorName(status))));
1701  }
1702  else
1703 #endif
1704  {
1705  int32_t ulen1,
1706  ulen2;
1707  UChar *uchar1,
1708  *uchar2;
1709 
1710  ulen1 = icu_to_uchar(&uchar1, arg1, len1);
1711  ulen2 = icu_to_uchar(&uchar2, arg2, len2);
1712 
1713  result = ucol_strcoll(mylocale->info.icu.ucol,
1714  uchar1, ulen1,
1715  uchar2, ulen2);
1716 
1717  pfree(uchar1);
1718  pfree(uchar2);
1719  }
1720 #else /* not USE_ICU */
1721  /* shouldn't happen */
1722  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1723 #endif /* not USE_ICU */
1724  }
1725  else
1726  {
1727 #ifdef HAVE_LOCALE_T
1728  result = strcoll_l(a1p, a2p, mylocale->info.lt);
1729 #else
1730  /* shouldn't happen */
1731  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1732 #endif
1733  }
1734  }
1735  else
1736  result = strcoll(a1p, a2p);
1737 
1738  /* Break tie if necessary. */
1739  if (result == 0 &&
1740  (!mylocale || mylocale->deterministic))
1741  result = strcmp(a1p, a2p);
1742 
1743  if (a1p != a1buf)
1744  pfree(a1p);
1745  if (a2p != a2buf)
1746  pfree(a2p);
1747  }
1748 
1749  return result;
1750 }
1751 
1752 /* text_cmp()
1753  * Internal comparison function for text strings.
1754  * Returns -1, 0 or 1
1755  */
1756 static int
1757 text_cmp(text *arg1, text *arg2, Oid collid)
1758 {
1759  char *a1p,
1760  *a2p;
1761  int len1,
1762  len2;
1763 
1764  a1p = VARDATA_ANY(arg1);
1765  a2p = VARDATA_ANY(arg2);
1766 
1767  len1 = VARSIZE_ANY_EXHDR(arg1);
1768  len2 = VARSIZE_ANY_EXHDR(arg2);
1769 
1770  return varstr_cmp(a1p, len1, a2p, len2, collid);
1771 }
1772 
1773 /*
1774  * Comparison functions for text strings.
1775  *
1776  * Note: btree indexes need these routines not to leak memory; therefore,
1777  * be careful to free working copies of toasted datums. Most places don't
1778  * need to be so careful.
1779  */
1780 
1781 Datum
1783 {
1784  Oid collid = PG_GET_COLLATION();
1785  bool result;
1786 
1787  check_collation_set(collid);
1788 
1789  if (lc_collate_is_c(collid) ||
1790  collid == DEFAULT_COLLATION_OID ||
1791  pg_newlocale_from_collation(collid)->deterministic)
1792  {
1793  Datum arg1 = PG_GETARG_DATUM(0);
1794  Datum arg2 = PG_GETARG_DATUM(1);
1795  Size len1,
1796  len2;
1797 
1798  /*
1799  * Since we only care about equality or not-equality, we can avoid all
1800  * the expense of strcoll() here, and just do bitwise comparison. In
1801  * fact, we don't even have to do a bitwise comparison if we can show
1802  * the lengths of the strings are unequal; which might save us from
1803  * having to detoast one or both values.
1804  */
1805  len1 = toast_raw_datum_size(arg1);
1806  len2 = toast_raw_datum_size(arg2);
1807  if (len1 != len2)
1808  result = false;
1809  else
1810  {
1811  text *targ1 = DatumGetTextPP(arg1);
1812  text *targ2 = DatumGetTextPP(arg2);
1813 
1814  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1815  len1 - VARHDRSZ) == 0);
1816 
1817  PG_FREE_IF_COPY(targ1, 0);
1818  PG_FREE_IF_COPY(targ2, 1);
1819  }
1820  }
1821  else
1822  {
1823  text *arg1 = PG_GETARG_TEXT_PP(0);
1824  text *arg2 = PG_GETARG_TEXT_PP(1);
1825 
1826  result = (text_cmp(arg1, arg2, collid) == 0);
1827 
1828  PG_FREE_IF_COPY(arg1, 0);
1829  PG_FREE_IF_COPY(arg2, 1);
1830  }
1831 
1832  PG_RETURN_BOOL(result);
1833 }
1834 
1835 Datum
1837 {
1838  Oid collid = PG_GET_COLLATION();
1839  bool result;
1840 
1841  check_collation_set(collid);
1842 
1843  if (lc_collate_is_c(collid) ||
1844  collid == DEFAULT_COLLATION_OID ||
1845  pg_newlocale_from_collation(collid)->deterministic)
1846  {
1847  Datum arg1 = PG_GETARG_DATUM(0);
1848  Datum arg2 = PG_GETARG_DATUM(1);
1849  Size len1,
1850  len2;
1851 
1852  /* See comment in texteq() */
1853  len1 = toast_raw_datum_size(arg1);
1854  len2 = toast_raw_datum_size(arg2);
1855  if (len1 != len2)
1856  result = true;
1857  else
1858  {
1859  text *targ1 = DatumGetTextPP(arg1);
1860  text *targ2 = DatumGetTextPP(arg2);
1861 
1862  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1863  len1 - VARHDRSZ) != 0);
1864 
1865  PG_FREE_IF_COPY(targ1, 0);
1866  PG_FREE_IF_COPY(targ2, 1);
1867  }
1868  }
1869  else
1870  {
1871  text *arg1 = PG_GETARG_TEXT_PP(0);
1872  text *arg2 = PG_GETARG_TEXT_PP(1);
1873 
1874  result = (text_cmp(arg1, arg2, collid) != 0);
1875 
1876  PG_FREE_IF_COPY(arg1, 0);
1877  PG_FREE_IF_COPY(arg2, 1);
1878  }
1879 
1880  PG_RETURN_BOOL(result);
1881 }
1882 
1883 Datum
1885 {
1886  text *arg1 = PG_GETARG_TEXT_PP(0);
1887  text *arg2 = PG_GETARG_TEXT_PP(1);
1888  bool result;
1889 
1890  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1891 
1892  PG_FREE_IF_COPY(arg1, 0);
1893  PG_FREE_IF_COPY(arg2, 1);
1894 
1895  PG_RETURN_BOOL(result);
1896 }
1897 
1898 Datum
1900 {
1901  text *arg1 = PG_GETARG_TEXT_PP(0);
1902  text *arg2 = PG_GETARG_TEXT_PP(1);
1903  bool result;
1904 
1905  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1906 
1907  PG_FREE_IF_COPY(arg1, 0);
1908  PG_FREE_IF_COPY(arg2, 1);
1909 
1910  PG_RETURN_BOOL(result);
1911 }
1912 
1913 Datum
1915 {
1916  text *arg1 = PG_GETARG_TEXT_PP(0);
1917  text *arg2 = PG_GETARG_TEXT_PP(1);
1918  bool result;
1919 
1920  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1921 
1922  PG_FREE_IF_COPY(arg1, 0);
1923  PG_FREE_IF_COPY(arg2, 1);
1924 
1925  PG_RETURN_BOOL(result);
1926 }
1927 
1928 Datum
1930 {
1931  text *arg1 = PG_GETARG_TEXT_PP(0);
1932  text *arg2 = PG_GETARG_TEXT_PP(1);
1933  bool result;
1934 
1935  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1936 
1937  PG_FREE_IF_COPY(arg1, 0);
1938  PG_FREE_IF_COPY(arg2, 1);
1939 
1940  PG_RETURN_BOOL(result);
1941 }
1942 
1943 Datum
1945 {
1946  Datum arg1 = PG_GETARG_DATUM(0);
1947  Datum arg2 = PG_GETARG_DATUM(1);
1948  Oid collid = PG_GET_COLLATION();
1949  pg_locale_t mylocale = 0;
1950  bool result;
1951  Size len1,
1952  len2;
1953 
1954  check_collation_set(collid);
1955 
1956  if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1957  mylocale = pg_newlocale_from_collation(collid);
1958 
1959  if (mylocale && !mylocale->deterministic)
1960  ereport(ERROR,
1961  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1962  errmsg("nondeterministic collations are not supported for substring searches")));
1963 
1964  len1 = toast_raw_datum_size(arg1);
1965  len2 = toast_raw_datum_size(arg2);
1966  if (len2 > len1)
1967  result = false;
1968  else
1969  {
1970  text *targ1 = text_substring(arg1, 1, len2, false);
1971  text *targ2 = DatumGetTextPP(arg2);
1972 
1973  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1974  VARSIZE_ANY_EXHDR(targ2)) == 0);
1975 
1976  PG_FREE_IF_COPY(targ1, 0);
1977  PG_FREE_IF_COPY(targ2, 1);
1978  }
1979 
1980  PG_RETURN_BOOL(result);
1981 }
1982 
1983 Datum
1985 {
1986  text *arg1 = PG_GETARG_TEXT_PP(0);
1987  text *arg2 = PG_GETARG_TEXT_PP(1);
1988  int32 result;
1989 
1990  result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1991 
1992  PG_FREE_IF_COPY(arg1, 0);
1993  PG_FREE_IF_COPY(arg2, 1);
1994 
1995  PG_RETURN_INT32(result);
1996 }
1997 
1998 Datum
2000 {
2002  Oid collid = ssup->ssup_collation;
2003  MemoryContext oldcontext;
2004 
2005  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2006 
2007  /* Use generic string SortSupport */
2008  varstr_sortsupport(ssup, TEXTOID, collid);
2009 
2010  MemoryContextSwitchTo(oldcontext);
2011 
2012  PG_RETURN_VOID();
2013 }
2014 
2015 /*
2016  * Generic sortsupport interface for character type's operator classes.
2017  * Includes locale support, and support for BpChar semantics (i.e. removing
2018  * trailing spaces before comparison).
2019  *
2020  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
2021  * same representation. Callers that always use the C collation (e.g.
2022  * non-collatable type callers like bytea) may have NUL bytes in their strings;
2023  * this will not work with any other collation, though.
2024  */
2025 void
2027 {
2028  bool abbreviate = ssup->abbreviate;
2029  bool collate_c = false;
2030  VarStringSortSupport *sss;
2031  pg_locale_t locale = 0;
2032 
2033  check_collation_set(collid);
2034 
2035  /*
2036  * If possible, set ssup->comparator to a function which can be used to
2037  * directly compare two datums. If we can do this, we'll avoid the
2038  * overhead of a trip through the fmgr layer for every comparison, which
2039  * can be substantial.
2040  *
2041  * Most typically, we'll set the comparator to varlenafastcmp_locale,
2042  * which uses strcoll() to perform comparisons. We use that for the
2043  * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2044  * LC_COLLATE = C, we can make things quite a bit faster with
2045  * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2046  * memcmp() rather than strcoll().
2047  */
2048  if (lc_collate_is_c(collid))
2049  {
2050  if (typid == BPCHAROID)
2051  ssup->comparator = bpcharfastcmp_c;
2052  else if (typid == NAMEOID)
2053  {
2054  ssup->comparator = namefastcmp_c;
2055  /* Not supporting abbreviation with type NAME, for now */
2056  abbreviate = false;
2057  }
2058  else
2059  ssup->comparator = varstrfastcmp_c;
2060 
2061  collate_c = true;
2062  }
2063  else
2064  {
2065  /*
2066  * We need a collation-sensitive comparison. To make things faster,
2067  * we'll figure out the collation based on the locale id and cache the
2068  * result.
2069  */
2070  if (collid != DEFAULT_COLLATION_OID)
2071  locale = pg_newlocale_from_collation(collid);
2072 
2073  /*
2074  * There is a further exception on Windows. When the database
2075  * encoding is UTF-8 and we are not using the C collation, complex
2076  * hacks are required. We don't currently have a comparator that
2077  * handles that case, so we fall back on the slow method of having the
2078  * sort code invoke bttextcmp() (in the case of text) via the fmgr
2079  * trampoline. ICU locales work just the same on Windows, however.
2080  */
2081 #ifdef WIN32
2082  if (GetDatabaseEncoding() == PG_UTF8 &&
2083  !(locale && locale->provider == COLLPROVIDER_ICU))
2084  return;
2085 #endif
2086 
2087  /*
2088  * We use varlenafastcmp_locale except for type NAME.
2089  */
2090  if (typid == NAMEOID)
2091  {
2093  /* Not supporting abbreviation with type NAME, for now */
2094  abbreviate = false;
2095  }
2096  else
2098  }
2099 
2100  /*
2101  * Unfortunately, it seems that abbreviation for non-C collations is
2102  * broken on many common platforms; testing of multiple versions of glibc
2103  * reveals that, for many locales, strcoll() and strxfrm() do not return
2104  * consistent results, which is fatal to this optimization. While no
2105  * other libc other than Cygwin has so far been shown to have a problem,
2106  * we take the conservative course of action for right now and disable
2107  * this categorically. (Users who are certain this isn't a problem on
2108  * their system can define TRUST_STRXFRM.)
2109  *
2110  * Even apart from the risk of broken locales, it's possible that there
2111  * are platforms where the use of abbreviated keys should be disabled at
2112  * compile time. Having only 4 byte datums could make worst-case
2113  * performance drastically more likely, for example. Moreover, macOS's
2114  * strxfrm() implementation is known to not effectively concentrate a
2115  * significant amount of entropy from the original string in earlier
2116  * transformed blobs. It's possible that other supported platforms are
2117  * similarly encumbered. So, if we ever get past disabling this
2118  * categorically, we may still want or need to disable it for particular
2119  * platforms.
2120  */
2121 #ifndef TRUST_STRXFRM
2122  if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2123  abbreviate = false;
2124 #endif
2125 
2126  /*
2127  * If we're using abbreviated keys, or if we're using a locale-aware
2128  * comparison, we need to initialize a VarStringSortSupport object. Both
2129  * cases will make use of the temporary buffers we initialize here for
2130  * scratch space (and to detect requirement for BpChar semantics from
2131  * caller), and the abbreviation case requires additional state.
2132  */
2133  if (abbreviate || !collate_c)
2134  {
2135  sss = palloc(sizeof(VarStringSortSupport));
2136  sss->buf1 = palloc(TEXTBUFLEN);
2137  sss->buflen1 = TEXTBUFLEN;
2138  sss->buf2 = palloc(TEXTBUFLEN);
2139  sss->buflen2 = TEXTBUFLEN;
2140  /* Start with invalid values */
2141  sss->last_len1 = -1;
2142  sss->last_len2 = -1;
2143  /* Initialize */
2144  sss->last_returned = 0;
2145  sss->locale = locale;
2146 
2147  /*
2148  * To avoid somehow confusing a strxfrm() blob and an original string,
2149  * constantly keep track of the variety of data that buf1 and buf2
2150  * currently contain.
2151  *
2152  * Comparisons may be interleaved with conversion calls. Frequently,
2153  * conversions and comparisons are batched into two distinct phases,
2154  * but the correctness of caching cannot hinge upon this. For
2155  * comparison caching, buffer state is only trusted if cache_blob is
2156  * found set to false, whereas strxfrm() caching only trusts the state
2157  * when cache_blob is found set to true.
2158  *
2159  * Arbitrarily initialize cache_blob to true.
2160  */
2161  sss->cache_blob = true;
2162  sss->collate_c = collate_c;
2163  sss->typid = typid;
2164  ssup->ssup_extra = sss;
2165 
2166  /*
2167  * If possible, plan to use the abbreviated keys optimization. The
2168  * core code may switch back to authoritative comparator should
2169  * abbreviation be aborted.
2170  */
2171  if (abbreviate)
2172  {
2173  sss->prop_card = 0.20;
2174  initHyperLogLog(&sss->abbr_card, 10);
2175  initHyperLogLog(&sss->full_card, 10);
2176  ssup->abbrev_full_comparator = ssup->comparator;
2177  ssup->comparator = varstrcmp_abbrev;
2180  }
2181  }
2182 }
2183 
2184 /*
2185  * sortsupport comparison func (for C locale case)
2186  */
2187 static int
2189 {
2190  VarString *arg1 = DatumGetVarStringPP(x);
2191  VarString *arg2 = DatumGetVarStringPP(y);
2192  char *a1p,
2193  *a2p;
2194  int len1,
2195  len2,
2196  result;
2197 
2198  a1p = VARDATA_ANY(arg1);
2199  a2p = VARDATA_ANY(arg2);
2200 
2201  len1 = VARSIZE_ANY_EXHDR(arg1);
2202  len2 = VARSIZE_ANY_EXHDR(arg2);
2203 
2204  result = memcmp(a1p, a2p, Min(len1, len2));
2205  if ((result == 0) && (len1 != len2))
2206  result = (len1 < len2) ? -1 : 1;
2207 
2208  /* We can't afford to leak memory here. */
2209  if (PointerGetDatum(arg1) != x)
2210  pfree(arg1);
2211  if (PointerGetDatum(arg2) != y)
2212  pfree(arg2);
2213 
2214  return result;
2215 }
2216 
2217 /*
2218  * sortsupport comparison func (for BpChar C locale case)
2219  *
2220  * BpChar outsources its sortsupport to this module. Specialization for the
2221  * varstr_sortsupport BpChar case, modeled on
2222  * internal_bpchar_pattern_compare().
2223  */
2224 static int
2226 {
2227  BpChar *arg1 = DatumGetBpCharPP(x);
2228  BpChar *arg2 = DatumGetBpCharPP(y);
2229  char *a1p,
2230  *a2p;
2231  int len1,
2232  len2,
2233  result;
2234 
2235  a1p = VARDATA_ANY(arg1);
2236  a2p = VARDATA_ANY(arg2);
2237 
2238  len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2239  len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2240 
2241  result = memcmp(a1p, a2p, Min(len1, len2));
2242  if ((result == 0) && (len1 != len2))
2243  result = (len1 < len2) ? -1 : 1;
2244 
2245  /* We can't afford to leak memory here. */
2246  if (PointerGetDatum(arg1) != x)
2247  pfree(arg1);
2248  if (PointerGetDatum(arg2) != y)
2249  pfree(arg2);
2250 
2251  return result;
2252 }
2253 
2254 /*
2255  * sortsupport comparison func (for NAME C locale case)
2256  */
2257 static int
2259 {
2260  Name arg1 = DatumGetName(x);
2261  Name arg2 = DatumGetName(y);
2262 
2263  return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2264 }
2265 
2266 /*
2267  * sortsupport comparison func (for locale case with all varlena types)
2268  */
2269 static int
2271 {
2272  VarString *arg1 = DatumGetVarStringPP(x);
2273  VarString *arg2 = DatumGetVarStringPP(y);
2274  char *a1p,
2275  *a2p;
2276  int len1,
2277  len2,
2278  result;
2279 
2280  a1p = VARDATA_ANY(arg1);
2281  a2p = VARDATA_ANY(arg2);
2282 
2283  len1 = VARSIZE_ANY_EXHDR(arg1);
2284  len2 = VARSIZE_ANY_EXHDR(arg2);
2285 
2286  result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2287 
2288  /* We can't afford to leak memory here. */
2289  if (PointerGetDatum(arg1) != x)
2290  pfree(arg1);
2291  if (PointerGetDatum(arg2) != y)
2292  pfree(arg2);
2293 
2294  return result;
2295 }
2296 
2297 /*
2298  * sortsupport comparison func (for locale case with NAME type)
2299  */
2300 static int
2302 {
2303  Name arg1 = DatumGetName(x);
2304  Name arg2 = DatumGetName(y);
2305 
2306  return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2307  NameStr(*arg2), strlen(NameStr(*arg2)),
2308  ssup);
2309 }
2310 
2311 /*
2312  * sortsupport comparison func for locale cases
2313  */
2314 static int
2315 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2316 {
2318  int result;
2319  bool arg1_match;
2320 
2321  /* Fast pre-check for equality, as discussed in varstr_cmp() */
2322  if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2323  {
2324  /*
2325  * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2326  * last_len2. Existing contents of buffers might still be used by
2327  * next call.
2328  *
2329  * It's fine to allow the comparison of BpChar padding bytes here,
2330  * even though that implies that the memcmp() will usually be
2331  * performed for BpChar callers (though multibyte characters could
2332  * still prevent that from occurring). The memcmp() is still very
2333  * cheap, and BpChar's funny semantics have us remove trailing spaces
2334  * (not limited to padding), so we need make no distinction between
2335  * padding space characters and "real" space characters.
2336  */
2337  return 0;
2338  }
2339 
2340  if (sss->typid == BPCHAROID)
2341  {
2342  /* Get true number of bytes, ignoring trailing spaces */
2343  len1 = bpchartruelen(a1p, len1);
2344  len2 = bpchartruelen(a2p, len2);
2345  }
2346 
2347  if (len1 >= sss->buflen1)
2348  {
2349  pfree(sss->buf1);
2350  sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2351  sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2352  }
2353  if (len2 >= sss->buflen2)
2354  {
2355  pfree(sss->buf2);
2356  sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2357  sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2358  }
2359 
2360  /*
2361  * We're likely to be asked to compare the same strings repeatedly, and
2362  * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2363  * comparisons, even though in general there is no reason to think that
2364  * that will work out (every string datum may be unique). Caching does
2365  * not slow things down measurably when it doesn't work out, and can speed
2366  * things up by rather a lot when it does. In part, this is because the
2367  * memcmp() compares data from cachelines that are needed in L1 cache even
2368  * when the last comparison's result cannot be reused.
2369  */
2370  arg1_match = true;
2371  if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2372  {
2373  arg1_match = false;
2374  memcpy(sss->buf1, a1p, len1);
2375  sss->buf1[len1] = '\0';
2376  sss->last_len1 = len1;
2377  }
2378 
2379  /*
2380  * If we're comparing the same two strings as last time, we can return the
2381  * same answer without calling strcoll() again. This is more likely than
2382  * it seems (at least with moderate to low cardinality sets), because
2383  * quicksort compares the same pivot against many values.
2384  */
2385  if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2386  {
2387  memcpy(sss->buf2, a2p, len2);
2388  sss->buf2[len2] = '\0';
2389  sss->last_len2 = len2;
2390  }
2391  else if (arg1_match && !sss->cache_blob)
2392  {
2393  /* Use result cached following last actual strcoll() call */
2394  return sss->last_returned;
2395  }
2396 
2397  if (sss->locale)
2398  {
2399  if (sss->locale->provider == COLLPROVIDER_ICU)
2400  {
2401 #ifdef USE_ICU
2402 #ifdef HAVE_UCOL_STRCOLLUTF8
2403  if (GetDatabaseEncoding() == PG_UTF8)
2404  {
2405  UErrorCode status;
2406 
2407  status = U_ZERO_ERROR;
2408  result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
2409  a1p, len1,
2410  a2p, len2,
2411  &status);
2412  if (U_FAILURE(status))
2413  ereport(ERROR,
2414  (errmsg("collation failed: %s", u_errorName(status))));
2415  }
2416  else
2417 #endif
2418  {
2419  int32_t ulen1,
2420  ulen2;
2421  UChar *uchar1,
2422  *uchar2;
2423 
2424  ulen1 = icu_to_uchar(&uchar1, a1p, len1);
2425  ulen2 = icu_to_uchar(&uchar2, a2p, len2);
2426 
2427  result = ucol_strcoll(sss->locale->info.icu.ucol,
2428  uchar1, ulen1,
2429  uchar2, ulen2);
2430 
2431  pfree(uchar1);
2432  pfree(uchar2);
2433  }
2434 #else /* not USE_ICU */
2435  /* shouldn't happen */
2436  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2437 #endif /* not USE_ICU */
2438  }
2439  else
2440  {
2441 #ifdef HAVE_LOCALE_T
2442  result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
2443 #else
2444  /* shouldn't happen */
2445  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2446 #endif
2447  }
2448  }
2449  else
2450  result = strcoll(sss->buf1, sss->buf2);
2451 
2452  /* Break tie if necessary. */
2453  if (result == 0 &&
2454  (!sss->locale || sss->locale->deterministic))
2455  result = strcmp(sss->buf1, sss->buf2);
2456 
2457  /* Cache result, perhaps saving an expensive strcoll() call next time */
2458  sss->cache_blob = false;
2459  sss->last_returned = result;
2460  return result;
2461 }
2462 
2463 /*
2464  * Abbreviated key comparison func
2465  */
2466 static int
2468 {
2469  /*
2470  * When 0 is returned, the core system will call varstrfastcmp_c()
2471  * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale(). Even a
2472  * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2473  * authoritatively, for the same reason that there is a strcoll()
2474  * tie-breaker call to strcmp() in varstr_cmp().
2475  */
2476  if (x > y)
2477  return 1;
2478  else if (x == y)
2479  return 0;
2480  else
2481  return -1;
2482 }
2483 
2484 /*
2485  * Conversion routine for sortsupport. Converts original to abbreviated key
2486  * representation. Our encoding strategy is simple -- pack the first 8 bytes
2487  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2488  * stored in reverse order), and treat it as an unsigned integer. When the "C"
2489  * locale is used, or in case of bytea, just memcpy() from original instead.
2490  */
2491 static Datum
2493 {
2495  VarString *authoritative = DatumGetVarStringPP(original);
2496  char *authoritative_data = VARDATA_ANY(authoritative);
2497 
2498  /* working state */
2499  Datum res;
2500  char *pres;
2501  int len;
2502  uint32 hash;
2503 
2504  pres = (char *) &res;
2505  /* memset(), so any non-overwritten bytes are NUL */
2506  memset(pres, 0, sizeof(Datum));
2507  len = VARSIZE_ANY_EXHDR(authoritative);
2508 
2509  /* Get number of bytes, ignoring trailing spaces */
2510  if (sss->typid == BPCHAROID)
2511  len = bpchartruelen(authoritative_data, len);
2512 
2513  /*
2514  * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2515  * abbreviate keys. The full comparator for the C locale is always
2516  * memcmp(). It would be incorrect to allow bytea callers (callers that
2517  * always force the C collation -- bytea isn't a collatable type, but this
2518  * approach is convenient) to use strxfrm(). This is because bytea
2519  * strings may contain NUL bytes. Besides, this should be faster, too.
2520  *
2521  * More generally, it's okay that bytea callers can have NUL bytes in
2522  * strings because varstrcmp_abbrev() need not make a distinction between
2523  * terminating NUL bytes, and NUL bytes representing actual NULs in the
2524  * authoritative representation. Hopefully a comparison at or past one
2525  * abbreviated key's terminating NUL byte will resolve the comparison
2526  * without consulting the authoritative representation; specifically, some
2527  * later non-NUL byte in the longer string can resolve the comparison
2528  * against a subsequent terminating NUL in the shorter string. There will
2529  * usually be what is effectively a "length-wise" resolution there and
2530  * then.
2531  *
2532  * If that doesn't work out -- if all bytes in the longer string
2533  * positioned at or past the offset of the smaller string's (first)
2534  * terminating NUL are actually representative of NUL bytes in the
2535  * authoritative binary string (perhaps with some *terminating* NUL bytes
2536  * towards the end of the longer string iff it happens to still be small)
2537  * -- then an authoritative tie-breaker will happen, and do the right
2538  * thing: explicitly consider string length.
2539  */
2540  if (sss->collate_c)
2541  memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2542  else
2543  {
2544  Size bsize;
2545 #ifdef USE_ICU
2546  int32_t ulen = -1;
2547  UChar *uchar = NULL;
2548 #endif
2549 
2550  /*
2551  * We're not using the C collation, so fall back on strxfrm or ICU
2552  * analogs.
2553  */
2554 
2555  /* By convention, we use buffer 1 to store and NUL-terminate */
2556  if (len >= sss->buflen1)
2557  {
2558  pfree(sss->buf1);
2559  sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2560  sss->buf1 = palloc(sss->buflen1);
2561  }
2562 
2563  /* Might be able to reuse strxfrm() blob from last call */
2564  if (sss->last_len1 == len && sss->cache_blob &&
2565  memcmp(sss->buf1, authoritative_data, len) == 0)
2566  {
2567  memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2568  /* No change affecting cardinality, so no hashing required */
2569  goto done;
2570  }
2571 
2572  memcpy(sss->buf1, authoritative_data, len);
2573 
2574  /*
2575  * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2576  * necessary for ICU, but doesn't hurt.
2577  */
2578  sss->buf1[len] = '\0';
2579  sss->last_len1 = len;
2580 
2581 #ifdef USE_ICU
2582  /* When using ICU and not UTF8, convert string to UChar. */
2583  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2585  ulen = icu_to_uchar(&uchar, sss->buf1, len);
2586 #endif
2587 
2588  /*
2589  * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2590  * and try again. Both of these functions have the result buffer
2591  * content undefined if the result did not fit, so we need to retry
2592  * until everything fits, even though we only need the first few bytes
2593  * in the end. When using ucol_nextSortKeyPart(), however, we only
2594  * ask for as many bytes as we actually need.
2595  */
2596  for (;;)
2597  {
2598 #ifdef USE_ICU
2599  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2600  {
2601  /*
2602  * When using UTF8, use the iteration interface so we only
2603  * need to produce as many bytes as we actually need.
2604  */
2605  if (GetDatabaseEncoding() == PG_UTF8)
2606  {
2607  UCharIterator iter;
2608  uint32_t state[2];
2609  UErrorCode status;
2610 
2611  uiter_setUTF8(&iter, sss->buf1, len);
2612  state[0] = state[1] = 0; /* won't need that again */
2613  status = U_ZERO_ERROR;
2614  bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2615  &iter,
2616  state,
2617  (uint8_t *) sss->buf2,
2618  Min(sizeof(Datum), sss->buflen2),
2619  &status);
2620  if (U_FAILURE(status))
2621  ereport(ERROR,
2622  (errmsg("sort key generation failed: %s",
2623  u_errorName(status))));
2624  }
2625  else
2626  bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2627  uchar, ulen,
2628  (uint8_t *) sss->buf2, sss->buflen2);
2629  }
2630  else
2631 #endif
2632 #ifdef HAVE_LOCALE_T
2633  if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2634  bsize = strxfrm_l(sss->buf2, sss->buf1,
2635  sss->buflen2, sss->locale->info.lt);
2636  else
2637 #endif
2638  bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2639 
2640  sss->last_len2 = bsize;
2641  if (bsize < sss->buflen2)
2642  break;
2643 
2644  /*
2645  * Grow buffer and retry.
2646  */
2647  pfree(sss->buf2);
2648  sss->buflen2 = Max(bsize + 1,
2649  Min(sss->buflen2 * 2, MaxAllocSize));
2650  sss->buf2 = palloc(sss->buflen2);
2651  }
2652 
2653  /*
2654  * Every Datum byte is always compared. This is safe because the
2655  * strxfrm() blob is itself NUL terminated, leaving no danger of
2656  * misinterpreting any NUL bytes not intended to be interpreted as
2657  * logically representing termination.
2658  *
2659  * (Actually, even if there were NUL bytes in the blob it would be
2660  * okay. See remarks on bytea case above.)
2661  */
2662  memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2663 
2664 #ifdef USE_ICU
2665  if (uchar)
2666  pfree(uchar);
2667 #endif
2668  }
2669 
2670  /*
2671  * Maintain approximate cardinality of both abbreviated keys and original,
2672  * authoritative keys using HyperLogLog. Used as cheap insurance against
2673  * the worst case, where we do many string transformations for no saving
2674  * in full strcoll()-based comparisons. These statistics are used by
2675  * varstr_abbrev_abort().
2676  *
2677  * First, Hash key proper, or a significant fraction of it. Mix in length
2678  * in order to compensate for cases where differences are past
2679  * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2680  */
2681  hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2682  Min(len, PG_CACHE_LINE_SIZE)));
2683 
2684  if (len > PG_CACHE_LINE_SIZE)
2685  hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2686 
2687  addHyperLogLog(&sss->full_card, hash);
2688 
2689  /* Hash abbreviated key */
2690 #if SIZEOF_DATUM == 8
2691  {
2692  uint32 lohalf,
2693  hihalf;
2694 
2695  lohalf = (uint32) res;
2696  hihalf = (uint32) (res >> 32);
2697  hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2698  }
2699 #else /* SIZEOF_DATUM != 8 */
2700  hash = DatumGetUInt32(hash_uint32((uint32) res));
2701 #endif
2702 
2703  addHyperLogLog(&sss->abbr_card, hash);
2704 
2705  /* Cache result, perhaps saving an expensive strxfrm() call next time */
2706  sss->cache_blob = true;
2707 done:
2708 
2709  /*
2710  * Byteswap on little-endian machines.
2711  *
2712  * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2713  * comparator) works correctly on all platforms. If we didn't do this,
2714  * the comparator would have to call memcmp() with a pair of pointers to
2715  * the first byte of each abbreviated key, which is slower.
2716  */
2717  res = DatumBigEndianToNative(res);
2718 
2719  /* Don't leak memory here */
2720  if (PointerGetDatum(authoritative) != original)
2721  pfree(authoritative);
2722 
2723  return res;
2724 }
2725 
2726 /*
2727  * Callback for estimating effectiveness of abbreviated key optimization, using
2728  * heuristic rules. Returns value indicating if the abbreviation optimization
2729  * should be aborted, based on its projected effectiveness.
2730  */
2731 static bool
2732 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2733 {
2735  double abbrev_distinct,
2736  key_distinct;
2737 
2738  Assert(ssup->abbreviate);
2739 
2740  /* Have a little patience */
2741  if (memtupcount < 100)
2742  return false;
2743 
2744  abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2745  key_distinct = estimateHyperLogLog(&sss->full_card);
2746 
2747  /*
2748  * Clamp cardinality estimates to at least one distinct value. While
2749  * NULLs are generally disregarded, if only NULL values were seen so far,
2750  * that might misrepresent costs if we failed to clamp.
2751  */
2752  if (abbrev_distinct <= 1.0)
2753  abbrev_distinct = 1.0;
2754 
2755  if (key_distinct <= 1.0)
2756  key_distinct = 1.0;
2757 
2758  /*
2759  * In the worst case all abbreviated keys are identical, while at the same
2760  * time there are differences within full key strings not captured in
2761  * abbreviations.
2762  */
2763 #ifdef TRACE_SORT
2764  if (trace_sort)
2765  {
2766  double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2767 
2768  elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2769  "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2770  memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2771  sss->prop_card);
2772  }
2773 #endif
2774 
2775  /*
2776  * If the number of distinct abbreviated keys approximately matches the
2777  * number of distinct authoritative original keys, that's reason enough to
2778  * proceed. We can win even with a very low cardinality set if most
2779  * tie-breakers only memcmp(). This is by far the most important
2780  * consideration.
2781  *
2782  * While comparisons that are resolved at the abbreviated key level are
2783  * considerably cheaper than tie-breakers resolved with memcmp(), both of
2784  * those two outcomes are so much cheaper than a full strcoll() once
2785  * sorting is underway that it doesn't seem worth it to weigh abbreviated
2786  * cardinality against the overall size of the set in order to more
2787  * accurately model costs. Assume that an abbreviated comparison, and an
2788  * abbreviated comparison with a cheap memcmp()-based authoritative
2789  * resolution are equivalent.
2790  */
2791  if (abbrev_distinct > key_distinct * sss->prop_card)
2792  {
2793  /*
2794  * When we have exceeded 10,000 tuples, decay required cardinality
2795  * aggressively for next call.
2796  *
2797  * This is useful because the number of comparisons required on
2798  * average increases at a linearithmic rate, and at roughly 10,000
2799  * tuples that factor will start to dominate over the linear costs of
2800  * string transformation (this is a conservative estimate). The decay
2801  * rate is chosen to be a little less aggressive than halving -- which
2802  * (since we're called at points at which memtupcount has doubled)
2803  * would never see the cost model actually abort past the first call
2804  * following a decay. This decay rate is mostly a precaution against
2805  * a sudden, violent swing in how well abbreviated cardinality tracks
2806  * full key cardinality. The decay also serves to prevent a marginal
2807  * case from being aborted too late, when too much has already been
2808  * invested in string transformation.
2809  *
2810  * It's possible for sets of several million distinct strings with
2811  * mere tens of thousands of distinct abbreviated keys to still
2812  * benefit very significantly. This will generally occur provided
2813  * each abbreviated key is a proxy for a roughly uniform number of the
2814  * set's full keys. If it isn't so, we hope to catch that early and
2815  * abort. If it isn't caught early, by the time the problem is
2816  * apparent it's probably not worth aborting.
2817  */
2818  if (memtupcount > 10000)
2819  sss->prop_card *= 0.65;
2820 
2821  return false;
2822  }
2823 
2824  /*
2825  * Abort abbreviation strategy.
2826  *
2827  * The worst case, where all abbreviated keys are identical while all
2828  * original strings differ will typically only see a regression of about
2829  * 10% in execution time for small to medium sized lists of strings.
2830  * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2831  * often expect very large improvements, particularly with sets of strings
2832  * of moderately high to high abbreviated cardinality. There is little to
2833  * lose but much to gain, which our strategy reflects.
2834  */
2835 #ifdef TRACE_SORT
2836  if (trace_sort)
2837  elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2838  "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2839  memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2840 #endif
2841 
2842  return true;
2843 }
2844 
2845 /*
2846  * Generic equalimage support function for character type's operator classes.
2847  * Disables the use of deduplication with nondeterministic collations.
2848  */
2849 Datum
2851 {
2852  /* Oid opcintype = PG_GETARG_OID(0); */
2853  Oid collid = PG_GET_COLLATION();
2854 
2855  check_collation_set(collid);
2856 
2857  if (lc_collate_is_c(collid) ||
2858  collid == DEFAULT_COLLATION_OID ||
2860  PG_RETURN_BOOL(true);
2861  else
2862  PG_RETURN_BOOL(false);
2863 }
2864 
2865 Datum
2867 {
2868  text *arg1 = PG_GETARG_TEXT_PP(0);
2869  text *arg2 = PG_GETARG_TEXT_PP(1);
2870  text *result;
2871 
2872  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2873 
2874  PG_RETURN_TEXT_P(result);
2875 }
2876 
2877 Datum
2879 {
2880  text *arg1 = PG_GETARG_TEXT_PP(0);
2881  text *arg2 = PG_GETARG_TEXT_PP(1);
2882  text *result;
2883 
2884  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2885 
2886  PG_RETURN_TEXT_P(result);
2887 }
2888 
2889 
2890 /*
2891  * Cross-type comparison functions for types text and name.
2892  */
2893 
2894 Datum
2896 {
2897  Name arg1 = PG_GETARG_NAME(0);
2898  text *arg2 = PG_GETARG_TEXT_PP(1);
2899  size_t len1 = strlen(NameStr(*arg1));
2900  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2901  Oid collid = PG_GET_COLLATION();
2902  bool result;
2903 
2904  check_collation_set(collid);
2905 
2906  if (collid == C_COLLATION_OID)
2907  result = (len1 == len2 &&
2908  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2909  else
2910  result = (varstr_cmp(NameStr(*arg1), len1,
2911  VARDATA_ANY(arg2), len2,
2912  collid) == 0);
2913 
2914  PG_FREE_IF_COPY(arg2, 1);
2915 
2916  PG_RETURN_BOOL(result);
2917 }
2918 
2919 Datum
2921 {
2922  text *arg1 = PG_GETARG_TEXT_PP(0);
2923  Name arg2 = PG_GETARG_NAME(1);
2924  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2925  size_t len2 = strlen(NameStr(*arg2));
2926  Oid collid = PG_GET_COLLATION();
2927  bool result;
2928 
2929  check_collation_set(collid);
2930 
2931  if (collid == C_COLLATION_OID)
2932  result = (len1 == len2 &&
2933  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2934  else
2935  result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2936  NameStr(*arg2), len2,
2937  collid) == 0);
2938 
2939  PG_FREE_IF_COPY(arg1, 0);
2940 
2941  PG_RETURN_BOOL(result);
2942 }
2943 
2944 Datum
2946 {
2947  Name arg1 = PG_GETARG_NAME(0);
2948  text *arg2 = PG_GETARG_TEXT_PP(1);
2949  size_t len1 = strlen(NameStr(*arg1));
2950  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2951  Oid collid = PG_GET_COLLATION();
2952  bool result;
2953 
2954  check_collation_set(collid);
2955 
2956  if (collid == C_COLLATION_OID)
2957  result = !(len1 == len2 &&
2958  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2959  else
2960  result = !(varstr_cmp(NameStr(*arg1), len1,
2961  VARDATA_ANY(arg2), len2,
2962  collid) == 0);
2963 
2964  PG_FREE_IF_COPY(arg2, 1);
2965 
2966  PG_RETURN_BOOL(result);
2967 }
2968 
2969 Datum
2971 {
2972  text *arg1 = PG_GETARG_TEXT_PP(0);
2973  Name arg2 = PG_GETARG_NAME(1);
2974  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2975  size_t len2 = strlen(NameStr(*arg2));
2976  Oid collid = PG_GET_COLLATION();
2977  bool result;
2978 
2979  check_collation_set(collid);
2980 
2981  if (collid == C_COLLATION_OID)
2982  result = !(len1 == len2 &&
2983  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2984  else
2985  result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2986  NameStr(*arg2), len2,
2987  collid) == 0);
2988 
2989  PG_FREE_IF_COPY(arg1, 0);
2990 
2991  PG_RETURN_BOOL(result);
2992 }
2993 
2994 Datum
2996 {
2997  Name arg1 = PG_GETARG_NAME(0);
2998  text *arg2 = PG_GETARG_TEXT_PP(1);
2999  int32 result;
3000 
3001  result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
3002  VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
3003  PG_GET_COLLATION());
3004 
3005  PG_FREE_IF_COPY(arg2, 1);
3006 
3007  PG_RETURN_INT32(result);
3008 }
3009 
3010 Datum
3012 {
3013  text *arg1 = PG_GETARG_TEXT_PP(0);
3014  Name arg2 = PG_GETARG_NAME(1);
3015  int32 result;
3016 
3017  result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
3018  NameStr(*arg2), strlen(NameStr(*arg2)),
3019  PG_GET_COLLATION());
3020 
3021  PG_FREE_IF_COPY(arg1, 0);
3022 
3023  PG_RETURN_INT32(result);
3024 }
3025 
3026 #define CmpCall(cmpfunc) \
3027  DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
3028  PG_GET_COLLATION(), \
3029  PG_GETARG_DATUM(0), \
3030  PG_GETARG_DATUM(1)))
3031 
3032 Datum
3034 {
3036 }
3037 
3038 Datum
3040 {
3042 }
3043 
3044 Datum
3046 {
3048 }
3049 
3050 Datum
3052 {
3054 }
3055 
3056 Datum
3058 {
3060 }
3061 
3062 Datum
3064 {
3066 }
3067 
3068 Datum
3070 {
3072 }
3073 
3074 Datum
3076 {
3078 }
3079 
3080 #undef CmpCall
3081 
3082 
3083 /*
3084  * The following operators support character-by-character comparison
3085  * of text datums, to allow building indexes suitable for LIKE clauses.
3086  * Note that the regular texteq/textne comparison operators, and regular
3087  * support functions 1 and 2 with "C" collation are assumed to be
3088  * compatible with these!
3089  */
3090 
3091 static int
3093 {
3094  int result;
3095  int len1,
3096  len2;
3097 
3098  len1 = VARSIZE_ANY_EXHDR(arg1);
3099  len2 = VARSIZE_ANY_EXHDR(arg2);
3100 
3101  result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3102  if (result != 0)
3103  return result;
3104  else if (len1 < len2)
3105  return -1;
3106  else if (len1 > len2)
3107  return 1;
3108  else
3109  return 0;
3110 }
3111 
3112 
3113 Datum
3115 {
3116  text *arg1 = PG_GETARG_TEXT_PP(0);
3117  text *arg2 = PG_GETARG_TEXT_PP(1);
3118  int result;
3119 
3120  result = internal_text_pattern_compare(arg1, arg2);
3121 
3122  PG_FREE_IF_COPY(arg1, 0);
3123  PG_FREE_IF_COPY(arg2, 1);
3124 
3125  PG_RETURN_BOOL(result < 0);
3126 }
3127 
3128 
3129 Datum
3131 {
3132  text *arg1 = PG_GETARG_TEXT_PP(0);
3133  text *arg2 = PG_GETARG_TEXT_PP(1);
3134  int result;
3135 
3136  result = internal_text_pattern_compare(arg1, arg2);
3137 
3138  PG_FREE_IF_COPY(arg1, 0);
3139  PG_FREE_IF_COPY(arg2, 1);
3140 
3141  PG_RETURN_BOOL(result <= 0);
3142 }
3143 
3144 
3145 Datum
3147 {
3148  text *arg1 = PG_GETARG_TEXT_PP(0);
3149  text *arg2 = PG_GETARG_TEXT_PP(1);
3150  int result;
3151 
3152  result = internal_text_pattern_compare(arg1, arg2);
3153 
3154  PG_FREE_IF_COPY(arg1, 0);
3155  PG_FREE_IF_COPY(arg2, 1);
3156 
3157  PG_RETURN_BOOL(result >= 0);
3158 }
3159 
3160 
3161 Datum
3163 {
3164  text *arg1 = PG_GETARG_TEXT_PP(0);
3165  text *arg2 = PG_GETARG_TEXT_PP(1);
3166  int result;
3167 
3168  result = internal_text_pattern_compare(arg1, arg2);
3169 
3170  PG_FREE_IF_COPY(arg1, 0);
3171  PG_FREE_IF_COPY(arg2, 1);
3172 
3173  PG_RETURN_BOOL(result > 0);
3174 }
3175 
3176 
3177 Datum
3179 {
3180  text *arg1 = PG_GETARG_TEXT_PP(0);
3181  text *arg2 = PG_GETARG_TEXT_PP(1);
3182  int result;
3183 
3184  result = internal_text_pattern_compare(arg1, arg2);
3185 
3186  PG_FREE_IF_COPY(arg1, 0);
3187  PG_FREE_IF_COPY(arg2, 1);
3188 
3189  PG_RETURN_INT32(result);
3190 }
3191 
3192 
3193 Datum
3195 {
3197  MemoryContext oldcontext;
3198 
3199  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3200 
3201  /* Use generic string SortSupport, forcing "C" collation */
3202  varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3203 
3204  MemoryContextSwitchTo(oldcontext);
3205 
3206  PG_RETURN_VOID();
3207 }
3208 
3209 
3210 /*-------------------------------------------------------------
3211  * byteaoctetlen
3212  *
3213  * get the number of bytes contained in an instance of type 'bytea'
3214  *-------------------------------------------------------------
3215  */
3216 Datum
3218 {
3219  Datum str = PG_GETARG_DATUM(0);
3220 
3221  /* We need not detoast the input at all */
3223 }
3224 
3225 /*
3226  * byteacat -
3227  * takes two bytea* and returns a bytea* that is the concatenation of
3228  * the two.
3229  *
3230  * Cloned from textcat and modified as required.
3231  */
3232 Datum
3234 {
3235  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3236  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3237 
3239 }
3240 
3241 /*
3242  * bytea_catenate
3243  * Guts of byteacat(), broken out so it can be used by other functions
3244  *
3245  * Arguments can be in short-header form, but not compressed or out-of-line
3246  */
3247 static bytea *
3249 {
3250  bytea *result;
3251  int len1,
3252  len2,
3253  len;
3254  char *ptr;
3255 
3256  len1 = VARSIZE_ANY_EXHDR(t1);
3257  len2 = VARSIZE_ANY_EXHDR(t2);
3258 
3259  /* paranoia ... probably should throw error instead? */
3260  if (len1 < 0)
3261  len1 = 0;
3262  if (len2 < 0)
3263  len2 = 0;
3264 
3265  len = len1 + len2 + VARHDRSZ;
3266  result = (bytea *) palloc(len);
3267 
3268  /* Set size of result string... */
3269  SET_VARSIZE(result, len);
3270 
3271  /* Fill data field of result string... */
3272  ptr = VARDATA(result);
3273  if (len1 > 0)
3274  memcpy(ptr, VARDATA_ANY(t1), len1);
3275  if (len2 > 0)
3276  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3277 
3278  return result;
3279 }
3280 
/*
 * PG_STR_GET_BYTEA(str_): pass the C string str_ to byteain via
 * DirectFunctionCall1() and convert the resulting Datum to a bytea pointer.
 */
3281 #define PG_STR_GET_BYTEA(str_) \
3282  DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3283 
3284 /*
3285  * bytea_substr()
3286  * Return a substring starting at the specified position.
3287  * Cloned from text_substr and modified as required.
3288  *
3289  * Input:
3290  * - string
3291  * - starting position (is one-based)
3292  * - string length (optional)
3293  *
3294  * If the starting position is zero or less, then return from the start of the string
3295  * adjusting the length to be consistent with the "negative start" per SQL.
3296  * If the length is less than zero, an ERROR is thrown. If no third argument
3297  * (length) is provided, the length to the end of the string is assumed.
3298  */
3299 Datum
3301 {
3303  PG_GETARG_INT32(1),
3304  PG_GETARG_INT32(2),
3305  false));
3306 }
3307 
3308 /*
3309  * bytea_substr_no_len -
3310  * Wrapper to avoid opr_sanity failure due to
3311  * one function accepting a different number of args.
3312  */
3313 Datum
3315 {
3317  PG_GETARG_INT32(1),
3318  -1,
3319  true));
3320 }
3321 
3322 static bytea *
3324  int S,
3325  int L,
3326  bool length_not_specified)
3327 {
3328  int32 S1; /* adjusted start position */
3329  int32 L1; /* adjusted substring length */
3330  int32 E; /* end position */
3331 
3332  /*
3333  * The logic here should generally match text_substring().
3334  */
3335  S1 = Max(S, 1);
3336 
3337  if (length_not_specified)
3338  {
3339  /*
3340  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3341  * end of the string if we pass it a negative value for length.
3342  */
3343  L1 = -1;
3344  }
3345  else if (L < 0)
3346  {
3347  /* SQL99 says to throw an error for E < S, i.e., negative length */
3348  ereport(ERROR,
3349  (errcode(ERRCODE_SUBSTRING_ERROR),
3350  errmsg("negative substring length not allowed")));
3351  L1 = -1; /* silence stupider compilers */
3352  }
3353  else if (pg_add_s32_overflow(S, L, &E))
3354  {
3355  /*
3356  * L could be large enough for S + L to overflow, in which case the
3357  * substring must run to end of string.
3358  */
3359  L1 = -1;
3360  }
3361  else
3362  {
3363  /*
3364  * A zero or negative value for the end position can happen if the
3365  * start was negative or one. SQL99 says to return a zero-length
3366  * string.
3367  */
3368  if (E < 1)
3369  return PG_STR_GET_BYTEA("");
3370 
3371  L1 = E - S1;
3372  }
3373 
3374  /*
3375  * If the start position is past the end of the string, SQL99 says to
3376  * return a zero-length string -- DatumGetByteaPSlice() will do that for
3377  * us. We need only convert S1 to zero-based starting position.
3378  */
3379  return DatumGetByteaPSlice(str, S1 - 1, L1);
3380 }
3381 
3382 /*
3383  * byteaoverlay
3384  * Replace specified substring of first string with second
3385  *
3386  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3387  * This code is a direct implementation of what the standard says.
3388  */
3389 Datum
3391 {
3392  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3393  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3394  int sp = PG_GETARG_INT32(2); /* substring start position */
3395  int sl = PG_GETARG_INT32(3); /* substring length */
3396 
3397  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3398 }
3399 
3400 Datum
3402 {
3403  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3404  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3405  int sp = PG_GETARG_INT32(2); /* substring start position */
3406  int sl;
3407 
3408  sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3409  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3410 }
3411 
3412 static bytea *
3413 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3414 {
3415  bytea *result;
3416  bytea *s1;
3417  bytea *s2;
3418  int sp_pl_sl;
3419 
3420  /*
3421  * Check for possible integer-overflow cases. For negative sp, throw a
3422  * "substring length" error because that's what should be expected
3423  * according to the spec's definition of OVERLAY().
3424  */
3425  if (sp <= 0)
3426  ereport(ERROR,
3427  (errcode(ERRCODE_SUBSTRING_ERROR),
3428  errmsg("negative substring length not allowed")));
3429  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3430  ereport(ERROR,
3431  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3432  errmsg("integer out of range")));
3433 
3434  s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3435  s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3436  result = bytea_catenate(s1, t2);
3437  result = bytea_catenate(result, s2);
3438 
3439  return result;
3440 }
3441 
3442 /*
3443  * byteapos -
3444  * Return the position of the specified substring.
3445  * Implements the SQL POSITION() function.
3446  * Cloned from textpos and modified as required.
3447  */
3448 Datum
3450 {
3451  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3452  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3453  int pos;
3454  int px,
3455  p;
3456  int len1,
3457  len2;
3458  char *p1,
3459  *p2;
3460 
3461  len1 = VARSIZE_ANY_EXHDR(t1);
3462  len2 = VARSIZE_ANY_EXHDR(t2);
3463 
3464  if (len2 <= 0)
3465  PG_RETURN_INT32(1); /* result for empty pattern */
3466 
3467  p1 = VARDATA_ANY(t1);
3468  p2 = VARDATA_ANY(t2);
3469 
3470  pos = 0;
3471  px = (len1 - len2);
3472  for (p = 0; p <= px; p++)
3473  {
3474  if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3475  {
3476  pos = p + 1;
3477  break;
3478  };
3479  p1++;
3480  };
3481 
3482  PG_RETURN_INT32(pos);
3483 }
3484 
3485 /*-------------------------------------------------------------
3486  * byteaGetByte
3487  *
3488  * this routine treats "bytea" as an array of bytes.
3489  * It returns the Nth byte (a number between 0 and 255).
3490  *-------------------------------------------------------------
3491  */
3492 Datum
3494 {
3495  bytea *v = PG_GETARG_BYTEA_PP(0);
3496  int32 n = PG_GETARG_INT32(1);
3497  int len;
3498  int byte;
3499 
3500  len = VARSIZE_ANY_EXHDR(v);
3501 
3502  if (n < 0 || n >= len)
3503  ereport(ERROR,
3504  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3505  errmsg("index %d out of valid range, 0..%d",
3506  n, len - 1)));
3507 
3508  byte = ((unsigned char *) VARDATA_ANY(v))[n];
3509 
3510  PG_RETURN_INT32(byte);
3511 }
3512 
3513 /*-------------------------------------------------------------
3514  * byteaGetBit
3515  *
3516  * This routine treats a "bytea" type like an array of bits.
3517  * It returns the value of the Nth bit (0 or 1).
3518  *
3519  *-------------------------------------------------------------
3520  */
3521 Datum
3523 {
3524  bytea *v = PG_GETARG_BYTEA_PP(0);
3525  int64 n = PG_GETARG_INT64(1);
3526  int byteNo,
3527  bitNo;
3528  int len;
3529  int byte;
3530 
3531  len = VARSIZE_ANY_EXHDR(v);
3532 
3533  if (n < 0 || n >= (int64) len * 8)
3534  ereport(ERROR,
3535  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3536  errmsg("index %lld out of valid range, 0..%lld",
3537  (long long) n, (long long) len * 8 - 1)));
3538 
3539  /* n/8 is now known < len, so safe to cast to int */
3540  byteNo = (int) (n / 8);
3541  bitNo = (int) (n % 8);
3542 
3543  byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3544 
3545  if (byte & (1 << bitNo))
3546  PG_RETURN_INT32(1);
3547  else
3548  PG_RETURN_INT32(0);
3549 }
3550 
3551 /*-------------------------------------------------------------
3552  * byteaSetByte
3553  *
3554  * Given an instance of type 'bytea' creates a new one with
3555  * the Nth byte set to the given value.
3556  *
3557  *-------------------------------------------------------------
3558  */
3559 Datum
3561 {
3562  bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3563  int32 n = PG_GETARG_INT32(1);
3564  int32 newByte = PG_GETARG_INT32(2);
3565  int len;
3566 
3567  len = VARSIZE(res) - VARHDRSZ;
3568 
3569  if (n < 0 || n >= len)
3570  ereport(ERROR,
3571  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3572  errmsg("index %d out of valid range, 0..%d",
3573  n, len - 1)));
3574 
3575  /*
3576  * Now set the byte.
3577  */
3578  ((unsigned char *) VARDATA(res))[n] = newByte;
3579 
3580  PG_RETURN_BYTEA_P(res);
3581 }
3582 
3583 /*-------------------------------------------------------------
3584  * byteaSetBit
3585  *
3586  * Given an instance of type 'bytea' creates a new one with
3587  * the Nth bit set to the given value.
3588  *
3589  *-------------------------------------------------------------
3590  */
3591 Datum
3593 {
3594  bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3595  int64 n = PG_GETARG_INT64(1);
3596  int32 newBit = PG_GETARG_INT32(2);
3597  int len;
3598  int oldByte,
3599  newByte;
3600  int byteNo,
3601  bitNo;
3602 
3603  len = VARSIZE(res) - VARHDRSZ;
3604 
3605  if (n < 0 || n >= (int64) len * 8)
3606  ereport(ERROR,
3607  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3608  errmsg("index %lld out of valid range, 0..%lld",
3609  (long long) n, (long long) len * 8 - 1)));
3610 
3611  /* n/8 is now known < len, so safe to cast to int */
3612  byteNo = (int) (n / 8);
3613  bitNo = (int) (n % 8);
3614 
3615  /*
3616  * sanity check!
3617  */
3618  if (newBit != 0 && newBit != 1)
3619  ereport(ERROR,
3620  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3621  errmsg("new bit must be 0 or 1")));
3622 
3623  /*
3624  * Update the byte.
3625  */
3626  oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3627 
3628  if (newBit == 0)
3629  newByte = oldByte & (~(1 << bitNo));
3630  else
3631  newByte = oldByte | (1 << bitNo);
3632 
3633  ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3634 
3635  PG_RETURN_BYTEA_P(res);
3636 }
3637 
3638 
3639 /* text_name()
3640  * Converts a text type to a Name type.
3641  */
3642 Datum
3644 {
3645  text *s = PG_GETARG_TEXT_PP(0);
3646  Name result;
3647  int len;
3648 
3649  len = VARSIZE_ANY_EXHDR(s);
3650 
3651  /* Truncate oversize input */
3652  if (len >= NAMEDATALEN)
3653  len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3654 
3655  /* We use palloc0 here to ensure result is zero-padded */
3656  result = (Name) palloc0(NAMEDATALEN);
3657  memcpy(NameStr(*result), VARDATA_ANY(s), len);
3658 
3659  PG_RETURN_NAME(result);
3660 }
3661 
3662 /* name_text()
3663  * Converts a Name type to a text type.
3664  */
3665 Datum
3667 {
3668  Name s = PG_GETARG_NAME(0);
3669 
3671 }
3672 
3673 
3674 /*
3675  * textToQualifiedNameList - convert a text object to list of names
3676  *
3677  * This implements the input parsing needed by nextval() and other
3678  * functions that take a text parameter representing a qualified name.
3679  * We split the name at dots, downcase if not double-quoted, and
3680  * truncate names if they're too long.
3681  */
3682 List *
3684 {
3685  char *rawname;
3686  List *result = NIL;
3687  List *namelist;
3688  ListCell *l;
3689 
3690  /* Convert to C string (handles possible detoasting). */
3691  /* Note we rely on being able to modify rawname below. */
3692  rawname = text_to_cstring(textval);
3693 
3694  if (!SplitIdentifierString(rawname, '.', &namelist))
3695  ereport(ERROR,
3696  (errcode(ERRCODE_INVALID_NAME),
3697  errmsg("invalid name syntax")));
3698 
3699  if (namelist == NIL)
3700  ereport(ERROR,
3701  (errcode(ERRCODE_INVALID_NAME),
3702  errmsg("invalid name syntax")));
3703 
3704  foreach(l, namelist)
3705  {
3706  char *curname = (char *) lfirst(l);
3707 
3708  result = lappend(result, makeString(pstrdup(curname)));
3709  }
3710 
3711  pfree(rawname);
3712  list_free(namelist);
3713 
3714  return result;
3715 }
3716 
3717 /*
3718  * SplitIdentifierString --- parse a string containing identifiers
3719  *
3720  * This is the guts of textToQualifiedNameList, and is exported for use in
3721  * other situations such as parsing GUC variables. In the GUC case, it's
3722  * important to avoid memory leaks, so the API is designed to minimize the
3723  * amount of stuff that needs to be allocated and freed.
3724  *
3725  * Inputs:
3726  * rawstring: the input string; must be overwritable! On return, it's
3727  * been modified to contain the separated identifiers.
3728  * separator: the separator punctuation expected between identifiers
3729  * (typically '.' or ','). Whitespace may also appear around
3730  * identifiers.
3731  * Outputs:
3732  * namelist: filled with a palloc'd list of pointers to identifiers within
3733  * rawstring. Caller should list_free() this even on error return.
3734  *
3735  * Returns true if okay, false if there is a syntax error in the string.
3736  *
3737  * Note that an empty string is considered okay here, though not in
3738  * textToQualifiedNameList.
3739  */
3740 bool
3741 SplitIdentifierString(char *rawstring, char separator,
3742  List **namelist)
3743 {
3744  char *nextp = rawstring;
3745  bool done = false;
3746 
3747  *namelist = NIL;
3748 
3749  while (scanner_isspace(*nextp))
3750  nextp++; /* skip leading whitespace */
3751 
3752  if (*nextp == '\0')
3753  return true; /* allow empty string */
3754 
3755  /* At the top of the loop, we are at start of a new identifier. */
3756  do
3757  {
3758  char *curname;
3759  char *endp;
3760 
3761  if (*nextp == '"')
3762  {
3763  /* Quoted name --- collapse quote-quote pairs, no downcasing */
3764  curname = nextp + 1;
3765  for (;;)
3766  {
3767  endp = strchr(nextp + 1, '"');
3768  if (endp == NULL)
3769  return false; /* mismatched quotes */
3770  if (endp[1] != '"')
3771  break; /* found end of quoted name */
3772  /* Collapse adjacent quotes into one quote, and look again */
3773  memmove(endp, endp + 1, strlen(endp));
3774  nextp = endp;
3775  }
3776  /* endp now points at the terminating quote */
3777  nextp = endp + 1;
3778  }
3779  else
3780  {
3781  /* Unquoted name --- extends to separator or whitespace */
3782  char *downname;
3783  int len;
3784 
3785  curname = nextp;
3786  while (*nextp && *nextp != separator &&
3787  !scanner_isspace(*nextp))
3788  nextp++;
3789  endp = nextp;
3790  if (curname == nextp)
3791  return false; /* empty unquoted name not allowed */
3792 
3793  /*
3794  * Downcase the identifier, using same code as main lexer does.
3795  *
3796  * XXX because we want to overwrite the input in-place, we cannot
3797  * support a downcasing transformation that increases the string
3798  * length. This is not a problem given the current implementation
3799  * of downcase_truncate_identifier, but we'll probably have to do
3800  * something about this someday.
3801  */
3802  len = endp - curname;
3803  downname = downcase_truncate_identifier(curname, len, false);
3804  Assert(strlen(downname) <= len);
3805  strncpy(curname, downname, len); /* strncpy is required here */
3806  pfree(downname);
3807  }
3808 
3809  while (scanner_isspace(*nextp))
3810  nextp++; /* skip trailing whitespace */
3811 
3812  if (*nextp == separator)
3813  {
3814  nextp++;
3815  while (scanner_isspace(*nextp))
3816  nextp++; /* skip leading whitespace for next */
3817  /* we expect another name, so done remains false */
3818  }
3819  else if (*nextp == '\0')
3820  done = true;
3821  else
3822  return false; /* invalid syntax */
3823 
3824  /* Now safe to overwrite separator with a null */
3825  *endp = '\0';
3826 
3827  /* Truncate name if it's overlength */
3828  truncate_identifier(curname, strlen(curname), false);
3829 
3830  /*
3831  * Finished isolating current name --- add it to list
3832  */
3833  *namelist = lappend(*namelist, curname);
3834 
3835  /* Loop back if we didn't reach end of string */
3836  } while (!done);
3837 
3838  return true;
3839 }
3840 
3841 
3842 /*
3843  * SplitDirectoriesString --- parse a string containing file/directory names
3844  *
3845  * This works fine on file names too; the function name is historical.
3846  *
3847  * This is similar to SplitIdentifierString, except that the parsing
3848  * rules are meant to handle pathnames instead of identifiers: there is
3849  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3850  * and we apply canonicalize_path() to each extracted string. Because of the
3851  * last, the returned strings are separately palloc'd rather than being
3852  * pointers into rawstring --- but we still scribble on rawstring.
3853  *
3854  * Inputs:
3855  * rawstring: the input string; must be modifiable!
3856  * separator: the separator punctuation expected between directories
3857  * (typically ',' or ';'). Whitespace may also appear around
3858  * directories.
3859  * Outputs:
3860  * namelist: filled with a palloc'd list of directory names.
3861  * Caller should list_free_deep() this even on error return.
3862  *
3863  * Returns true if okay, false if there is a syntax error in the string.
3864  *
3865  * Note that an empty string is considered okay here.
3866  */
3867 bool
3868 SplitDirectoriesString(char *rawstring, char separator,
3869  List **namelist)
3870 {
3871  char *nextp = rawstring;
3872  bool done = false;
3873 
3874  *namelist = NIL;
3875 
3876  while (scanner_isspace(*nextp))
3877  nextp++; /* skip leading whitespace */
3878 
3879  if (*nextp == '\0')
3880  return true; /* allow empty string */
3881 
3882  /* At the top of the loop, we are at start of a new directory. */
3883  do
3884  {
3885  char *curname;
3886  char *endp;
3887 
3888  if (*nextp == '"')
3889  {
3890  /* Quoted name --- collapse quote-quote pairs */
3891  curname = nextp + 1;
3892  for (;;)
3893  {
3894  endp = strchr(nextp + 1, '"');
3895  if (endp == NULL)
3896  return false; /* mismatched quotes */
3897  if (endp[1] != '"')
3898  break; /* found end of quoted name */
3899  /* Collapse adjacent quotes into one quote, and look again */
3900  memmove(endp, endp + 1, strlen(endp));
3901  nextp = endp;
3902  }
3903  /* endp now points at the terminating quote */
3904  nextp = endp + 1;
3905  }
3906  else
3907  {
3908  /* Unquoted name --- extends to separator or end of string */
3909  curname = endp = nextp;
3910  while (*nextp && *nextp != separator)
3911  {
3912  /* trailing whitespace should not be included in name */
3913  if (!scanner_isspace(*nextp))
3914  endp = nextp + 1;
3915  nextp++;
3916  }
3917  if (curname == endp)
3918  return false; /* empty unquoted name not allowed */
3919  }
3920 
3921  while (scanner_isspace(*nextp))
3922  nextp++; /* skip trailing whitespace */
3923 
3924  if (*nextp == separator)
3925  {
3926  nextp++;
3927  while (scanner_isspace(*nextp))
3928  nextp++; /* skip leading whitespace for next */
3929  /* we expect another name, so done remains false */
3930  }
3931  else if (*nextp == '\0')
3932  done = true;
3933  else
3934  return false; /* invalid syntax */
3935 
3936  /* Now safe to overwrite separator with a null */
3937  *endp = '\0';
3938 
3939  /* Truncate path if it's overlength */
3940  if (strlen(curname) >= MAXPGPATH)
3941  curname[MAXPGPATH - 1] = '\0';
3942 
3943  /*
3944  * Finished isolating current name --- add it to list
3945  */
3946  curname = pstrdup(curname);
3947  canonicalize_path(curname);
3948  *namelist = lappend(*namelist, curname);
3949 
3950  /* Loop back if we didn't reach end of string */
3951  } while (!done);
3952 
3953  return true;
3954 }
3955 
3956 
3957 /*
3958  * SplitGUCList --- parse a string containing identifiers or file names
3959  *
3960  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3961  * presuming whether the elements will be taken as identifiers or file names.
3962  * We assume the input has already been through flatten_set_variable_args(),
3963  * so that we need never downcase (if appropriate, that was done already).
3964  * Nor do we ever truncate, since we don't know the correct max length.
3965  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3966  * because any embedded whitespace should have led to double-quoting).
3967  * Otherwise the API is identical to SplitIdentifierString.
3968  *
3969  * XXX it's annoying to have so many copies of this string-splitting logic.
3970  * However, it's not clear that having one function with a bunch of option
3971  * flags would be much better.
3972  *
3973  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3974  * Be sure to update that if you have to change this.
3975  *
3976  * Inputs:
3977  * rawstring: the input string; must be overwritable! On return, it's
3978  * been modified to contain the separated identifiers.
3979  * separator: the separator punctuation expected between identifiers
3980  * (typically '.' or ','). Whitespace may also appear around
3981  * identifiers.
3982  * Outputs:
3983  * namelist: filled with a palloc'd list of pointers to identifiers within
3984  * rawstring. Caller should list_free() this even on error return.
3985  *
3986  * Returns true if okay, false if there is a syntax error in the string.
3987  */
3988 bool
3989 SplitGUCList(char *rawstring, char separator,
3990  List **namelist)
3991 {
3992  char *nextp = rawstring;
3993  bool done = false;
3994 
3995  *namelist = NIL;
3996 
3997  while (scanner_isspace(*nextp))
3998  nextp++; /* skip leading whitespace */
3999 
4000  if (*nextp == '\0')
4001  return true; /* allow empty string */
4002 
4003  /* At the top of the loop, we are at start of a new identifier. */
4004  do
4005  {
4006  char *curname;
4007  char *endp;
4008 
4009  if (*nextp == '"')
4010  {
4011  /* Quoted name --- collapse quote-quote pairs */
4012  curname = nextp + 1;
4013  for (;;)
4014  {
4015  endp = strchr(nextp + 1, '"');
4016  if (endp == NULL)
4017  return false; /* mismatched quotes */
4018  if (endp[1] != '"')
4019  break; /* found end of quoted name */
4020  /* Collapse adjacent quotes into one quote, and look again */
4021  memmove(endp, endp + 1, strlen(endp));
4022  nextp = endp;
4023  }
4024  /* endp now points at the terminating quote */
4025  nextp = endp + 1;
4026  }
4027  else
4028  {
4029  /* Unquoted name --- extends to separator or whitespace */
4030  curname = nextp;
4031  while (*nextp && *nextp != separator &&
4032  !scanner_isspace(*nextp))
4033  nextp++;
4034  endp = nextp;
4035  if (curname == nextp)
4036  return false; /* empty unquoted name not allowed */
4037  }
4038 
4039  while (scanner_isspace(*nextp))
4040  nextp++; /* skip trailing whitespace */
4041 
4042  if (*nextp == separator)
4043  {
4044  nextp++;
4045  while (scanner_isspace(*nextp))
4046  nextp++; /* skip leading whitespace for next */
4047  /* we expect another name, so done remains false */
4048  }
4049  else if (*nextp == '\0')
4050  done = true;
4051  else
4052  return false; /* invalid syntax */
4053 
4054  /* Now safe to overwrite separator with a null */
4055  *endp = '\0';
4056 
4057  /*
4058  * Finished isolating current name --- add it to list
4059  */
4060  *namelist = lappend(*namelist, curname);
4061 
4062  /* Loop back if we didn't reach end of string */
4063  } while (!done);
4064 
4065  return true;
4066 }
4067 
4068 
4069 /*****************************************************************************
4070  * Comparison Functions used for bytea
4071  *
4072  * Note: btree indexes need these routines not to leak memory; therefore,
4073  * be careful to free working copies of toasted datums. Most places don't
4074  * need to be so careful.
4075  *****************************************************************************/
4076 
4077 Datum
4079 {
4080  Datum arg1 = PG_GETARG_DATUM(0);
4081  Datum arg2 = PG_GETARG_DATUM(1);
4082  bool result;
4083  Size len1,
4084  len2;
4085 
4086  /*
4087  * We can use a fast path for unequal lengths, which might save us from
4088  * having to detoast one or both values.
4089  */
4090  len1 = toast_raw_datum_size(arg1);
4091  len2 = toast_raw_datum_size(arg2);
4092  if (len1 != len2)
4093  result = false;
4094  else
4095  {
4096  bytea *barg1 = DatumGetByteaPP(arg1);
4097  bytea *barg2 = DatumGetByteaPP(arg2);
4098 
4099  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4100  len1 - VARHDRSZ) == 0);
4101 
4102  PG_FREE_IF_COPY(barg1, 0);
4103  PG_FREE_IF_COPY(barg2, 1);
4104  }
4105 
4106  PG_RETURN_BOOL(result);
4107 }
4108 
4109 Datum
4111 {
4112  Datum arg1 = PG_GETARG_DATUM(0);
4113  Datum arg2 = PG_GETARG_DATUM(1);
4114  bool result;
4115  Size len1,
4116  len2;
4117 
4118  /*
4119  * We can use a fast path for unequal lengths, which might save us from
4120  * having to detoast one or both values.
4121  */
4122  len1 = toast_raw_datum_size(arg1);
4123  len2 = toast_raw_datum_size(arg2);
4124  if (len1 != len2)
4125  result = true;
4126  else
4127  {
4128  bytea *barg1 = DatumGetByteaPP(arg1);
4129  bytea *barg2 = DatumGetByteaPP(arg2);
4130 
4131  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4132  len1 - VARHDRSZ) != 0);
4133 
4134  PG_FREE_IF_COPY(barg1, 0);
4135  PG_FREE_IF_COPY(barg2, 1);
4136  }
4137 
4138  PG_RETURN_BOOL(result);
4139 }
4140 
4141 Datum
4143 {
4144  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4145  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4146  int len1,
4147  len2;
4148  int cmp;
4149 
4150  len1 = VARSIZE_ANY_EXHDR(arg1);
4151  len2 = VARSIZE_ANY_EXHDR(arg2);
4152 
4153  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4154 
4155  PG_FREE_IF_COPY(arg1, 0);
4156  PG_FREE_IF_COPY(arg2, 1);
4157 
4158  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4159 }
4160 
4161 Datum
4163 {
4164  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4165  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4166  int len1,
4167  len2;
4168  int cmp;
4169 
4170  len1 = VARSIZE_ANY_EXHDR(arg1);
4171  len2 = VARSIZE_ANY_EXHDR(arg2);
4172 
4173  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4174 
4175  PG_FREE_IF_COPY(arg1, 0);
4176  PG_FREE_IF_COPY(arg2, 1);
4177 
4178  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4179 }
4180 
4181 Datum
4183 {
4184  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4185  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4186  int len1,
4187  len2;
4188  int cmp;
4189 
4190  len1 = VARSIZE_ANY_EXHDR(arg1);
4191  len2 = VARSIZE_ANY_EXHDR(arg2);
4192 
4193  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4194 
4195  PG_FREE_IF_COPY(arg1, 0);
4196  PG_FREE_IF_COPY(arg2, 1);
4197 
4198  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4199 }
4200 
4201 Datum
4203 {
4204  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4205  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4206  int len1,
4207  len2;
4208  int cmp;
4209 
4210  len1 = VARSIZE_ANY_EXHDR(arg1);
4211  len2 = VARSIZE_ANY_EXHDR(arg2);
4212 
4213  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4214 
4215  PG_FREE_IF_COPY(arg1, 0);
4216  PG_FREE_IF_COPY(arg2, 1);
4217 
4218  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4219 }
4220 
4221 Datum
4223 {
4224  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4225  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4226  int len1,
4227  len2;
4228  int cmp;
4229 
4230  len1 = VARSIZE_ANY_EXHDR(arg1);
4231  len2 = VARSIZE_ANY_EXHDR(arg2);
4232 
4233  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4234  if ((cmp == 0) && (len1 != len2))
4235  cmp = (len1 < len2) ? -1 : 1;
4236 
4237  PG_FREE_IF_COPY(arg1, 0);
4238  PG_FREE_IF_COPY(arg2, 1);
4239 
4240  PG_RETURN_INT32(cmp);
4241 }
4242 
4243 Datum
4245 {
4247  MemoryContext oldcontext;
4248 
4249  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4250 
4251  /* Use generic string SortSupport, forcing "C" collation */
4252  varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4253 
4254  MemoryContextSwitchTo(oldcontext);
4255 
4256  PG_RETURN_VOID();
4257 }
4258 
4259 /*
4260  * appendStringInfoText
4261  *
4262  * Append a text to str.
4263  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4264  */
4265 static void
4267 {
4269 }
4270 
4271 /*
4272  * replace_text
4273  * replace all occurrences of 'old_sub_str' in 'orig_str'
4274  * with 'new_sub_str' to form 'new_str'
4275  *
4276  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4277  * otherwise returns 'new_str'
4278  */
4279 Datum
4281 {
4282  text *src_text = PG_GETARG_TEXT_PP(0);
4283  text *from_sub_text = PG_GETARG_TEXT_PP(1);
4284  text *to_sub_text = PG_GETARG_TEXT_PP(2);
4285  int src_text_len;
4286  int from_sub_text_len;
4288  text *ret_text;
4289  int chunk_len;
4290  char *curr_ptr;
4291  char *start_ptr;
4293  bool found;
4294 
4295  src_text_len = VARSIZE_ANY_EXHDR(src_text);
4296  from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4297 
4298  /* Return unmodified source string if empty source or pattern */
4299  if (src_text_len < 1 || from_sub_text_len < 1)
4300  {
4301  PG_RETURN_TEXT_P(src_text);
4302  }
4303 
4304  text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4305 
4306  found = text_position_next(&state);
4307 
4308  /* When the from_sub_text is not found, there is nothing to do. */
4309  if (!found)
4310  {
4311  text_position_cleanup(&state);
4312  PG_RETURN_TEXT_P(src_text);
4313  }
4314  curr_ptr = text_position_get_match_ptr(&state);
4315  start_ptr = VARDATA_ANY(src_text);
4316 
4317  initStringInfo(&str);
4318 
4319  do
4320  {
4322 
4323  /* copy the data skipped over by last text_position_next() */
4324  chunk_len = curr_ptr - start_ptr;
4325  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4326 
4327  appendStringInfoText(&str, to_sub_text);
4328 
4329  start_ptr = curr_ptr + from_sub_text_len;
4330 
4331  found = text_position_next(&state);
4332  if (found)
4333  curr_ptr = text_position_get_match_ptr(&state);
4334  }
4335  while (found);
4336 
4337  /* copy trailing data */
4338  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4339  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4340 
4341  text_position_cleanup(&state);
4342 
4343  ret_text = cstring_to_text_with_len(str.data, str.len);
4344  pfree(str.data);
4345 
4346  PG_RETURN_TEXT_P(ret_text);
4347 }
4348 
4349 /*
4350  * check_replace_text_has_escape_char
4351  *
4352  * check whether replace_text contains escape char.
4353  */
4354 static bool
4356 {
4357  const char *p = VARDATA_ANY(replace_text);
4358  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4359 
4361  {
4362  for (; p < p_end; p++)
4363  {
4364  if (*p == '\\')
4365  return true;
4366  }
4367  }
4368  else
4369  {
4370  for (; p < p_end; p += pg_mblen(p))
4371  {
4372  if (*p == '\\')
4373  return true;
4374  }
4375  }
4376 
4377  return false;
4378 }
4379 
4380 /*
4381  * appendStringInfoRegexpSubstr
4382  *
4383  * Append replace_text to str, substituting regexp back references for
4384  * \n escapes. start_ptr is the start of the match in the source string,
4385  * at logical character position data_pos.
4386  */
4387 static void
4389  regmatch_t *pmatch,
4390  char *start_ptr, int data_pos)
4391 {
4392  const char *p = VARDATA_ANY(replace_text);
4393  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4394  int eml = pg_database_encoding_max_length();
4395 
4396  for (;;)
4397  {
4398  const char *chunk_start = p;
4399  int so;
4400  int eo;
4401 
4402  /* Find next escape char. */
4403  if (eml == 1)
4404  {
4405  for (; p < p_end && *p != '\\'; p++)
4406  /* nothing */ ;
4407  }
4408  else
4409  {
4410  for (; p < p_end && *p != '\\'; p += pg_mblen(p))
4411  /* nothing */ ;
4412  }
4413 
4414  /* Copy the text we just scanned over, if any. */
4415  if (p > chunk_start)
4416  appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4417 
4418  /* Done if at end of string, else advance over escape char. */
4419  if (p >= p_end)
4420  break;
4421  p++;
4422 
4423  if (p >= p_end)
4424  {
4425  /* Escape at very end of input. Treat same as unexpected char */
4426  appendStringInfoChar(str, '\\');
4427  break;
4428  }
4429 
4430  if (*p >= '1' && *p <= '9')
4431  {
4432  /* Use the back reference of regexp. */
4433  int idx = *p - '0';
4434 
4435  so = pmatch[idx].rm_so;
4436  eo = pmatch[idx].rm_eo;
4437  p++;
4438  }
4439  else if (*p == '&')
4440  {
4441  /* Use the entire matched string. */
4442  so = pmatch[0].rm_so;
4443  eo = pmatch[0].rm_eo;
4444  p++;
4445  }
4446  else if (*p == '\\')
4447  {
4448  /* \\ means transfer one \ to output. */
4449  appendStringInfoChar(str, '\\');
4450  p++;
4451  continue;
4452  }
4453  else
4454  {
4455  /*
4456  * If escape char is not followed by any expected char, just treat
4457  * it as ordinary data to copy. (XXX would it be better to throw
4458  * an error?)
4459  */
4460  appendStringInfoChar(str, '\\');
4461  continue;
4462  }
4463 
4464  if (so != -1 && eo != -1)
4465  {
4466  /*
4467  * Copy the text that is back reference of regexp. Note so and eo
4468  * are counted in characters not bytes.
4469  */
4470  char *chunk_start;
4471  int chunk_len;
4472 
4473  Assert(so >= data_pos);
4474  chunk_start = start_ptr;
4475  chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4476  chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4477  appendBinaryStringInfo(str, chunk_start, chunk_len);
4478  }
4479  }
4480 }
4481 
4482 #define REGEXP_REPLACE_BACKREF_CNT 10
4483 
4484 /*
4485  * replace_text_regexp
4486  *
4487  * replace text that matches to regexp in src_text to replace_text.
4488  *
4489  * Note: to avoid having to include regex.h in builtins.h, we declare
4490  * the regexp argument as void *, but really it's regex_t *.
4491  */
4492 text *
4493 replace_text_regexp(text *src_text, void *regexp,
4494  text *replace_text, bool glob)
4495 {
4496  text *ret_text;
4497  regex_t *re = (regex_t *) regexp;
4498  int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4501  pg_wchar *data;
4502  size_t data_len;
4503  int search_start;
4504  int data_pos;
4505  char *start_ptr;
4506  bool have_escape;
4507 
4508  initStringInfo(&buf);
4509 
4510  /* Convert data string to wide characters. */
4511  data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4512  data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4513 
4514  /* Check whether replace_text has escape char. */
4515  have_escape = check_replace_text_has_escape_char(replace_text);
4516 
4517  /* start_ptr points to the data_pos'th character of src_text */
4518  start_ptr = (char *) VARDATA_ANY(src_text);
4519  data_pos = 0;
4520 
4521  search_start = 0;
4522  while (search_start <= data_len)
4523  {
4524  int regexec_result;
4525 
4527 
4528  regexec_result = pg_regexec(re,
4529  data,
4530  data_len,
4531  search_start,
4532  NULL, /* no details */
4534  pmatch,
4535  0);
4536 
4537  if (regexec_result == REG_NOMATCH)
4538  break;
4539 
4540  if (regexec_result != REG_OKAY)
4541  {
4542  char errMsg[100];
4543 
4545  pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4546  ereport(ERROR,
4547  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4548  errmsg("regular expression failed: %s", errMsg)));
4549  }
4550 
4551  /*
4552  * Copy the text to the left of the match position. Note we are given
4553  * character not byte indexes.
4554  */
4555  if (pmatch[0].rm_so - data_pos > 0)
4556  {
4557  int chunk_len;
4558 
4559  chunk_len = charlen_to_bytelen(start_ptr,
4560  pmatch[0].rm_so - data_pos);
4561  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4562 
4563  /*
4564  * Advance start_ptr over that text, to avoid multiple rescans of
4565  * it if the replace_text contains multiple back-references.
4566  */
4567  start_ptr += chunk_len;
4568  data_pos = pmatch[0].rm_so;
4569  }
4570 
4571  /*
4572  * Copy the replace_text. Process back references when the
4573  * replace_text has escape characters.
4574  */
4575  if (have_escape)
4576  appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4577  start_ptr, data_pos);
4578  else
4579  appendStringInfoText(&buf, replace_text);
4580 
4581  /* Advance start_ptr and data_pos over the matched text. */
4582  start_ptr += charlen_to_bytelen(start_ptr,
4583  pmatch[0].rm_eo - data_pos);
4584  data_pos = pmatch[0].rm_eo;
4585 
4586  /*
4587  * When global option is off, replace the first instance only.
4588  */
4589  if (!glob)
4590  break;
4591 
4592  /*
4593  * Advance search position. Normally we start the next search at the
4594  * end of the previous match; but if the match was of zero length, we
4595  * have to advance by one character, or we'd just find the same match
4596  * again.
4597  */
4598  search_start = data_pos;
4599  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4600  search_start++;
4601  }
4602 
4603  /*
4604  * Copy the text to the right of the last match.
4605  */
4606  if (data_pos < data_len)
4607  {
4608  int chunk_len;
4609 
4610  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4611  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4612  }
4613 
4614  ret_text = cstring_to_text_with_len(buf.data, buf.len);
4615  pfree(buf.data);
4616  pfree(data);
4617 
4618  return ret_text;
4619 }
4620 
4621 /*
4622  * split_part
4623  * parse input string based on provided field separator
4624  * return N'th item (1 based, negative counts from end)
 *
 * Arguments, as fetched below: 0 = input string (text), 1 = field
 * separator (text), 2 = field number (int32).  A field number of zero is
 * rejected with ERRCODE_INVALID_PARAMETER_VALUE.  With an empty separator,
 * or when the separator never occurs, fields 1 and -1 both yield the whole
 * input string.
 *
 * NOTE(review): the embedded listing numbers skip 4627, 4634, 4651, 4660,
 * 4676, 4697 and 4708, so the function-name line, a declaration (likely
 * that of "state") and several return statements appear to be missing from
 * this copy -- confirm against the upstream file.
4625  */
4626 Datum
4628 {
4629  text *inputstring = PG_GETARG_TEXT_PP(0);
4630  text *fldsep = PG_GETARG_TEXT_PP(1);
4631  int fldnum = PG_GETARG_INT32(2);
4632  int inputstring_len;
4633  int fldsep_len;
4635  char *start_ptr;
4636  char *end_ptr;
4637  text *result_text;
4638  bool found;
4639 
4640  /* field number is 1 based */
4641  if (fldnum == 0)
4642  ereport(ERROR,
4643  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4644  errmsg("field position must not be zero")));
4645 
 /* both lengths are byte counts, excluding the varlena header */
4646  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4647  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4648 
4649  /* return empty string for empty input string */
4650  if (inputstring_len < 1)
4652 
4653  /* handle empty field separator */
4654  if (fldsep_len < 1)
4655  {
4656  /* if first or last field, return input string, else empty string */
4657  if (fldnum == 1 || fldnum == -1)
4658  PG_RETURN_TEXT_P(inputstring);
4659  else
4661  }
4662 
4663  /* find the first field separator */
4664  text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4665 
4666  found = text_position_next(&state);
4667 
4668  /* special case if fldsep not found at all */
4669  if (!found)
4670  {
4671  text_position_cleanup(&state);
4672  /* if first or last field, return input string, else empty string */
4673  if (fldnum == 1 || fldnum == -1)
4674  PG_RETURN_TEXT_P(inputstring);
4675  else
4677  }
4678 
4679  /*
4680  * take care of a negative field number (i.e. count from the right) by
4681  * converting to a positive field number; we need total number of fields
4682  */
4683  if (fldnum < 0)
4684  {
4685  /* we found a fldsep, so there are at least two fields */
4686  int numfields = 2;
4687 
 /* every further separator match adds one more field */
4688  while (text_position_next(&state))
4689  numfields++;
4690 
4691  /* special case of last field does not require an extra pass */
4692  if (fldnum == -1)
4693  {
 /* the search state still points at the final match found above */
4694  start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4695  end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4696  text_position_cleanup(&state);
4698  end_ptr - start_ptr));
4699  }
4700 
4701  /* else, convert fldnum to positive notation */
 /* e.g. with numfields = 3, fldnum = -2 becomes -2 + 3 + 1 = 2 */
4702  fldnum += numfields + 1;
4703 
4704  /* if nonexistent field, return empty string */
4705  if (fldnum <= 0)
4706  {
4707  text_position_cleanup(&state);
4709  }
4710 
4711  /* reset to pointing at first match, but now with positive fldnum */
4712  text_position_reset(&state);
4713  found = text_position_next(&state);
4714  Assert(found);
4715  }
4716 
4717  /* identify bounds of first field */
4718  start_ptr = VARDATA_ANY(inputstring);
4719  end_ptr = text_position_get_match_ptr(&state);
4720 
 /*
  * Step forward one field per iteration.  fldnum counts down the fields
  * still to be skipped; found turns false once the separators run out,
  * which leaves fldnum > 0 on exit.
  */
4721  while (found && --fldnum > 0)
4722  {
4723  /* identify bounds of next field */
4724  start_ptr = end_ptr + fldsep_len;
4725  found = text_position_next(&state);
4726  if (found)
4727  end_ptr = text_position_get_match_ptr(&state);
4728  }
4729 
4730  text_position_cleanup(&state);
4731 
4732  if (fldnum > 0)
4733  {
4734  /* N'th field separator not found */
4735  /* if last field requested, return it, else empty string */
4736  if (fldnum == 1)
4737  {
 /* bytes consumed before the last field; the rest is the field itself */
4738  int last_len = start_ptr - VARDATA_ANY(inputstring);
4739 
4740  result_text = cstring_to_text_with_len(start_ptr,
4741  inputstring_len - last_len);
4742  }
4743  else
4744  result_text = cstring_to_text("");
4745  }
4746  else
4747  {
4748  /* non-last field requested */
4749  result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4750  }
4751 
4752  PG_RETURN_TEXT_P(result_text);
4753 }
4754 
4755 /*
4756  * Convenience function to return true when two text params are equal.
 *
 * txt1 and txt2 are the values to compare; collid is handed through to the
 * comparison call together with both values wrapped as Datums.
 *
 * NOTE(review): listing number 4761 is skipped, so the line that opens the
 * return expression (and names the function being called) is not visible
 * in this copy -- confirm against the upstream file.
4757  */
4758 static bool
4759 text_isequal(text *txt1, text *txt2, Oid collid)
4760 {
4762  collid,
4763  PointerGetDatum(txt1),
4764  PointerGetDatum(txt2)));
4765 }
4766 
4767 /*
4768  * text_to_array
4769  * parse input string and return text array of elements,
4770  * based on provided field separator
 *
 * All of the parsing is delegated to split_text(); a false result from it
 * is turned into a SQL NULL return.
 *
 * NOTE(review): listing numbers 4773, 4784, 4786 and 4787 are skipped, so
 * the function-name line and both non-NULL return statements appear to be
 * missing from this copy -- confirm against the upstream file.
4771  */
4772 Datum
4774 {
4775  SplitTextOutputData tstate;
4776 
4777  /* For array output, tstate should start as all zeroes */
4778  memset(&tstate, 0, sizeof(tstate));
4779 
4780  if (!split_text(fcinfo, &tstate))
4781  PG_RETURN_NULL();
4782 
 /* astate still NULL: split_text succeeded without accumulating any element */
4783  if (tstate.astate == NULL)
4785 
4788 }
4789 
4790 /*
4791  * text_to_array_null
4792  * parse input string and return text array of elements,
4793  * based on provided field separator and null string
4794  *
4795  * This is a separate entry point only to prevent the regression tests from
4796  * complaining about different argument sets for the same internal function.
 *
 * The call info is forwarded unchanged to text_to_array().
 *
 * NOTE(review): listing number 4799 is skipped, so the function-name line
 * appears to be missing from this copy -- confirm against upstream.
4797  */
4798 Datum
4800 {
4801  return text_to_array(fcinfo);
4802 }
4803 
4804 /*
4805  * text_to_table
4806  * parse input string and return table of elements,
4807  * based on provided field separator
 *
 * The rows are collected in a tuplestore that is handed back through the
 * caller's ReturnSetInfo (setResult / setDesc); the function's own Datum
 * result is always zero.  Errors out with ERRCODE_FEATURE_NOT_SUPPORTED if
 * the caller passed no ReturnSetInfo or does not allow SFRM_Materialize.
 *
 * NOTE(review): listing numbers 4810, 4827 and 4839 are skipped.  The
 * function-name line is missing, and old_cxt is used below without any
 * visible assignment, so the context switch that precedes the tuplestore
 * setup was presumably on line 4827 -- confirm against upstream.
4808  */
4809 Datum
4811 {
4812  ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4813  SplitTextOutputData tstate;
4814  MemoryContext old_cxt;
4815 
4816  /* check to see if caller supports us returning a tuplestore */
4817  if (rsi == NULL || !IsA(rsi, ReturnSetInfo))
4818  ereport(ERROR,
4819  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4820  errmsg("set-valued function called in context that cannot accept a set")));
4821  if (!(rsi->allowedModes & SFRM_Materialize))
4822  ereport(ERROR,
4823  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4824  errmsg("materialize mode required, but it is not allowed in this context")));
4825 
4826  /* OK, prepare tuplestore in per-query memory */
4828 
 /* a non-NULL tupstore is what routes split_text's output to the tuplestore */
4829  tstate.astate = NULL;
4830  tstate.tupdesc = CreateTupleDescCopy(rsi->expectedDesc);
4831  tstate.tupstore = tuplestore_begin_heap(true, false, work_mem);
4832 
4833  MemoryContextSwitchTo(old_cxt);
4834 
 /*
  * The boolean result is deliberately ignored: when split_text returns
  * false (NULL input string) it has stored nothing, so the tuplestore is
  * simply left empty.
  */
4835  (void) split_text(fcinfo, &tstate);
4836 
4837  tuplestore_donestoring(tstate.tupstore);
4838 
4840  rsi->setResult = tstate.tupstore;
4841  rsi->setDesc = tstate.tupdesc;
4842 
4843  return (Datum) 0;
4844 }
4845 
4846 /*
4847  * text_to_table_null
4848  * parse input string and return table of elements,
4849  * based on provided field separator and null string
4850  *
4851  * This is a separate entry point only to prevent the regression tests from
4852  * complaining about different argument sets for the same internal function.
 *
 * The call info is forwarded unchanged to text_to_table().
 *
 * NOTE(review): listing number 4855 is skipped, so the function-name line
 * appears to be missing from this copy -- confirm against upstream.
4853  */
4854 Datum
4856 {
4857  return text_to_table(fcinfo);
4858 }
4859 
4860 /*
4861  * Common code for text_to_array, text_to_array_null, text_to_table
4862  * and text_to_table_null functions.
4863  *
4864  * These are not strict so we have to test for null inputs explicitly.
4865  * Returns false if result is to be null, else returns true.
4866  *
4867  * Note that if the result is valid but empty (zero elements), we return
4868  * without changing *tstate --- caller must handle that case, too.
 *
 * Argument 0 is the input string, argument 1 the field separator (may be
 * NULL) and the optional argument 2 the null string.  Each extracted field
 * is passed to split_text_accum_result() along with tstate.
 *
 * NOTE(review): listing numbers 4871, 4906, 4933 and 4981 are skipped, so
 * the function-name/parameter line, a declaration (likely that of "state")
 * and one statement at the top of each loop appear to be missing from this
 * copy -- confirm against the upstream file.
4869  */
4870 static bool
4872 {
4873  text *inputstring;
4874  text *fldsep;
4875  text *null_string;
4876  Oid collation = PG_GET_COLLATION();
4877  int inputstring_len;
4878  int fldsep_len;
4879  char *start_ptr;
4880  text *result_text;
4881 
4882  /* when input string is NULL, then result is NULL too */
4883  if (PG_ARGISNULL(0))
4884  return false;
4885 
4886  inputstring = PG_GETARG_TEXT_PP(0);
4887 
4888  /* fldsep can be NULL */
4889  if (!PG_ARGISNULL(1))
4890  fldsep = PG_GETARG_TEXT_PP(1);
4891  else
4892  fldsep = NULL;
4893 
4894  /* null_string can be NULL or omitted */
4895  if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4896  null_string = PG_GETARG_TEXT_PP(2);
4897  else
4898  null_string = NULL;
4899 
4900  if (fldsep != NULL)
4901  {
4902  /*
4903  * Normal case with non-null fldsep. Use the text_position machinery
4904  * to search for occurrences of fldsep.
4905  */
4907 
4908  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4909  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4910 
4911  /* return empty set for empty input string */
4912  if (inputstring_len < 1)
4913  return true;
4914 
4915  /* empty field separator: return input string as a one-element set */
4916  if (fldsep_len < 1)
4917  {
4918  split_text_accum_result(tstate, inputstring,
4919  null_string, collation);
4920  return true;
4921  }
4922 
4923  text_position_setup(inputstring, fldsep, collation, &state);
4924 
4925  start_ptr = VARDATA_ANY(inputstring);
4926 
 /* one iteration per field; the loop exits after the final field is stored */
4927  for (;;)
4928  {
4929  bool found;
4930  char *end_ptr;
4931  int chunk_len;
4932 
4934 
4935  found = text_position_next(&state);
4936  if (!found)
4937  {
4938  /* fetch last field */
 /* runs from start_ptr to the end of the input datum */
4939  chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4940  end_ptr = NULL; /* not used, but some compilers complain */
4941  }
4942  else
4943  {
4944  /* fetch non-last field */
4945  end_ptr = text_position_get_match_ptr(&state);
4946  chunk_len = end_ptr - start_ptr;
4947  }
4948 
4949  /* build a temp text datum to pass to split_text_accum_result */
4950  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4951 
4952  /* stash away this field */
4953  split_text_accum_result(tstate, result_text,
4954  null_string, collation);
4955 
4956  pfree(result_text);
4957 
4958  if (!found)
4959  break;
4960 
 /* next field begins just past the separator that ended this one */
4961  start_ptr = end_ptr + fldsep_len;
4962  }
4963 
4964  text_position_cleanup(&state);
4965  }
4966  else
4967  {
4968  /*
4969  * When fldsep is NULL, each character in the input string becomes a
4970  * separate element in the result set. The separator is effectively
4971  * the space between characters.
4972  */
4973  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4974 
4975  start_ptr = VARDATA_ANY(inputstring);
4976 
 /* inputstring_len counts the bytes still to be consumed */
4977  while (inputstring_len > 0)
4978  {
 /* byte length of the (possibly multibyte) character at start_ptr */
4979  int chunk_len = pg_mblen(start_ptr);
4980 
4982 
4983  /* build a temp text datum to pass to split_text_accum_result */
4984  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4985 
4986  /* stash away this field */
4987  split_text_accum_result(tstate, result_text,
4988  null_string, collation);
4989 
4990  pfree(result_text);
4991 
4992  start_ptr += chunk_len;
4993  inputstring_len -= chunk_len;
4994  }
4995  }
4996 
4997  return true;
4998 }
4999 
5000 /*
5001  * Add text item to result set (table or array).
5002  *
5003  * This is also responsible for checking to see if the item matches
5004  * the null_string, in which case we should emit NULL instead.
 *
 * The destination is chosen by tstate->tupstore: when it is set the item
 * goes to the tuplestore as a one-column row, otherwise it is appended to
 * the array build state in tstate->astate.  A NULL null_string disables
 * the null-matching check.
 *
 * NOTE(review): listing numbers 5007, 5025 and 5036 are skipped, so the
 * function-name line (with the tstate parameter), the start of the
 * tuplestore call and the last accumArrayResult argument appear to be
 * missing from this copy -- confirm against the upstream file.
5005  */
5006 static void
5008  text *field_value,
5009  text *null_string,
5010  Oid collation)
5011 {
5012  bool is_null = false;
5013 
5014  if (null_string && text_isequal(field_value, null_string, collation))
5015  is_null = true;
5016 
5017  if (tstate->tupstore)
5018  {
 /* single-column row: the field value plus its null flag */
5019  Datum values[1];
5020  bool nulls[1];
5021 
5022  values[0] = PointerGetDatum(field_value);
5023  nulls[0] = is_null;
5024 
5026  tstate->tupdesc,
5027  values,
5028  nulls);
5029  }
5030  else
5031  {
5032  tstate->astate = accumArrayResult(tstate->astate,
5033  PointerGetDatum(field_value),
5034  is_null,
5035  TEXTOID,
5037  }
5038 }
5039 
5040 /*
5041  * array_to_text
5042  * concatenate Cstring representation of input array elements
5043  * using provided field separator
 *
 * Argument 1 is the separator, converted to a C string here.  The work is
 * done by array_to_text_internal(), called with a NULL null_string.
 *
 * NOTE(review): listing numbers 5046 and 5048 are skipped, so the
 * function-name line and the declaration of "v" appear to be missing from
 * this copy -- confirm against the upstream file.
5044  */
5045 Datum
5047 {
5049  char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5050 
5051  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
5052 }
5053 
5054 /*
5055  * array_to_text_null
5056  * concatenate Cstring representation of input array elements
5057  * using provided field separator and null string
5058  *
5059  * This version is not strict so we have to test for null inputs explicitly.
 *
 * Argument 0 is the array, argument 1 the separator and argument 2 the
 * string to print for null elements.  A NULL array or separator yields a
 * NULL result; a NULL third argument is forwarded as a null pointer.
 *
 * NOTE(review): listing number 5062 is skipped, so the function-name line
 * appears to be missing from this copy -- confirm against upstream.
5060  */
5061 Datum
5063 {
5064  ArrayType *v;
5065  char *fldsep;
5066  char *null_string;
5067 
5068  /* returns NULL when first or second parameter is NULL */
5069  if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
5070  PG_RETURN_NULL();
5071 
5072  v = PG_GETARG_ARRAYTYPE_P(0);
5073  fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5074 
5075  /* NULL null string is passed through as a null pointer */
5076  if (!PG_ARGISNULL(2))
5077  null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5078  else
5079  null_string = NULL;
5080 
5081  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5082 }
5083 
5084 /*
5085  * common code for array_to_text and array_to_text_null functions
 *
 * v is the array to print, fldsep the separator written between printed
 * items, and null_string the text written for null elements (null elements
 * are skipped entirely when it is NULL).  fcinfo is used to cache the
 * element type's output-function lookup in flinfo->fn_extra.  Returns a
 * newly built text value; an array with no elements gives an empty string.
 *
 * NOTE(review): listing numbers 5088 and 5099 are skipped, so the
 * function-name line (with the leading parameters) and a declaration
 * (likely that of "buf") appear to be missing from this copy -- confirm
 * against the upstream file.
5086  */
5087 static text *
5089  const char *fldsep, const char *null_string)
5090 {
5091  text *result;
5092  int nitems,
5093  *dims,
5094  ndims;
5095  Oid element_type;
5096  int typlen;
5097  bool typbyval;
5098  char typalign;
5100  bool printed = false;
5101  char *p;
5102  bits8 *bitmap;
5103  int bitmask;
5104  int i;
5105  ArrayMetaState *my_extra;
5106 
5107  ndims = ARR_NDIM(v);
5108  dims = ARR_DIMS(v);
5109  nitems = ArrayGetNItems(ndims, dims);
5110 
5111  /* if there are no elements, return an empty string */
5112  if (nitems == 0)
5113  return cstring_to_text_with_len("", 0);
5114 
5115  element_type = ARR_ELEMTYPE(v);
5116  initStringInfo(&buf);
5117 
5118  /*
5119  * We arrange to look up info about element type, including its output
5120  * conversion proc, only once per series of calls, assuming the element
5121  * type doesn't change underneath us.
5122  */
5123  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5124  if (my_extra == NULL)
5125  {
5126  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5127  sizeof(ArrayMetaState));
5128  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
 /* the bitwise complement can never equal element_type, forcing the lookup below */
5129  my_extra->element_type = ~element_type;
5130  }
5131 
5132  if (my_extra->element_type != element_type)
5133  {
5134  /*
5135  * Get info about element type, including its output conversion proc
5136  */
5137  get_type_io_data(element_type, IOFunc_output,
5138  &my_extra->typlen, &my_extra->typbyval,
5139  &my_extra->typalign, &my_extra->typdelim,
5140  &my_extra->typioparam, &my_extra->typiofunc);
5141  fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5142  fcinfo->flinfo->fn_mcxt);
5143  my_extra->element_type = element_type;
5144  }
5145  typlen = my_extra->typlen;
5146  typbyval = my_extra->typbyval;
5147  typalign = my_extra->typalign;
5148 
 /* p walks the element data; bitmap/bitmask walk the null bitmap, if any */
5149  p = ARR_DATA_PTR(v);
5150  bitmap = ARR_NULLBITMAP(v);
5151  bitmask = 1;
5152 
5153  for (i = 0; i < nitems; i++)
5154  {
5155  Datum itemvalue;
5156  char *value;
5157 
5158  /* Get source element, checking for NULL */
 /* a clear bit marks a null element; p is not advanced in that branch */
5159  if (bitmap && (*bitmap & bitmask) == 0)
5160  {
5161  /* if null_string is NULL, we just ignore null elements */
5162  if (null_string != NULL)
5163  {
5164  if (printed)
5165  appendStringInfo(&buf, "%s%s", fldsep, null_string);
5166  else
5167  appendStringInfoString(&buf, null_string);
5168  printed = true;
5169  }
5170  }
5171  else
5172  {
5173  itemvalue = fetch_att(p, typbyval, typlen);
5174 
5175  value = OutputFunctionCall(&my_extra->proc, itemvalue);
5176 
 /* the separator is written only between items, never before the first */
5177  if (printed)
5178  appendStringInfo(&buf, "%s%s", fldsep, value);
5179  else
5180  appendStringInfoString(&buf, value);
5181  printed = true;
5182 
 /* step past this element, then realign for the next one */
5183  p = att_addlength_pointer(p, typlen, p);
5184  p = (char *) att_align_nominal(p, typalign);
5185  }
5186 
5187  /* advance bitmap pointer if any */
5188  if (bitmap)
5189  {
5190  bitmask <<= 1;
 /* 0x100 means all eight bits of this byte are used up: move to the next byte */
5191  if (bitmask == 0x100)
5192  {
5193  bitmap++;
5194  bitmask = 1;
5195  }
5196  }
5197  }
5198 
5199  result = cstring_to_text_with_len(buf.data, buf.len);
5200  pfree(buf.data);
5201 
5202  return result;
5203 }
5204 
5205 #define HEXBASE 16
5206 /*
5207  * Convert an int32 to a string containing a base 16 (hex) representation of
5208  * the number.
 *
 * Digits are produced least-significant first and written backwards from
 * the end of a local buffer, using lower-case letters for 10..15.
 *
 * NOTE(review): listing numbers 5211, 5213 and 5227 are skipped, so the
 * function-name line, the declaration of "value" and the return statement
 * appear to be missing from this copy -- confirm against upstream.
5209  */
5210 Datum
5212 {
5214  char *ptr;
5215  const char *digits = "0123456789abcdef";
5216  char buf[32]; /* bigger than needed, but reasonable */
5217 
 /* start at the last byte, which holds the string terminator */
5218  ptr = buf + sizeof(buf) - 1;
5219  *ptr = '\0';
5220 
 /* do/while so that a zero value still emits one digit */
5221  do
5222  {
5223  *--ptr = digits[value % HEXBASE];
5224  value /= HEXBASE;
5225  } while (ptr > buf && value);
5226 
5228 }
5229 
5230 /*
5231  * Convert an int64 to a string containing a base 16 (hex) representation of
5232  * the number.
 *
 * The argument is reinterpreted as uint64 before conversion.  Digits are
 * produced least-significant first and written backwards from the end of a
 * local buffer, using lower-case letters for 10..15.
 *
 * NOTE(review): listing numbers 5235 and 5251 are skipped, so the
 * function-name line and the return statement appear to be missing from
 * this copy -- confirm against upstream.
5233  */
5234 Datum
5236 {
5237  uint64 value = (uint64) PG_GETARG_INT64(0);
5238  char *ptr;
5239  const char *digits = "0123456789abcdef";
5240  char buf[32]; /* bigger than needed, but reasonable */
5241 
 /* start at the last byte, which holds the string terminator */
5242  ptr = buf + sizeof(buf) - 1;
5243  *ptr = '\0';
5244 
 /* do/while so that a zero value still emits one digit */
5245  do
5246  {
5247  *--ptr = digits[value % HEXBASE];
5248  value /= HEXBASE;
5249  } while (ptr > buf && value);
5250 
5252 }
5253 
5254 /*
5255  * Return the size of a datum, possibly compressed
5256  *
5257  * Works on any data type
 *
 * The argument type's typlen is looked up on the first call and cached in
 * flinfo->fn_extra.  The int32 result is then toast_datum_size() for
 * typlen -1, strlen + 1 for typlen -2, and typlen itself otherwise.
 *
 * NOTE(review): listing numbers 5260 and 5262 are skipped, so the
 * function-name line and the declaration of "value" appear to be missing
 * from this copy -- confirm against upstream.
5258  */
5259 Datum
5261 {
5263  int32 result;
5264  int typlen;
5265 
5266  /* On first call, get the input type's typlen, and save at *fn_extra */
5267  if (fcinfo->flinfo->fn_extra == NULL)
5268  {
5269  /* Lookup the datatype of the supplied argument */
5270  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5271 
5272  typlen = get_typlen(argtypeid);
5273  if (typlen == 0) /* should not happen */
5274  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5275 
 /* allocated in fn_mcxt, the same context used for other fn_extra caches in this file */
5276  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5277  sizeof(int));
5278  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5279  }
5280  else
5281  typlen = *((int *) fcinfo->flinfo->fn_extra);
5282 
5283  if (typlen == -1)
5284  {
5285  /* varlena type, possibly toasted */
5286  result = toast_datum_size(value);
5287  }
5288  else if (typlen == -2)
5289  {
5290  /* cstring */
 /* the +1 counts the terminating NUL byte */
5291  result = strlen(DatumGetCString(value)) + 1;
5292  }
5293  else
5294  {
5295  /* ordinary fixed-width type */
5296  result = typlen;
5297  }
5298 
5299  PG_RETURN_INT32(result);
5300 }
5301 
5302 /*
5303  * string_agg - Concatenates values and returns string.
5304  *
5305  * Syntax: string_agg(value text, delimiter text) RETURNS text
5306  *
5307  * Note: Any NULL values are ignored. The first-call delimiter isn't
5308  * actually used at all, and on subsequent calls the delimiter precedes
5309  * the associated value.
5310  */
5311 
5312 /* subroutine to initialize state */
 /*
  * Returns a fresh, empty StringInfo allocated in the aggregate memory
  * context reported by AggCheckCallContext(); raises an error when not
  * called as part of an aggregate.
  *
  * NOTE(review): listing number 5314 is skipped, so the function-name line
  * appears to be missing from this copy -- confirm against upstream.
  */
5313 static StringInfo
5315 {
5316  StringInfo state;
5317  MemoryContext aggcontext;
5318  MemoryContext oldcontext;
5319 
5320  if (!AggCheckCallContext(fcinfo, &aggcontext))
5321  {
5322  /* cannot be called directly because of internal-type argument */
5323  elog(ERROR, "string_agg_transfn called in non-aggregate context");
5324  }
5325 
5326  /*
5327  * Create state in aggregate context. It'll stay there across subsequent
5328  * calls.
5329  */
5330  oldcontext = MemoryContextSwitchTo(aggcontext);
5331  state = makeStringInfo();
5332  MemoryContextSwitchTo(oldcontext);
5333 
5334  return state;
5335 }
5336 
 /*
  * Transition step: argument 0 is the running state (NULL until the first
  * non-null value arrives), argument 1 the value and argument 2 the
  * delimiter.  A NULL value leaves the state untouched; a NULL delimiter
  * is simply not appended.
  *
  * NOTE(review): listing number 5338 is skipped, so the function-name line
  * appears to be missing from this copy -- confirm against upstream.
  */
5337 Datum
5339 {
5340  StringInfo state;
5341 
5342  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5343 
5344  /* Append the value unless null. */
5345  if (!PG_ARGISNULL(1))
5346  {
5347  /* On the first time through, we ignore the delimiter. */
5348  if (state == NULL)
5349  state = makeStringAggState(fcinfo);
5350  else if (!PG_ARGISNULL(2))
5351  appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
5352 
5353  appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
5354  }
5355 
5356  /*
5357  * The transition type for string_agg() is declared to be "internal",
5358  * which is a pass-by-value type the same size as a pointer.
5359  */
5360  PG_RETURN_POINTER(state);
5361 }
5362 
 /*
  * Final step: argument 0 is the accumulated state.  A NULL state (no
  * value was ever accumulated) produces a SQL NULL result.
  *
  * NOTE(review): listing numbers 5364 and 5374 are skipped, so the
  * function-name line and the statement returning the non-NULL result
  * appear to be missing from this copy -- confirm against upstream.
  */
5363 Datum
5365 {
5366  StringInfo state;
5367 
5368  /* cannot be called directly because of internal-type argument */
5369  Assert(AggCheckCallContext(fcinfo, NULL));
5370 
5371  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5372 
5373  if (state != NULL)
5375  else
5376  PG_RETURN_NULL();
5377 }
5378 
5379 /*
5380  * Prepare cache with fmgr info for the output functions of the datatypes of
5381  * the arguments of a concat-like function, beginning with argument "argidx".
5382  * (Arguments before that will have corresponding slots in the resulting
5383  * FmgrInfo array, but we don't fill those slots.)
 *
 * The array is stored in flinfo->fn_extra and also returned.  An error is
 * raised if the type of any covered argument cannot be determined.
 *
 * NOTE(review): listing number 5386 is skipped, so the function-name line
 * (with the fcinfo and argidx parameters) appears to be missing from this
 * copy -- confirm against upstream.
5384  */
5385 static FmgrInfo *
5387 {
5388  FmgrInfo *foutcache;
5389  int i;
5390 
5391  /* We keep the info in fn_mcxt so it survives across calls */
 /* sized for every argument so the array can be indexed by argument number */
5392  foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5393  PG_NARGS() * sizeof(FmgrInfo));
5394 
5395  for (i = argidx; i < PG_NARGS(); i++)
5396  {
5397  Oid valtype;
5398  Oid typOutput;
5399  bool typIsVarlena;
5400 
5401  valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5402  if (!OidIsValid(valtype))
5403  elog(ERROR, "could not determine data type of concat() input");
5404 
5405  getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5406  fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5407  }
5408 
5409  fcinfo->flinfo->fn_extra = foutcache;
5410 
5411  return foutcache;
5412 }
5413 
5414 /*
5415  * Implementation of both concat() and concat_ws().
5416  *
5417  * sepstr is the separator string to place between values.
5418  * argidx identifies the first argument to concatenate (counting from zero);
5419  * note that this must be constant across any one series of calls.
5420  *
5421  * Returns NULL if result should be NULL, else text value.
 *
 * A call marked VARIADIC is handed to array_to_text_internal(); otherwise
 * each non-null argument from argidx onward is printed with its type's
 * output function, using the FmgrInfo cache kept in flinfo->fn_extra.
 *
 * NOTE(review): listing numbers 5428, 5456, 5480 and 5489 are skipped, so
 * a declaration (likely that of "str"), the assertion announced by the
 * comment in the VARIADIC branch, the fetch of "value" and the start of
 * the append call appear to be missing from this copy -- confirm against
 * the upstream file.
5422  */
5423 static text *
5424 concat_internal(const char *sepstr, int argidx,
5425  FunctionCallInfo fcinfo)
5426 {
5427  text *result;
5429  FmgrInfo *foutcache;
5430  bool first_arg = true;
5431  int i;
5432 
5433  /*
5434  * concat(VARIADIC some-array) is essentially equivalent to
5435  * array_to_text(), ie concat the array elements with the given separator.
5436  * So we just pass the case off to that code.
5437  */
5438  if (get_fn_expr_variadic(fcinfo->flinfo))
5439  {
5440  ArrayType *arr;
5441 
5442  /* Should have just the one argument */
5443  Assert(argidx == PG_NARGS() - 1);
5444 
5445  /* concat(VARIADIC NULL) is defined as NULL */
5446  if (PG_ARGISNULL(argidx))
5447  return NULL;
5448 
5449  /*
5450  * Non-null argument had better be an array. We assume that any call
5451  * context that could let get_fn_expr_variadic return true will have
5452  * checked that a VARIADIC-labeled parameter actually is an array. So
5453  * it should be okay to just Assert that it's an array rather than
5454  * doing a full-fledged error check.
5455  */
5457 
5458  /* OK, safe to fetch the array value */
5459  arr = PG_GETARG_ARRAYTYPE_P(argidx);
5460 
5461  /*
5462  * And serialize the array. We tell array_to_text to ignore null
5463  * elements, which matches the behavior of the loop below.
5464  */
5465  return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5466  }
5467 
5468  /* Normal case without explicit VARIADIC marker */
5469  initStringInfo(&str);
5470 
5471  /* Get output function info, building it if first time through */
5472  foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5473  if (foutcache == NULL)
5474  foutcache = build_concat_foutcache(fcinfo, argidx);
5475 
5476  for (i = argidx; i < PG_NARGS(); i++)
5477  {
 /* NULL arguments contribute neither a value nor a separator */
5478  if (!PG_ARGISNULL(i))
5479  {
5481 
5482  /* add separator if appropriate */
5483  if (first_arg)
5484  first_arg = false;
5485  else
5486  appendStringInfoString(&str, sepstr);
5487 
5488  /* call the appropriate type output function, append the result */
5490  OutputFunctionCall(&foutcache[i], value));
5491  }
5492  }
5493 
5494  result = cstring_to_text_with_len(str.data, str.len);
5495  pfree(str.data);
5496 
5497  return result;
5498 }
5499 
5500 /*
5501  * Concatenate all arguments. NULL arguments are ignored.
 *
 * Calls concat_internal() with an empty separator, starting at argument 0;
 * a NULL pointer from it becomes a SQL NULL result.
 *
 * NOTE(review): listing number 5504 is skipped, so the function-name line
 * appears to be missing from this copy -- confirm against upstream.
5502  */
5503 Datum
5505 {
5506  text *result;
5507 
5508  result = concat_internal("", 0, fcinfo);
5509  if (result == NULL)
5510  PG_RETURN_NULL();
5511  PG_RETURN_TEXT_P(result);
5512 }
5513 
5514 /*
5515  * Concatenate all but first argument value with separators. The first
5516  * parameter is used as the separator. NULL arguments are ignored.
 *
 * A NULL separator gives a NULL result.  Otherwise concat_internal() is
 * called starting at argument 1, and a NULL pointer from it becomes a SQL
 * NULL result.
 *
 * NOTE(review): listing numbers 5519 and 5527 are skipped, so the
 * function-name line and the assignment of "sep" appear to be missing from
 * this copy -- confirm against upstream.
5517  */
5518 Datum
5520 {
5521  char *sep;
5522  text *result;
5523 
5524  /* return NULL when separator is NULL */
5525  if (PG_ARGISNULL(0))
5526  PG_RETURN_NULL();
5528 
5529  result = concat_internal(sep, 1, fcinfo);
5530  if (result == NULL)
5531  PG_RETURN_NULL();
5532  PG_RETURN_TEXT_P(result);
5533 }
5534 
5535 /*
5536  * Return first n characters in the string. When n is negative,
5537  * return all but last |n| characters.
 *
 * Argument 0 is the string and argument 1 is n.  Lengths are counted in
 * characters, not bytes.
 *
 * NOTE(review): listing numbers 5540, 5553 and 5556 are skipped, so the
 * function-name line and the return statements of both branches appear to
 * be missing from this copy -- confirm against upstream.
5538  */
5539 Datum
5541 {
5542  int n = PG_GETARG_INT32(1);
5543 
5544  if (n < 0)
5545  {
5546  text *str = PG_GETARG_TEXT_PP(0);
5547  const char *p = VARDATA_ANY(str);
5548  int len = VARSIZE_ANY_EXHDR(str);
5549  int rlen;
5550 
 /* n becomes the number of leading characters to keep (total + negative n) */
5551  n = pg_mbstrlen_with_len(p, len) + n;
 /* rlen is the byte length covering that many characters */
5552  rlen = pg_mbcharcliplen(p, len, n);
5554  }
5555  else
5557 }
5558 
5559 /*
5560  * Return last n characters in the string. When n is negative,
5561  * return all but first |n| characters.
 *
 * Argument 0 is the string and argument 1 is n.  Lengths are counted in
 * characters, not bytes.
 *
 * NOTE(review): listing number 5564 is skipped, so the function-name line
 * appears to be missing from this copy -- confirm against upstream.
5562  */
5563 Datum
5565 {
5566  text *str = PG_GETARG_TEXT_PP(0);
5567  const char *p = VARDATA_ANY(str);
5568  int len = VARSIZE_ANY_EXHDR(str);
5569  int n = PG_GETARG_INT32(1);
5570  int off;
5571 
 /*
  * In both branches n ends up as the number of leading characters to
  * drop.
  *
  * NOTE(review): "-n" cannot be represented when n is the minimum int32
  * value -- confirm whether that input needs explicit handling.
  */
5572  if (n < 0)
5573  n = -n;
5574  else
5575  n = pg_mbstrlen_with_len(p, len) - n;
 /* off is the byte offset just past the dropped characters */
5576  off = pg_mbcharcliplen(p, len, n);
5577 
5578  PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5579 }
5580 
5581 /*
5582  * Return reversed string
 *
 * The result has the same byte length as the input.  It is filled from the
 * end towards the start while the input is read front to back.  In the
 * multibyte branch each character's bytes are copied as a unit, so the
 * bytes within a character keep their order.
 *
 * NOTE(review): listing numbers 5585 and 5598 are skipped, so the
 * function-name line and the "if" condition choosing between the multibyte
 * and single-byte branches appear to be missing from this copy -- confirm
 * against upstream.
5583  */
5584 Datum
5586 {
5587  text *str = PG_GETARG_TEXT_PP(0);
5588  const char *p = VARDATA_ANY(str);
5589  int len = VARSIZE_ANY_EXHDR(str);
5590  const char *endp = p + len;
5591  text *result;
5592  char *dst;
5593 
5594  result = palloc(len + VARHDRSZ);
 /* dst starts one past the last data byte and is decremented before each write */
5595  dst = (char *) VARDATA(result) + len;
5596  SET_VARSIZE(result, len + VARHDRSZ);
5597 
5599  {
5600  /* multibyte version */
5601  while (p < endp)
5602  {
5603  int sz;
5604 
 /* sz = byte length of the character at p */
5605  sz = pg_mblen(p);
5606  dst -= sz;
5607  memcpy(dst, p, sz);
5608  p += sz;
5609  }
5610  }
5611  else
5612  {
5613  /* single byte version */
5614  while (p < endp)
5615  *(--dst) = *p++;
5616  }
5617 
5618  PG_RETURN_TEXT_P(result);
5619 }
5620 
5621 
5622 /*
5623  * Support macros for text_format()
5624  */
5625 #define TEXT_FORMAT_FLAG_MINUS 0x0001 /* is minus flag present? */
5626 
 /*
  * ADVANCE_PARSE_POINTER pre-increments ptr and raises
  * ERRCODE_INVALID_PARAMETER_VALUE ("unterminated format() type specifier")
  * if that moves it to or past end_ptr.  On normal completion ptr therefore
  * still points at a valid character.  ptr is modified in place, so it must
  * be an lvalue.
  */
5627 #define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
5628  do { \
5629  if (++(ptr) >= (end_ptr)) \
5630  ereport(ERROR, \
5631  (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
5632  errmsg("unterminated format() type specifier"), \
5633  errhint("For a single \"%%\" use \"%%%%\"."))); \
5634  } while (0)
5635 
5636 /*
5637  * Returns a formatted string
5638  */
5639 Datum
5641 {
5642  text *fmt;
5644  const char *cp;
5645  const char *start_ptr;
5646  const char *end_ptr;
5647  text *result;
5648  int arg;
5649  bool funcvariadic;
5650  int nargs;
5651  Datum *elements = NULL;
5652  bool *nulls = NULL;
5653  Oid element_type = InvalidOid;
5654  Oid prev_type = InvalidOid;
5655  Oid prev_width_type = InvalidOid;
5656  FmgrInfo typoutputfinfo;
5657  FmgrInfo typoutputinfo_width;
5658 
5659  /* When format string is null, immediately return null */
5660  if (PG_ARGISNULL(0))
5661  PG_RETURN_NULL();
5662 
5663  /* If argument is marked VARIADIC, expand array into elements */
5664  if (get_fn_expr_variadic(fcinfo->flinfo))
5665  {
5666  ArrayType *arr;
5667  int16 elmlen;
5668  bool elmbyval;
5669  char elmalign;
5670  int nitems;
5671 
5672  /* Should have just the one argument */
5673  Assert(PG_NARGS() == 2);
5674 
5675  /* If argument is NULL, we treat it as zero-length array */
5676  if (PG_ARGISNULL(1))
5677  nitems = 0;
5678  else
5679  {
5680  /*
5681  * Non-null argument had better be an array. We assume that any
5682  * call context that could let get_fn_expr_variadic return true
5683  * will have checked that a VARIADIC-labeled parameter actually is
5684  * an array. So it should be okay to just Assert that it's an
5685  * array rather than doing a full-fledged error check.
5686  */
5688 
5689  /* OK, safe to fetch the array value */
5690  arr = PG_GETARG_ARRAYTYPE_P(1);
5691 
5692  /* Get info about array element type */
5693  element_type = ARR_ELEMTYPE(arr);
5694  get_typlenbyvalalign(element_type,
5695  &elmlen, &elmbyval, &elmalign);
5696 
5697  /* Extract all array elements */
5698  deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5699  &elements, &nulls, &nitems);
5700  }
5701 
5702  nargs = nitems + 1;
5703  funcvariadic = true;
5704  }
5705  else
5706  {
5707  /* Non-variadic case, we'll process the arguments individually */
5708  nargs = PG_NARGS();
5709  funcvariadic = false;
5710  }
5711 
5712  /* Setup for main loop. */
5713  fmt = PG_GETARG_TEXT_PP(0);
5714  start_ptr = VARDATA_ANY(fmt);
5715  end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5716  initStringInfo(&str);
5717  arg = 1; /* next argument position to print */
5718 
5719  /* Scan format string, looking for conversion specifiers. */
5720  for (cp = start_ptr; cp < end_ptr; cp++)
5721  {
5722  int argpos;
5723  int widthpos;
5724  int flags;
5725  int width;
5726  Datum value;
5727  bool isNull;
5728  Oid typid;
5729 
5730  /*
5731  * If it's not the start of a conversion specifier, just copy it to
5732  * the output buffer.
5733  */
5734  if (*cp != '%')
5735  {
5736  appendStringInfoCharMacro(&str, *cp);
5737  continue;
5738  }
5739 
5740  ADVANCE_PARSE_POINTER(cp, end_ptr);
5741 
5742  /* Easy case: %% outputs a single % */
5743  if (*cp == '%')
5744  {
5745  appendStringInfoCharMacro(&str, *cp);
5746  continue;
5747  }
5748 
5749  /* Parse the optional portions of the format specifier */
5750  cp = text_format_parse_format(cp, end_ptr,
5751  &argpos, &widthpos,
5752  &flags, &width);
5753 
5754  /*
5755  * Next we should see the main conversion specifier. Whether or not
5756  * an argument position was present, it's known that at least one
5757  * character remains in the string at this point. Experience suggests
5758  * that it's worth checking that that character is one of the expected
5759  * ones before we try to fetch arguments, so as to produce the least
5760  * confusing response to a mis-formatted specifier.
5761  */
5762  if (strchr("sIL", *cp) == NULL)
5763  ereport(ERROR,
5764  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5765  errmsg("unrecognized format() type specifier \"%.*s\"",
5766  pg_mblen(cp), cp),
5767  errhint("For a single \"%%\" use \"%%%%\".")));
5768 
5769  /* If indirect width was specified, get its value */
5770  if (widthpos >= 0)
5771  {
5772  /* Collect the specified or next argument position */
5773  if (widthpos > 0)
5774  arg = widthpos;
5775  if (arg >= nargs)
5776  ereport(ERROR,
5777  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5778  errmsg("too few arguments for format()")));
5779 
5780  /* Get the value and type of the selected argument */
5781  if (!funcvariadic)
5782  {
5783  value = PG_GETARG_DATUM(arg);
5784  isNull = PG_ARGISNULL(arg);
5785  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5786  }
5787  else
5788  {
5789  value = elements[arg - 1];
5790  isNull = nulls[arg - 1];
5791  typid = element_type;
5792  }
5793  if (!OidIsValid(typid))
5794  elog(ERROR, "could not determine data type of format() input");
5795 
5796  arg++;
5797 
5798  /* We can treat NULL width the same as zero */
5799  if (isNull)
5800  width = 0;
5801  else if (typid == INT4OID)
5802  width = DatumGetInt32(value);
5803  else if (typid == INT2OID)
5804  width = DatumGetInt16(value);
5805  else
5806  {
5807  /* For less-usual datatypes, convert to text then to int */
5808  char *str;
5809 
5810  if (typid != prev_width_type)
5811  {
5812  Oid typoutputfunc;
5813  bool typIsVarlena;
5814 
5815  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5816  fmgr_info(typoutputfunc, &typoutputinfo_width);
5817  prev_width_type = typid;
5818  }
5819 
5820  str = OutputFunctionCall(&typoutputinfo_width, value);
5821 
5822  /* pg_strtoint32 will complain about bad data or overflow */
5823  width = pg_strtoint32(str);
5824 
5825  pfree(str);
5826  }
5827  }
5828 
5829  /* Collect the specified or next argument position */
5830  if (argpos > 0)
5831  arg = argpos;
5832  if (arg >= nargs)
5833  ereport(ERROR,
5834  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5835  errmsg("too few arguments for format()")));
5836 
5837  /* Get the value and type of the selected argument */
5838  if (!funcvariadic)
5839  {
5840  value = PG_GETARG_DATUM(arg);
5841  isNull = PG_ARGISNULL(arg);
5842  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5843  }
5844  else
5845  {
5846  value = elements[arg - 1];
5847  isNull = nulls[arg - 1];
5848  typid = element_type;
5849  }
5850  if (!OidIsValid(typid))
5851  elog(ERROR, "could not determine data type of format() input");
5852 
5853  arg++;
5854 
5855  /*
5856  * Get the appropriate typOutput function, reusing previous one if
5857  * same type as previous argument. That's particularly useful in the
5858  * variadic-array case, but often saves work even for ordinary calls.
5859  */
5860  if (typid != prev_type)
5861  {
5862  Oid typoutputfunc;
5863  bool typIsVarlena;
5864 
5865  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5866  fmgr_info(typoutputfunc, &typoutputfinfo);
5867  prev_type = typid;
5868  }
5869 
5870  /*
5871  * And now we can format the value.
5872  */
5873  switch (*cp)
5874  {
5875  case 's':
5876  case 'I':
5877  case 'L':
5878  text_format_string_conversion(&str, *cp, &typoutputfinfo,
5879  value, isNull,
5880  flags, width);
5881  break;
5882  default:
5883  /* should not get here, because of previous check */
5884  ereport(ERROR,
5885  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5886  errmsg("unrecognized format() type specifier \"%.*s\"",
5887  pg_mblen(cp), cp),
5888  errhint("For a single \"%%\" use \"%%%%\".")));
5889  break;
5890  }
5891  }
5892 
5893  /* Don't need deconstruct_array results anymore. */
5894  if (elements != NULL)
5895  pfree(elements);
5896  if (nulls != NULL)
5897  pfree(nulls);
5898 
5899  /* Generate results. */
5900  result = cstring_to_text_with_len(str.data, str.len);
5901  pfree(str.data);
5902 
5903  PG_RETURN_TEXT_P(result);
5904 }
5905 
5906 /*
5907  * Parse contiguous digits as a decimal number.
5908  *
5909  * Returns true if some digits could be parsed.
5910  * The value is returned into *value, and *ptr is advanced to the next
5911  * character to be parsed.
5912  *
5913  * Note parsing invariant: at least one character is known available before
5914  * string end (end_ptr) at entry, and this is still true at exit.
5915  */
5916 static bool
5917 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5918 {
5919  bool found = false;
5920  const char *cp = *ptr;
5921  int val = 0;
5922 
5923  while (*cp >= '0' && *cp <= '9')
5924  {
5925  int8 digit = (*cp - '0');
5926 
5927  if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5928  unlikely(pg_add_s32_overflow(val, digit, &val)))
5929  ereport(ERROR,
5930  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5931  errmsg("number is out of range")));
5932  ADVANCE_PARSE_POINTER(cp, end_ptr);
5933  found = true;
5934  }
5935 
5936  *ptr = cp;
5937  *value = val;
5938 
5939  return found;
5940 }
5941 
5942 /*
5943  * Parse a format specifier (generally following the SUS printf spec).
5944  *
5945  * We have already advanced over the initial '%', and we are looking for
5946  * [argpos][flags][width]type (but the type character is not consumed here).
5947  *
5948  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5949  * Output parameters:
5950  * argpos: argument position for value to be printed. -1 means unspecified.
5951  * widthpos: argument position for width. Zero means the argument position
5952  * was unspecified (ie, take the next arg) and -1 means no width
5953  * argument (width was omitted or specified as a constant).
5954  * flags: bitmask of flags.
5955  * width: directly-specified width value. Zero means the width was omitted
5956  * (note it's not necessary to distinguish this case from an explicit
5957  * zero width value).
5958  *
5959  * The function result is the next character position to be parsed, ie, the
5960  * location where the type character is/should be.
5961  *
5962  * Note parsing invariant: at least one character is known available before
5963  * string end (end_ptr) at entry, and this is still true at exit.
5964  */
5965 static const char *
5966 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5967  int *argpos, int *widthpos,
5968  int *flags, int *width)
5969 {
5970  const char *cp = start_ptr;
5971  int n;
5972 
5973  /* set defaults for output parameters */
5974  *argpos = -1;
5975  *widthpos = -1;
5976  *flags = 0;
5977  *width = 0;
5978 
5979  /* try to identify first number */
5980  if (text_format_parse_digits(&cp, end_ptr, &n))
5981  {
5982  if (*cp != '$')
5983  {
5984  /* Must be just a width and a type, so we're done */
5985  *width = n;
5986  return cp;
5987  }
5988  /* The number was argument position */
5989  *argpos = n;
5990  /* Explicit 0 for argument index is immediately refused */
5991  if (n == 0)
5992  ereport(ERROR,
5993  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5994  errmsg("format specifies argument 0, but arguments are numbered from 1")));
5995  ADVANCE_PARSE_POINTER(cp, end_ptr);
5996  }
5997 
5998  /* Handle flags (only minus is supported now) */
5999  while (*cp == '-')
6000  {
6001  *flags |= TEXT_FORMAT_FLAG_MINUS;
6002  ADVANCE_PARSE_POINTER(cp, end_ptr);
6003  }
6004 
6005  if (*cp == '*')
6006  {
6007  /* Handle indirect width */
6008  ADVANCE_PARSE_POINTER(cp, end_ptr);
6009  if (text_format_parse_digits(&cp, end_ptr, &n))
6010  {
6011  /* number in this position must be closed by $ */
6012  if (*cp != '$')
6013  ereport(ERROR,
6014  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6015  errmsg("width argument position must be ended by \"$\"")));
6016  /* The number was width argument position */
6017  *widthpos = n;
6018  /* Explicit 0 for argument index is immediately refused */
6019  if (n == 0)
6020  ereport(ERROR,
6021  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6022  errmsg("format specifies argument 0, but arguments are numbered from 1")));
6023  ADVANCE_PARSE_POINTER(cp, end_ptr);
6024  }
6025  else
6026  *widthpos = 0; /* width's argument position is unspecified */
6027  }
6028  else
6029  {
6030  /* Check for direct width specification */
6031  if (text_format_parse_digits(&cp, end_ptr, &n))
6032  *width = n;
6033  }
6034 
6035  /* cp should now be pointing at type character */
6036  return cp;
6037 }
6038 
6039 /*
6040  * Format a %s, %I, or %L conversion
6041  */
6042 static void
6044  FmgrInfo *typOutputInfo,
6045  Datum value, bool isNull,
6046  int flags, int width)
6047 {
6048  char *str;
6049 
6050  /* Handle NULL arguments before trying to stringify the value. */
6051  if (isNull)
6052  {
6053  if (conversion == 's')
6054  text_format_append_string(buf, "", flags, width);
6055  else if (conversion == 'L')
6056  text_format_append_string(buf, "NULL", flags, width);
6057  else if (conversion == 'I')
6058  ereport(ERROR,
6059  (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6060  errmsg("null values cannot be formatted as an SQL identifier")));
6061  return;
6062  }
6063 
6064  /* Stringify. */
6065  str = OutputFunctionCall(typOutputInfo, value);
6066 
6067  /* Escape. */
6068  if (conversion == 'I')
6069  {
6070  /* quote_identifier may or may not allocate a new string. */
6071  text_format_append_string(buf, quote_identifier(str), flags, width);
6072  }
6073  else if (conversion == 'L')
6074  {
6075  char *qstr = quote_literal_cstr(str);
6076 
6077  text_format_append_string(buf, qstr, flags, width);
6078  /* quote_literal_cstr() always allocates a new string */
6079  pfree(qstr);
6080  }
6081  else
6082  text_format_append_string(buf, str, flags, width);
6083 
6084  /* Cleanup. */
6085  pfree(str);
6086 }
6087 
6088 /*
6089  * Append str to buf, padding as directed by flags/width
6090  */
6091 static void
6093  int flags, int width)
6094 {
6095  bool align_to_left = false;
6096  int len;
6097 
6098  /* fast path for typical easy case */
6099  if (width == 0)
6100  {
6101  appendStringInfoString(buf, str);
6102  return;
6103  }
6104 
6105  if (width < 0)
6106  {
6107  /* Negative width: implicit '-' flag, then take absolute value */
6108  align_to_left = true;
6109  /* -INT_MIN is undefined */
6110  if (width <= INT_MIN)
6111  ereport(ERROR,
6112  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6113  errmsg("number is out of range")));
6114  width = -width;
6115  }
6116  else if (flags & TEXT_FORMAT_FLAG_MINUS)
6117  align_to_left = true;
6118 
6119  len = pg_mbstrlen(str);
6120  if (align_to_left)
6121  {
6122  /* left justify */
6123  appendStringInfoString(buf, str);
6124  if (len < width)
6125  appendStringInfoSpaces(buf, width - len);
6126  }
6127  else
6128  {
6129  /* right justify */
6130  if (len < width)
6131  appendStringInfoSpaces(buf, width - len);
6132  appendStringInfoString(buf, str);
6133  }
6134 }
6135 
6136 /*
6137  * text_format_nv - nonvariadic wrapper for text_format function.
6138  *
6139  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6140  * which checks that all built-in functions that share the implementing C
6141  * function take the same number of arguments.
6142  */
6143 Datum
6145 {
6146  return text_format(fcinfo);
6147 }
6148 
/*
 * Helper function for Levenshtein distance functions.  Faster than memcmp(),
 * for this use case.
 *
 * Compares the first 'len' bytes of s1 and s2, walking from the last byte
 * back to the first.  Returns true if all of them match; a 'len' of zero
 * or less compares nothing and yields true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			idx;

	for (idx = len - 1; idx >= 0; idx--)
	{
		if (s1[idx] != s2[idx])
			return false;
	}

	return true;
}
6164 
6165 /* Expand each Levenshtein distance variant */
6166 #include "levenshtein.c"
6167 #define LEVENSHTEIN_LESS_EQUAL
6168 #include "levenshtein.c"
6169 
6170 
6171 /*
6172  * Unicode support
6173  */
6174 
6176 unicode_norm_form_from_string(const char *formstr)
6177 {
6178  UnicodeNormalizationForm form = -1;
6179 
6180  /*
6181  * Might as well check this while we're here.
6182  */
6183  if (GetDatabaseEncoding() != PG_UTF8)
6184  ereport(ERROR,
6185  (errcode(ERRCODE_SYNTAX_ERROR),
6186  errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6187 
6188  if (pg_strcasecmp(formstr, "NFC") == 0)
6189  form = UNICODE_NFC;
6190  else if (pg_strcasecmp(formstr, "NFD") == 0)
6191  form = UNICODE_NFD;
6192  else if (pg_strcasecmp(formstr, "NFKC") == 0)
6193  form = UNICODE_NFKC;
6194  else if (pg_strcasecmp(formstr, "NFKD") == 0)
6195  form = UNICODE_NFKD;
6196  else
6197  ereport(ERROR,
6198  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6199  errmsg("invalid normalization form: %s", formstr)));
6200 
6201  return form;
6202 }
6203 
6204 Datum
6206 {
6207  text *input = PG_GETARG_TEXT_PP(0);
6208  char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6210  int size;
6211  pg_wchar *input_chars;
6212  pg_wchar *output_chars;
6213  unsigned char *p;
6214  text *result;
6215  int i;
6216 
6217  form = unicode_norm_form_from_string(formstr);
6218 
6219  /* convert to pg_wchar */
6220  size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6221  input_chars = palloc((size + 1) * sizeof(pg_wchar));
6222  p = (unsigned char *) VARDATA_ANY(input);
6223  for (i = 0; i < size; i++)
6224  {
6225  input_chars[i] = utf8_to_unicode(p);
6226  p += pg_utf_mblen(p);
6227  }
6228  input_chars[i] = (pg_wchar) '\0';
6229  Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6230 
6231  /* action */
6232  output_chars = unicode_normalize(form, input_chars);
6233 
6234  /* convert back to UTF-8 string */
6235  size = 0;
6236  for (pg_wchar *wp = output_chars; *wp; wp++)
6237  {
6238  unsigned char buf[4];
6239 
6240  unicode_to_utf8(*wp, buf);
6241  size += pg_utf_mblen(buf);
6242  }
6243 
6244  result = palloc(size + VARHDRSZ);
6245  SET_VARSIZE(result, size + VARHDRSZ);
6246 
6247  p = (unsigned char *) VARDATA_ANY(result);