PostgreSQL Source Code  git master
varlena.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  * Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/detoast.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "common/hashfn.h"
25 #include "common/int.h"
26 #include "common/unicode_norm.h"
27 #include "lib/hyperloglog.h"
28 #include "libpq/pqformat.h"
29 #include "miscadmin.h"
30 #include "nodes/execnodes.h"
31 #include "parser/scansup.h"
32 #include "port/pg_bswap.h"
33 #include "regex/regex.h"
34 #include "utils/builtins.h"
35 #include "utils/bytea.h"
36 #include "utils/lsyscache.h"
37 #include "utils/memutils.h"
38 #include "utils/pg_locale.h"
39 #include "utils/sortsupport.h"
40 #include "utils/varlena.h"
41 
42 
43 /* GUC variable */
45 
46 typedef struct varlena unknown;
47 typedef struct varlena VarString;
48 
49 /*
50  * State for text_position_* functions.
51  */
52 typedef struct
53 {
54  bool is_multibyte; /* T if multibyte encoding */
55  bool is_multibyte_char_in_char; /* need to check char boundaries? */
56 
57  char *str1; /* haystack string */
58  char *str2; /* needle string */
59  int len1; /* string lengths in bytes */
60  int len2;
61 
62  /* Skip table for Boyer-Moore-Horspool search algorithm: */
63  int skiptablemask; /* mask for ANDing with skiptable subscripts */
64  int skiptable[256]; /* skip distance for given mismatched char */
65 
66  char *last_match; /* pointer to last match in 'str1' */
67 
68  /*
69  * Sometimes we need to convert the byte position of a match to a
70  * character position. These store the last position that was converted,
71  * so that on the next call, we can continue from that point, rather than
72  * count characters from the very beginning.
73  */
74  char *refpoint; /* pointer within original haystack string */
75  int refpos; /* 0-based character offset of the same point */
77 
78 typedef struct
79 {
80  char *buf1; /* 1st string, or abbreviation original string
81  * buf */
82  char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
83  int buflen1;
84  int buflen2;
85  int last_len1; /* Length of last buf1 string/strxfrm() input */
86  int last_len2; /* Length of last buf2 string/strxfrm() blob */
87  int last_returned; /* Last comparison result (cache) */
88  bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
89  bool collate_c;
90  Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
91  hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
92  hyperLogLogState full_card; /* Full key cardinality state */
93  double prop_card; /* Required cardinality proportion */
96 
97 /*
98  * Output data for split_text(): we output either to an array or a table.
99  * tupstore and tupdesc must be set up in advance to output to a table.
100  */
101 typedef struct
102 {
107 
108 /*
109  * This should be large enough that most strings will fit, but small enough
110  * that we feel comfortable putting it on the stack
111  */
112 #define TEXTBUFLEN 1024
113 
114 #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X))
115 #define DatumGetUnknownPCopy(X) ((unknown *) PG_DETOAST_DATUM_COPY(X))
116 #define PG_GETARG_UNKNOWN_P(n) DatumGetUnknownP(PG_GETARG_DATUM(n))
117 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
118 #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x)
119 
120 #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
121 #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
122 
123 static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
124 static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
125 static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
126 static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
127 static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
128 static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
129 static int varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
130 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
131 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
132 static int32 text_length(Datum str);
133 static text *text_catenate(text *t1, text *t2);
134 static text *text_substring(Datum str,
135  int32 start,
136  int32 length,
137  bool length_not_specified);
138 static text *text_overlay(text *t1, text *t2, int sp, int sl);
139 static int text_position(text *t1, text *t2, Oid collid);
140 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
142 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
146 static void check_collation_set(Oid collid);
147 static int text_cmp(text *arg1, text *arg2, Oid collid);
148 static bytea *bytea_catenate(bytea *t1, bytea *t2);
150  int S,
151  int L,
152  bool length_not_specified);
153 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
154 static void appendStringInfoText(StringInfo str, const text *t);
155 static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
156 static void split_text_accum_result(SplitTextOutputData *tstate,
157  text *field_value,
158  text *null_string,
159  Oid collation);
161  const char *fldsep, const char *null_string);
163 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
164  int *value);
165 static const char *text_format_parse_format(const char *start_ptr,
166  const char *end_ptr,
167  int *argpos, int *widthpos,
168  int *flags, int *width);
169 static void text_format_string_conversion(StringInfo buf, char conversion,
170  FmgrInfo *typOutputInfo,
171  Datum value, bool isNull,
172  int flags, int width);
173 static void text_format_append_string(StringInfo buf, const char *str,
174  int flags, int width);
175 
176 
177 /*****************************************************************************
178  * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
179  *****************************************************************************/
180 
181 /*
182  * cstring_to_text
183  *
184  * Create a text value from a null-terminated C string.
185  *
186  * The new text value is freshly palloc'd with a full-size VARHDR.
187  */
188 text *
189 cstring_to_text(const char *s)
190 {
191  return cstring_to_text_with_len(s, strlen(s));
192 }
193 
194 /*
195  * cstring_to_text_with_len
196  *
197  * Same as cstring_to_text except the caller specifies the string length;
198  * the string need not be null_terminated.
199  */
200 text *
201 cstring_to_text_with_len(const char *s, int len)
202 {
203  text *result = (text *) palloc(len + VARHDRSZ);
204 
205  SET_VARSIZE(result, len + VARHDRSZ);
206  memcpy(VARDATA(result), s, len);
207 
208  return result;
209 }
210 
211 /*
212  * text_to_cstring
213  *
214  * Create a palloc'd, null-terminated C string from a text value.
215  *
216  * We support being passed a compressed or toasted text value.
217  * This is a bit bogus since such values shouldn't really be referred to as
218  * "text *", but it seems useful for robustness. If we didn't handle that
219  * case here, we'd need another routine that did, anyway.
220  */
221 char *
223 {
224  /* must cast away the const, unfortunately */
225  text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
226  int len = VARSIZE_ANY_EXHDR(tunpacked);
227  char *result;
228 
229  result = (char *) palloc(len + 1);
230  memcpy(result, VARDATA_ANY(tunpacked), len);
231  result[len] = '\0';
232 
233  if (tunpacked != t)
234  pfree(tunpacked);
235 
236  return result;
237 }
238 
239 /*
240  * text_to_cstring_buffer
241  *
242  * Copy a text value into a caller-supplied buffer of size dst_len.
243  *
244  * The text string is truncated if necessary to fit. The result is
245  * guaranteed null-terminated (unless dst_len == 0).
246  *
247  * We support being passed a compressed or toasted text value.
248  * This is a bit bogus since such values shouldn't really be referred to as
249  * "text *", but it seems useful for robustness. If we didn't handle that
250  * case here, we'd need another routine that did, anyway.
251  */
252 void
253 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
254 {
255  /* must cast away the const, unfortunately */
256  text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
257  size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
258 
259  if (dst_len > 0)
260  {
261  dst_len--;
262  if (dst_len >= src_len)
263  dst_len = src_len;
264  else /* ensure truncation is encoding-safe */
265  dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
266  memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
267  dst[dst_len] = '\0';
268  }
269 
270  if (srcunpacked != src)
271  pfree(srcunpacked);
272 }
273 
274 
275 /*****************************************************************************
276  * USER I/O ROUTINES *
277  *****************************************************************************/
278 
279 
280 #define VAL(CH) ((CH) - '0')
281 #define DIG(VAL) ((VAL) + '0')
282 
283 /*
284  * byteain - converts from printable representation of byte array
285  *
286  * Non-printable characters must be passed as '\nnn' (octal) and are
287  * converted to internal form. '\' must be passed as '\\'.
288  * ereport(ERROR, ...) if bad form.
289  *
290  * BUGS:
291  * The input is scanned twice.
292  * The error checking of input is minimal.
293  */
294 Datum
296 {
297  char *inputText = PG_GETARG_CSTRING(0);
298  char *tp;
299  char *rp;
300  int bc;
301  bytea *result;
302 
303  /* Recognize hex input */
304  if (inputText[0] == '\\' && inputText[1] == 'x')
305  {
306  size_t len = strlen(inputText);
307 
308  bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
309  result = palloc(bc);
310  bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
311  SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
312 
313  PG_RETURN_BYTEA_P(result);
314  }
315 
316  /* Else, it's the traditional escaped style */
317  for (bc = 0, tp = inputText; *tp != '\0'; bc++)
318  {
319  if (tp[0] != '\\')
320  tp++;
321  else if ((tp[0] == '\\') &&
322  (tp[1] >= '0' && tp[1] <= '3') &&
323  (tp[2] >= '0' && tp[2] <= '7') &&
324  (tp[3] >= '0' && tp[3] <= '7'))
325  tp += 4;
326  else if ((tp[0] == '\\') &&
327  (tp[1] == '\\'))
328  tp += 2;
329  else
330  {
331  /*
332  * one backslash, not followed by another or ### valid octal
333  */
334  ereport(ERROR,
335  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
336  errmsg("invalid input syntax for type %s", "bytea")));
337  }
338  }
339 
340  bc += VARHDRSZ;
341 
342  result = (bytea *) palloc(bc);
343  SET_VARSIZE(result, bc);
344 
345  tp = inputText;
346  rp = VARDATA(result);
347  while (*tp != '\0')
348  {
349  if (tp[0] != '\\')
350  *rp++ = *tp++;
351  else if ((tp[0] == '\\') &&
352  (tp[1] >= '0' && tp[1] <= '3') &&
353  (tp[2] >= '0' && tp[2] <= '7') &&
354  (tp[3] >= '0' && tp[3] <= '7'))
355  {
356  bc = VAL(tp[1]);
357  bc <<= 3;
358  bc += VAL(tp[2]);
359  bc <<= 3;
360  *rp++ = bc + VAL(tp[3]);
361 
362  tp += 4;
363  }
364  else if ((tp[0] == '\\') &&
365  (tp[1] == '\\'))
366  {
367  *rp++ = '\\';
368  tp += 2;
369  }
370  else
371  {
372  /*
373  * We should never get here. The first pass should not allow it.
374  */
375  ereport(ERROR,
376  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
377  errmsg("invalid input syntax for type %s", "bytea")));
378  }
379  }
380 
381  PG_RETURN_BYTEA_P(result);
382 }
383 
384 /*
385  * byteaout - converts to printable representation of byte array
386  *
387  * In the traditional escaped format, non-printable characters are
388  * printed as '\nnn' (octal) and '\' as '\\'.
389  */
390 Datum
392 {
393  bytea *vlena = PG_GETARG_BYTEA_PP(0);
394  char *result;
395  char *rp;
396 
398  {
399  /* Print hex format */
400  rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
401  *rp++ = '\\';
402  *rp++ = 'x';
403  rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
404  }
405  else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
406  {
407  /* Print traditional escaped format */
408  char *vp;
409  uint64 len;
410  int i;
411 
412  len = 1; /* empty string has 1 char */
413  vp = VARDATA_ANY(vlena);
414  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
415  {
416  if (*vp == '\\')
417  len += 2;
418  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
419  len += 4;
420  else
421  len++;
422  }
423 
424  /*
425  * In principle len can't overflow uint32 if the input fit in 1GB, but
426  * for safety let's check rather than relying on palloc's internal
427  * check.
428  */
429  if (len > MaxAllocSize)
430  ereport(ERROR,
431  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
432  errmsg_internal("result of bytea output conversion is too large")));
433  rp = result = (char *) palloc(len);
434 
435  vp = VARDATA_ANY(vlena);
436  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
437  {
438  if (*vp == '\\')
439  {
440  *rp++ = '\\';
441  *rp++ = '\\';
442  }
443  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
444  {
445  int val; /* holds unprintable chars */
446 
447  val = *vp;
448  rp[0] = '\\';
449  rp[3] = DIG(val & 07);
450  val >>= 3;
451  rp[2] = DIG(val & 07);
452  val >>= 3;
453  rp[1] = DIG(val & 03);
454  rp += 4;
455  }
456  else
457  *rp++ = *vp;
458  }
459  }
460  else
461  {
462  elog(ERROR, "unrecognized bytea_output setting: %d",
463  bytea_output);
464  rp = result = NULL; /* keep compiler quiet */
465  }
466  *rp = '\0';
467  PG_RETURN_CSTRING(result);
468 }
469 
470 /*
471  * bytearecv - converts external binary format to bytea
472  */
473 Datum
475 {
477  bytea *result;
478  int nbytes;
479 
480  nbytes = buf->len - buf->cursor;
481  result = (bytea *) palloc(nbytes + VARHDRSZ);
482  SET_VARSIZE(result, nbytes + VARHDRSZ);
483  pq_copymsgbytes(buf, VARDATA(result), nbytes);
484  PG_RETURN_BYTEA_P(result);
485 }
486 
487 /*
488  * byteasend - converts bytea to binary format
489  *
490  * This is a special case: just copy the input...
491  */
492 Datum
494 {
495  bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
496 
497  PG_RETURN_BYTEA_P(vlena);
498 }
499 
500 Datum
502 {
504 
505  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
506 
507  /* Append the value unless null. */
508  if (!PG_ARGISNULL(1))
509  {
511 
512  /* On the first time through, we ignore the delimiter. */
513  if (state == NULL)
514  state = makeStringAggState(fcinfo);
515  else if (!PG_ARGISNULL(2))
516  {
517  bytea *delim = PG_GETARG_BYTEA_PP(2);
518 
520  }
521 
523  }
524 
525  /*
526  * The transition type for string_agg() is declared to be "internal",
527  * which is a pass-by-value type the same size as a pointer.
528  */
529  PG_RETURN_POINTER(state);
530 }
531 
532 Datum
534 {
536 
537  /* cannot be called directly because of internal-type argument */
538  Assert(AggCheckCallContext(fcinfo, NULL));
539 
540  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
541 
542  if (state != NULL)
543  {
544  bytea *result;
545 
546  result = (bytea *) palloc(state->len + VARHDRSZ);
547  SET_VARSIZE(result, state->len + VARHDRSZ);
548  memcpy(VARDATA(result), state->data, state->len);
549  PG_RETURN_BYTEA_P(result);
550  }
551  else
552  PG_RETURN_NULL();
553 }
554 
555 /*
556  * textin - converts "..." to internal representation
557  */
558 Datum
560 {
561  char *inputText = PG_GETARG_CSTRING(0);
562 
563  PG_RETURN_TEXT_P(cstring_to_text(inputText));
564 }
565 
566 /*
567  * textout - converts internal representation to "..."
568  */
569 Datum
571 {
572  Datum txt = PG_GETARG_DATUM(0);
573 
575 }
576 
577 /*
578  * textrecv - converts external binary format to text
579  */
580 Datum
582 {
584  text *result;
585  char *str;
586  int nbytes;
587 
588  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
589 
590  result = cstring_to_text_with_len(str, nbytes);
591  pfree(str);
592  PG_RETURN_TEXT_P(result);
593 }
594 
595 /*
596  * textsend - converts text to binary format
597  */
598 Datum
600 {
601  text *t = PG_GETARG_TEXT_PP(0);
603 
604  pq_begintypsend(&buf);
607 }
608 
609 
610 /*
611  * unknownin - converts "..." to internal representation
612  */
613 Datum
615 {
616  char *str = PG_GETARG_CSTRING(0);
617 
618  /* representation is same as cstring */
620 }
621 
622 /*
623  * unknownout - converts internal representation to "..."
624  */
625 Datum
627 {
628  /* representation is same as cstring */
629  char *str = PG_GETARG_CSTRING(0);
630 
632 }
633 
634 /*
635  * unknownrecv - converts external binary format to unknown
636  */
637 Datum
639 {
641  char *str;
642  int nbytes;
643 
644  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
645  /* representation is same as cstring */
646  PG_RETURN_CSTRING(str);
647 }
648 
649 /*
650  * unknownsend - converts unknown to binary format
651  */
652 Datum
654 {
655  /* representation is same as cstring */
656  char *str = PG_GETARG_CSTRING(0);
658 
659  pq_begintypsend(&buf);
660  pq_sendtext(&buf, str, strlen(str));
662 }
663 
664 
665 /* ========== PUBLIC ROUTINES ========== */
666 
667 /*
668  * textlen -
669  * returns the logical length of a text*
670  * (which is less than the VARSIZE of the text*)
671  */
672 Datum
674 {
676 
677  /* try to avoid decompressing argument */
679 }
680 
681 /*
682  * text_length -
683  * Does the real work for textlen()
684  *
685  * This is broken out so it can be called directly by other string processing
686  * functions. Note that the argument is passed as a Datum, to indicate that
687  * it may still be in compressed form. We can avoid decompressing it at all
688  * in some cases.
689  */
690 static int32
692 {
693  /* fastpath when max encoding length is one */
696  else
697  {
698  text *t = DatumGetTextPP(str);
699 
701  VARSIZE_ANY_EXHDR(t)));
702  }
703 }
704 
705 /*
706  * textoctetlen -
707  * returns the physical length of a text*
708  * (which is less than the VARSIZE of the text*)
709  */
710 Datum
712 {
714 
715  /* We need not detoast the input at all */
717 }
718 
719 /*
720  * textcat -
721  * takes two text* and returns a text* that is the concatenation of
722  * the two.
723  *
724  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
725  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
726  * Allocate space for output in all cases.
727  * XXX - thomas 1997-07-10
728  */
729 Datum
731 {
732  text *t1 = PG_GETARG_TEXT_PP(0);
733  text *t2 = PG_GETARG_TEXT_PP(1);
734 
736 }
737 
738 /*
739  * text_catenate
740  * Guts of textcat(), broken out so it can be used by other functions
741  *
742  * Arguments can be in short-header form, but not compressed or out-of-line
743  */
744 static text *
746 {
747  text *result;
748  int len1,
749  len2,
750  len;
751  char *ptr;
752 
753  len1 = VARSIZE_ANY_EXHDR(t1);
754  len2 = VARSIZE_ANY_EXHDR(t2);
755 
756  /* paranoia ... probably should throw error instead? */
757  if (len1 < 0)
758  len1 = 0;
759  if (len2 < 0)
760  len2 = 0;
761 
762  len = len1 + len2 + VARHDRSZ;
763  result = (text *) palloc(len);
764 
765  /* Set size of result string... */
766  SET_VARSIZE(result, len);
767 
768  /* Fill data field of result string... */
769  ptr = VARDATA(result);
770  if (len1 > 0)
771  memcpy(ptr, VARDATA_ANY(t1), len1);
772  if (len2 > 0)
773  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
774 
775  return result;
776 }
777 
/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * It is caller's responsibility that there actually are n characters;
 * the string need not be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	if (pg_database_encoding_max_length() == 1)
	{
		/* Optimization for single-byte encodings */
		return n;
	}
	else
	{
		const char *s;

		/* step over n characters, one pg_mblen() at a time */
		for (s = p; n > 0; n--)
			s += pg_mblen(s);

		return s - p;
	}
}
803 
804 /*
805  * text_substr()
806  * Return a substring starting at the specified position.
807  * - thomas 1997-12-31
808  *
809  * Input:
810  * - string
811  * - starting position (is one-based)
812  * - string length
813  *
814  * If the starting position is zero or less, then return from the start of the string
815  * adjusting the length to be consistent with the "negative start" per SQL.
816  * If the length is less than zero, return the remaining string.
817  *
818  * Added multibyte support.
819  * - Tatsuo Ishii 1998-4-21
820  * Changed behavior if starting position is less than one to conform to SQL behavior.
821  * Formerly returned the entire string; now returns a portion.
822  * - Thomas Lockhart 1998-12-10
823  * Now uses faster TOAST-slicing interface
824  * - John Gray 2002-02-22
825  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
826  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
827  * error; if E < 1, return '', not entire string). Fixed MB related bug when
828  * S > LC and < LC + 4 sometimes garbage characters are returned.
829  * - Joe Conway 2002-08-10
830  */
831 Datum
833 {
835  PG_GETARG_INT32(1),
836  PG_GETARG_INT32(2),
837  false));
838 }
839 
840 /*
841  * text_substr_no_len -
842  * Wrapper to avoid opr_sanity failure due to
843  * one function accepting a different number of args.
844  */
845 Datum
847 {
849  PG_GETARG_INT32(1),
850  -1, true));
851 }
852 
853 /*
854  * text_substring -
855  * Does the real work for text_substr() and text_substr_no_len()
856  *
857  * This is broken out so it can be called directly by other string processing
858  * functions. Note that the argument is passed as a Datum, to indicate that
859  * it may still be in compressed/toasted form. We can avoid detoasting all
860  * of it in some cases.
861  *
862  * The result is always a freshly palloc'd datum.
863  */
864 static text *
865 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
866 {
868  int32 S = start; /* start position */
869  int32 S1; /* adjusted start position */
870  int32 L1; /* adjusted substring length */
871  int32 E; /* end position */
872 
873  /*
874  * SQL99 says S can be zero or negative, but we still must fetch from the
875  * start of the string.
876  */
877  S1 = Max(S, 1);
878 
879  /* life is easy if the encoding max length is 1 */
880  if (eml == 1)
881  {
882  if (length_not_specified) /* special case - get length to end of
883  * string */
884  L1 = -1;
885  else if (length < 0)
886  {
887  /* SQL99 says to throw an error for E < S, i.e., negative length */
888  ereport(ERROR,
889  (errcode(ERRCODE_SUBSTRING_ERROR),
890  errmsg("negative substring length not allowed")));
891  L1 = -1; /* silence stupider compilers */
892  }
893  else if (pg_add_s32_overflow(S, length, &E))
894  {
895  /*
896  * L could be large enough for S + L to overflow, in which case
897  * the substring must run to end of string.
898  */
899  L1 = -1;
900  }
901  else
902  {
903  /*
904  * A zero or negative value for the end position can happen if the
905  * start was negative or one. SQL99 says to return a zero-length
906  * string.
907  */
908  if (E < 1)
909  return cstring_to_text("");
910 
911  L1 = E - S1;
912  }
913 
914  /*
915  * If the start position is past the end of the string, SQL99 says to
916  * return a zero-length string -- DatumGetTextPSlice() will do that
917  * for us. We need only convert S1 to zero-based starting position.
918  */
919  return DatumGetTextPSlice(str, S1 - 1, L1);
920  }
921  else if (eml > 1)
922  {
923  /*
924  * When encoding max length is > 1, we can't get LC without
925  * detoasting, so we'll grab a conservatively large slice now and go
926  * back later to do the right thing
927  */
928  int32 slice_start;
929  int32 slice_size;
930  int32 slice_strlen;
931  text *slice;
932  int32 E1;
933  int32 i;
934  char *p;
935  char *s;
936  text *ret;
937 
938  /*
939  * We need to start at position zero because there is no way to know
940  * in advance which byte offset corresponds to the supplied start
941  * position.
942  */
943  slice_start = 0;
944 
945  if (length_not_specified) /* special case - get length to end of
946  * string */
947  slice_size = L1 = -1;
948  else if (length < 0)
949  {
950  /* SQL99 says to throw an error for E < S, i.e., negative length */
951  ereport(ERROR,
952  (errcode(ERRCODE_SUBSTRING_ERROR),
953  errmsg("negative substring length not allowed")));
954  slice_size = L1 = -1; /* silence stupider compilers */
955  }
956  else if (pg_add_s32_overflow(S, length, &E))
957  {
958  /*
959  * L could be large enough for S + L to overflow, in which case
960  * the substring must run to end of string.
961  */
962  slice_size = L1 = -1;
963  }
964  else
965  {
966  /*
967  * A zero or negative value for the end position can happen if the
968  * start was negative or one. SQL99 says to return a zero-length
969  * string.
970  */
971  if (E < 1)
972  return cstring_to_text("");
973 
974  /*
975  * if E is past the end of the string, the tuple toaster will
976  * truncate the length for us
977  */
978  L1 = E - S1;
979 
980  /*
981  * Total slice size in bytes can't be any longer than the start
982  * position plus substring length times the encoding max length.
983  * If that overflows, we can just use -1.
984  */
985  if (pg_mul_s32_overflow(E, eml, &slice_size))
986  slice_size = -1;
987  }
988 
989  /*
990  * If we're working with an untoasted source, no need to do an extra
991  * copying step.
992  */
995  slice = DatumGetTextPSlice(str, slice_start, slice_size);
996  else
997  slice = (text *) DatumGetPointer(str);
998 
999  /* see if we got back an empty string */
1000  if (VARSIZE_ANY_EXHDR(slice) == 0)
1001  {
1002  if (slice != (text *) DatumGetPointer(str))
1003  pfree(slice);
1004  return cstring_to_text("");
1005  }
1006 
1007  /* Now we can get the actual length of the slice in MB characters */
1008  slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1009  VARSIZE_ANY_EXHDR(slice));
1010 
1011  /*
1012  * Check that the start position wasn't > slice_strlen. If so, SQL99
1013  * says to return a zero-length string.
1014  */
1015  if (S1 > slice_strlen)
1016  {
1017  if (slice != (text *) DatumGetPointer(str))
1018  pfree(slice);
1019  return cstring_to_text("");
1020  }
1021 
1022  /*
1023  * Adjust L1 and E1 now that we know the slice string length. Again
1024  * remember that S1 is one based, and slice_start is zero based.
1025  */
1026  if (L1 > -1)
1027  E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1028  else
1029  E1 = slice_start + 1 + slice_strlen;
1030 
1031  /*
1032  * Find the start position in the slice; remember S1 is not zero based
1033  */
1034  p = VARDATA_ANY(slice);
1035  for (i = 0; i < S1 - 1; i++)
1036  p += pg_mblen(p);
1037 
1038  /* hang onto a pointer to our start position */
1039  s = p;
1040 
1041  /*
1042  * Count the actual bytes used by the substring of the requested
1043  * length.
1044  */
1045  for (i = S1; i < E1; i++)
1046  p += pg_mblen(p);
1047 
1048  ret = (text *) palloc(VARHDRSZ + (p - s));
1049  SET_VARSIZE(ret, VARHDRSZ + (p - s));
1050  memcpy(VARDATA(ret), s, (p - s));
1051 
1052  if (slice != (text *) DatumGetPointer(str))
1053  pfree(slice);
1054 
1055  return ret;
1056  }
1057  else
1058  elog(ERROR, "invalid backend encoding: encoding max length < 1");
1059 
1060  /* not reached: suppress compiler warning */
1061  return NULL;
1062 }
1063 
1064 /*
1065  * textoverlay
1066  * Replace specified substring of first string with second
1067  *
1068  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1069  * This code is a direct implementation of what the standard says.
1070  */
1071 Datum
1073 {
1074  text *t1 = PG_GETARG_TEXT_PP(0);
1075  text *t2 = PG_GETARG_TEXT_PP(1);
1076  int sp = PG_GETARG_INT32(2); /* substring start position */
1077  int sl = PG_GETARG_INT32(3); /* substring length */
1078 
1079  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1080 }
1081 
1082 Datum
1084 {
1085  text *t1 = PG_GETARG_TEXT_PP(0);
1086  text *t2 = PG_GETARG_TEXT_PP(1);
1087  int sp = PG_GETARG_INT32(2); /* substring start position */
1088  int sl;
1089 
1090  sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1091  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1092 }
1093 
1094 static text *
1095 text_overlay(text *t1, text *t2, int sp, int sl)
1096 {
1097  text *result;
1098  text *s1;
1099  text *s2;
1100  int sp_pl_sl;
1101 
1102  /*
1103  * Check for possible integer-overflow cases. For negative sp, throw a
1104  * "substring length" error because that's what should be expected
1105  * according to the spec's definition of OVERLAY().
1106  */
1107  if (sp <= 0)
1108  ereport(ERROR,
1109  (errcode(ERRCODE_SUBSTRING_ERROR),
1110  errmsg("negative substring length not allowed")));
1111  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1112  ereport(ERROR,
1113  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1114  errmsg("integer out of range")));
1115 
1116  s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1117  s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1118  result = text_catenate(s1, t2);
1119  result = text_catenate(result, s2);
1120 
1121  return result;
1122 }
1123 
1124 /*
1125  * textpos -
1126  * Return the position of the specified substring.
1127  * Implements the SQL POSITION() function.
1128  * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1129  * - thomas 1997-07-27
1130  */
1131 Datum
1133 {
1134  text *str = PG_GETARG_TEXT_PP(0);
1135  text *search_str = PG_GETARG_TEXT_PP(1);
1136 
1137  PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1138 }
1139 
1140 /*
1141  * text_position -
1142  * Does the real work for textpos()
1143  *
1144  * Inputs:
1145  * t1 - string to be searched
1146  * t2 - pattern to match within t1
1147  * Result:
1148  * Character index of the first matched char, starting from 1,
1149  * or 0 if no match.
1150  *
1151  * This is broken out so it can be called directly by other string processing
1152  * functions.
1153  */
1154 static int
1155 text_position(text *t1, text *t2, Oid collid)
1156 {
1158  int result;
1159 
1160  /* Empty needle always matches at position 1 */
1161  if (VARSIZE_ANY_EXHDR(t2) < 1)
1162  return 1;
1163 
1164  /* Otherwise, can't match if haystack is shorter than needle */
1165  if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1166  return 0;
1167 
1168  text_position_setup(t1, t2, collid, &state);
1169  if (!text_position_next(&state))
1170  result = 0;
1171  else
1172  result = text_position_get_match_pos(&state);
1173  text_position_cleanup(&state);
1174  return result;
1175 }
1176 
1177 
1178 /*
1179  * text_position_setup, text_position_next, text_position_cleanup -
1180  * Component steps of text_position()
1181  *
1182  * These are broken out so that a string can be efficiently searched for
1183  * multiple occurrences of the same pattern. text_position_next may be
1184  * called multiple times, and it advances to the next match on each call.
1185  * text_position_get_match_ptr() and text_position_get_match_pos() return
1186  * a pointer or 1-based character position of the last match, respectively.
1187  *
1188  * The "state" variable is normally just a local variable in the caller.
1189  *
1190  * NOTE: text_position_next skips over the matched portion. For example,
1191  * searching for "xx" in "xxx" returns only one match, not two.
1192  */
1193 
/*
 * Setup step (presumably text_position_setup): checks the collation, rejects
 * nondeterministic collations, records both strings and their byte lengths
 * in *state, classifies the database encoding, and, for needles longer than
 * one byte, fills the Boyer-Moore-Horspool skip table.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * missing between "static void" and the opening brace; the body uses t1, t2,
 * collid and state.  Restore it.
 */
1194 static void
1196 {
1197  int len1 = VARSIZE_ANY_EXHDR(t1);
1198  int len2 = VARSIZE_ANY_EXHDR(t2);
1199  pg_locale_t mylocale = 0;
1200 
1201  check_collation_set(collid);
1202 
 /* A locale object is only looked up for non-C, non-default collations. */
1203  if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1204  mylocale = pg_newlocale_from_collation(collid);
1205 
1206  if (mylocale && !mylocale->deterministic)
1207  ereport(ERROR,
1208  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1209  errmsg("nondeterministic collations are not supported for substring searches")));
1210 
 /* Callers are expected to have dealt with empty strings already. */
1211  Assert(len1 > 0);
1212  Assert(len2 > 0);
1213 
1214  /*
1215  * Even with a multi-byte encoding, we perform the search using the raw
1216  * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1217  * because in UTF-8 the byte sequence of one character cannot contain
1218  * another character. For other multi-byte encodings, we do the search
1219  * initially as a simple byte search, ignoring multibyte issues, but
1220  * verify afterwards that the match we found is at a character boundary,
1221  * and continue the search if it was a false match.
1222  */
 /*
  * NOTE(review): the condition that opens this if / else-if chain is
  * missing here.  Judging by the flags it sets, this first branch is the
  * single-byte-encoding case; restore the test.
  */
1224  {
1225  state->is_multibyte = false;
1226  state->is_multibyte_char_in_char = false;
1227  }
1228  else if (GetDatabaseEncoding() == PG_UTF8)
1229  {
1230  state->is_multibyte = true;
1231  state->is_multibyte_char_in_char = false;
1232  }
1233  else
1234  {
1235  state->is_multibyte = true;
1236  state->is_multibyte_char_in_char = true;
1237  }
1238 
 /*
  * The state keeps pointers into t1 and t2 rather than copies.  refpoint
  * and refpos start at the beginning of the haystack, character offset 0.
  */
1239  state->str1 = VARDATA_ANY(t1);
1240  state->str2 = VARDATA_ANY(t2);
1241  state->len1 = len1;
1242  state->len2 = len2;
1243  state->last_match = NULL;
1244  state->refpoint = state->str1;
1245  state->refpos = 0;
1246 
1247  /*
1248  * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1249  * notes we use the terminology that the "haystack" is the string to be
1250  * searched (t1) and the "needle" is the pattern being sought (t2).
1251  *
1252  * If the needle is empty or bigger than the haystack then there is no
1253  * point in wasting cycles initializing the table. We also choose not to
1254  * use B-M-H for needles of length 1, since the skip table can't possibly
1255  * save anything in that case.
1256  */
1257  if (len1 >= len2 && len2 > 1)
1258  {
1259  int searchlength = len1 - len2;
1260  int skiptablemask;
1261  int last;
1262  int i;
1263  const char *str2 = state->str2;
1264 
1265  /*
1266  * First we must determine how much of the skip table to use. The
1267  * declaration of TextPositionState allows up to 256 elements, but for
1268  * short search problems we don't really want to have to initialize so
1269  * many elements --- it would take too long in comparison to the
1270  * actual search time. So we choose a useful skip table size based on
1271  * the haystack length minus the needle length. The closer the needle
1272  * length is to the haystack length the less useful skipping becomes.
1273  *
1274  * Note: since we use bit-masking to select table elements, the skip
1275  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1276  */
1277  if (searchlength < 16)
1278  skiptablemask = 3;
1279  else if (searchlength < 64)
1280  skiptablemask = 7;
1281  else if (searchlength < 128)
1282  skiptablemask = 15;
1283  else if (searchlength < 512)
1284  skiptablemask = 31;
1285  else if (searchlength < 2048)
1286  skiptablemask = 63;
1287  else if (searchlength < 4096)
1288  skiptablemask = 127;
1289  else
1290  skiptablemask = 255;
1291  state->skiptablemask = skiptablemask;
1292 
1293  /*
1294  * Initialize the skip table. We set all elements to the needle
1295  * length, since this is the correct skip distance for any character
1296  * not found in the needle.
1297  */
1298  for (i = 0; i <= skiptablemask; i++)
1299  state->skiptable[i] = len2;
1300 
1301  /*
1302  * Now examine the needle. For each character except the last one,
1303  * set the corresponding table element to the appropriate skip
1304  * distance. Note that when two characters share the same skip table
1305  * entry, the one later in the needle must determine the skip
1306  * distance.
1307  */
1308  last = len2 - 1;
1309 
 /* Ascending order lets later needle bytes overwrite earlier ones. */
1310  for (i = 0; i < last; i++)
1311  state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1312  }
1313 }
1314 
1315 /*
1316  * Advance to the next match, starting from the end of the previous match
1317  * (or the beginning of the string, on first call). Returns true if a match
1318  * is found.
1319  *
1320  * Note that this refuses to match an empty-string needle. Most callers
1321  * will have handled that case specially and we'll never see it here.
1322  *
 * On success the match start is stored in state->last_match; on failure
 * last_match is left unchanged.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * missing between "static bool" and the opening brace; the body uses a
 * single 'state' pointer.  Restore it.
1322  */
1323 static bool
1325 {
1326  int needle_len = state->len2;
1327  char *start_ptr;
1328  char *matchptr;
1329 
1330  if (needle_len <= 0)
1331  return false; /* result for empty pattern */
1332 
1333  /* Start from the point right after the previous match. */
1334  if (state->last_match)
1335  start_ptr = state->last_match + needle_len;
1336  else
1337  start_ptr = state->str1;
1338 
 /* Re-entered with a later start_ptr whenever a byte match is rejected. */
1339 retry:
1340  matchptr = text_position_next_internal(start_ptr, state);
1341 
1342  if (!matchptr)
1343  return false;
1344 
1345  /*
1346  * Found a match for the byte sequence. If this is a multibyte encoding,
1347  * where one character's byte sequence can appear inside a longer
1348  * multi-byte character, we need to verify that the match was at a
1349  * character boundary, not in the middle of a multi-byte character.
1350  */
1351  if (state->is_multibyte_char_in_char)
1352  {
1353  /* Walk one character at a time, until we reach the match. */
1354 
1355  /* the search should never move backwards. */
1356  Assert(state->refpoint <= matchptr);
1357 
 /*
  * refpoint/refpos persist in the state, so each call resumes the walk
  * where the previous one stopped instead of rescanning the haystack.
  */
1358  while (state->refpoint < matchptr)
1359  {
1360  /* step to next character. */
1361  state->refpoint += pg_mblen(state->refpoint);
1362  state->refpos++;
1363 
1364  /*
1365  * If we stepped over the match's start position, then it was a
1366  * false positive, where the byte sequence appeared in the middle
1367  * of a multi-byte character. Skip it, and continue the search at
1368  * the next character boundary.
1369  */
1370  if (state->refpoint > matchptr)
1371  {
1372  start_ptr = state->refpoint;
1373  goto retry;
1374  }
1375  }
1376  }
1377 
1378  state->last_match = matchptr;
1379  return true;
1380 }
1381 
1382 /*
1383  * Subroutine of text_position_next(). This searches for the raw byte
1384  * sequence, ignoring any multi-byte encoding issues. Returns the first
1385  * match starting at 'start_ptr', or NULL if no match is found.
1386  *
 * Needles of length 1 are handled by a plain byte scan; longer needles use
 * the skip table stored in the state.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * missing between "static char *" and the opening brace; the body uses
 * 'start_ptr' and 'state'.  Restore it.
1386  */
1387 static char *
1389 {
1390  int haystack_len = state->len1;
1391  int needle_len = state->len2;
1392  int skiptablemask = state->skiptablemask;
1393  const char *haystack = state->str1;
1394  const char *needle = state->str2;
1395  const char *haystack_end = &haystack[haystack_len];
1396  const char *hptr;
1397 
1398  Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1399 
1400  if (needle_len == 1)
1401  {
1402  /* No point in using B-M-H for a one-character needle */
1403  char nchar = *needle;
1404 
1405  hptr = start_ptr;
1406  while (hptr < haystack_end)
1407  {
1408  if (*hptr == nchar)
1409  return (char *) hptr;
1410  hptr++;
1411  }
1412  }
1413  else
1414  {
1415  const char *needle_last = &needle[needle_len - 1];
1416 
1417  /* Point hptr at the haystack byte aligned with the needle's last byte */
1418  hptr = start_ptr + needle_len - 1;
1419  while (hptr < haystack_end)
1420  {
1421  /* Match the needle scanning *backward* */
1422  const char *nptr;
1423  const char *p;
1424 
1425  nptr = needle_last;
1426  p = hptr;
1427  while (*nptr == *p)
1428  {
1429  /* Matched it all? If so, return pointer to start of the match */
1430  if (nptr == needle)
1431  return (char *) p;
1432  nptr--, p--;
1433  }
1434 
1435  /*
1436  * No match, so use the haystack char at hptr to decide how far to
1437  * advance. If the needle had any occurrence of that character
1438  * (or more precisely, one sharing the same skiptable entry)
1439  * before its last character, then we advance far enough to align
1440  * the last such needle character with that haystack position.
1441  * Otherwise we can advance by the whole needle length.
1442  */
1443  hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1444  }
1445  }
1446 
1447  return 0; /* not found */
1448 }
1449 
1450 /*
1451  * Return a pointer to the current match.
1452  *
1453  * The returned pointer points into the original haystack string.
1454  *
 * This is simply state->last_match, so it is NULL until a search has
 * succeeded (setup and reset both store NULL there).
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * missing between "static char *" and the opening brace; restore it.
1454  */
1455 static char *
1457 {
1458  return state->last_match;
1459 }
1460 
1461 /*
1462  * Return the offset of the current match.
1463  *
1464  * The offset is in characters, 1-based.
1465  *
 * NOTE(review): the declarator line (function name and parameter list) is
 * missing between "static int" and the opening brace; restore it.
1465  */
1466 static int
1468 {
 /* Single-byte encoding: byte offset and character offset coincide. */
1469  if (!state->is_multibyte)
1470  return state->last_match - state->str1 + 1;
1471  else
1472  {
1473  /* Convert the byte position to char position. */
 /*
  * The walk resumes from the cached refpoint/refpos pair, so successive
  * calls only count the characters between one match and the next.
  */
1474  while (state->refpoint < state->last_match)
1475  {
1476  state->refpoint += pg_mblen(state->refpoint);
1477  state->refpos++;
1478  }
1479  Assert(state->refpoint == state->last_match);
1480  return state->refpos + 1;
1481  }
1482 }
1483 
1484 /*
1485  * Reset search state to the initial state installed by text_position_setup.
1486  *
1487  * The next call to text_position_next will search from the beginning
1488  * of the string.
1489  *
 * Only the match pointer and the cached byte/character reference position
 * are rewound; the strings, lengths and skip table are left untouched.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * missing between "static void" and the opening brace; restore it.
1489  */
1490 static void
1492 {
1493  state->last_match = NULL;
1494  state->refpoint = state->str1;
1495  state->refpos = 0;
1496 }
1497 
/*
 * Cleanup step of a search; currently does nothing.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * missing between "static void" and the opening brace; restore it.
 */
1498 static void
1500 {
1501  /* no cleanup needed */
1502 }
1503 
1504 
/*
 * Raise ERRCODE_INDETERMINATE_COLLATION if 'collid' is not a valid OID;
 * otherwise do nothing.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * missing between "static void" and the opening brace; restore it.
 */
1505 static void
1507 {
1508  if (!OidIsValid(collid))
1509  {
1510  /*
1511  * This typically means that the parser could not resolve a conflict
1512  * of implicit collations, so report it that way.
1513  */
1514  ereport(ERROR,
1515  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1516  errmsg("could not determine which collation to use for string comparison"),
1517  errhint("Use the COLLATE clause to set the collation explicitly.")));
1518  }
1519 }
1520 
1521 /* varstr_cmp()
1522  * Comparison function for text strings with given lengths.
1523  * Includes locale support, but must copy strings to temporary memory
1524  * to allow null-termination for inputs to strcoll().
1525  * Returns an integer less than, equal to, or greater than zero, indicating
1526  * whether arg1 is less than, equal to, or greater than arg2.
1527  *
1528  * Note: many functions that depend on this are marked leakproof; therefore,
1529  * avoid reporting the actual contents of the input when throwing errors.
1530  * All errors herein should be things that can't happen except on corrupt
1531  * data, anyway; otherwise we will have trouble with indexing strings that
1532  * would cause them.
1533  */
1534 int
1535 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1536 {
1537  int result;
1538 
1539  check_collation_set(collid);
1540 
1541  /*
1542  * Unfortunately, there is no strncoll(), so in the non-C locale case we
1543  * have to do some memory copying. This turns out to be significantly
1544  * slower, so we optimize the case where LC_COLLATE is C. We also try to
1545  * optimize relatively-short strings by avoiding palloc/pfree overhead.
1546  */
1547  if (lc_collate_is_c(collid))
1548  {
 /* Bytewise on the common prefix; on a tie the shorter string is less. */
1549  result = memcmp(arg1, arg2, Min(len1, len2));
1550  if ((result == 0) && (len1 != len2))
1551  result = (len1 < len2) ? -1 : 1;
1552  }
1553  else
1554  {
1555  char a1buf[TEXTBUFLEN];
1556  char a2buf[TEXTBUFLEN];
1557  char *a1p,
1558  *a2p;
1559  pg_locale_t mylocale = 0;
1560 
 /* mylocale stays 0 for the default collation. */
1561  if (collid != DEFAULT_COLLATION_OID)
1562  mylocale = pg_newlocale_from_collation(collid);
1563 
1564  /*
1565  * memcmp() can't tell us which of two unequal strings sorts first,
1566  * but it's a cheap way to tell if they're equal. Testing shows that
1567  * memcmp() followed by strcoll() is only trivially slower than
1568  * strcoll() by itself, so we don't lose much if this doesn't work out
1569  * very often, and if it does - for example, because there are many
1570  * equal strings in the input - then we win big by avoiding expensive
1571  * collation-aware comparisons.
1572  */
1573  if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1574  return 0;
1575 
1576 #ifdef WIN32
1577  /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1578  if (GetDatabaseEncoding() == PG_UTF8
1579  && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
1580  {
1581  int a1len;
1582  int a2len;
1583  int r;
1584 
 /*
  * a1len/a2len are buffer sizes in bytes; the capacity passed to
  * MultiByteToWideChar() below is half of that.  The stack buffers are
  * used only when the input is shorter than TEXTBUFLEN / 2 bytes;
  * otherwise len * 2 + 2 bytes are palloc'd (presumably one 2-byte unit
  * per input byte plus a terminator -- confirm).
  */
1585  if (len1 >= TEXTBUFLEN / 2)
1586  {
1587  a1len = len1 * 2 + 2;
1588  a1p = palloc(a1len);
1589  }
1590  else
1591  {
1592  a1len = TEXTBUFLEN;
1593  a1p = a1buf;
1594  }
1595  if (len2 >= TEXTBUFLEN / 2)
1596  {
1597  a2len = len2 * 2 + 2;
1598  a2p = palloc(a2len);
1599  }
1600  else
1601  {
1602  a2len = TEXTBUFLEN;
1603  a2p = a2buf;
1604  }
1605 
1606  /* stupid Microsloth API does not work for zero-length input */
1607  if (len1 == 0)
1608  r = 0;
1609  else
1610  {
1611  r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1612  (LPWSTR) a1p, a1len / 2);
1613  if (!r)
1614  ereport(ERROR,
1615  (errmsg("could not convert string to UTF-16: error code %lu",
1616  GetLastError())));
1617  }
 /* r is the number of wide chars written; terminate right after them. */
1618  ((LPWSTR) a1p)[r] = 0;
1619 
1620  if (len2 == 0)
1621  r = 0;
1622  else
1623  {
1624  r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1625  (LPWSTR) a2p, a2len / 2);
1626  if (!r)
1627  ereport(ERROR,
1628  (errmsg("could not convert string to UTF-16: error code %lu",
1629  GetLastError())));
1630  }
1631  ((LPWSTR) a2p)[r] = 0;
1632 
 /* Cleared so that %m in the error below reports only wcscoll's errno. */
1633  errno = 0;
1634 #ifdef HAVE_LOCALE_T
1635  if (mylocale)
1636  result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
1637  else
1638 #endif
1639  result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1640  if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1641  * headers */
1642  ereport(ERROR,
1643  (errmsg("could not compare Unicode strings: %m")));
1644 
1645  /* Break tie if necessary. */
 /* The tie-break uses the original bytes, not the UTF-16 copies. */
1646  if (result == 0 &&
1647  (!mylocale || mylocale->deterministic))
1648  {
1649  result = memcmp(arg1, arg2, Min(len1, len2));
1650  if ((result == 0) && (len1 != len2))
1651  result = (len1 < len2) ? -1 : 1;
1652  }
1653 
1654  if (a1p != a1buf)
1655  pfree(a1p);
1656  if (a2p != a2buf)
1657  pfree(a2p);
1658 
1659  return result;
1660  }
1661 #endif /* WIN32 */
1662 
 /*
  * Make NUL-terminated copies for the strcoll()-style calls below.  The
  * stack buffer is used when the string plus its terminator fits in
  * TEXTBUFLEN bytes; otherwise len + 1 bytes are palloc'd.
  */
1663  if (len1 >= TEXTBUFLEN)
1664  a1p = (char *) palloc(len1 + 1);
1665  else
1666  a1p = a1buf;
1667  if (len2 >= TEXTBUFLEN)
1668  a2p = (char *) palloc(len2 + 1);
1669  else
1670  a2p = a2buf;
1671 
1672  memcpy(a1p, arg1, len1);
1673  a1p[len1] = '\0';
1674  memcpy(a2p, arg2, len2);
1675  a2p[len2] = '\0';
1676 
1677  if (mylocale)
1678  {
1679  if (mylocale->provider == COLLPROVIDER_ICU)
1680  {
1681 #ifdef USE_ICU
1682 #ifdef HAVE_UCOL_STRCOLLUTF8
 /*
  * Both ICU paths compare arg1/arg2 with explicit lengths; the
  * NUL-terminated copies are only needed again for the tie-break.
  */
1683  if (GetDatabaseEncoding() == PG_UTF8)
1684  {
1685  UErrorCode status;
1686 
1687  status = U_ZERO_ERROR;
1688  result = ucol_strcollUTF8(mylocale->info.icu.ucol,
1689  arg1, len1,
1690  arg2, len2,
1691  &status);
1692  if (U_FAILURE(status))
1693  ereport(ERROR,
1694  (errmsg("collation failed: %s", u_errorName(status))));
1695  }
1696  else
1697 #endif
1698  {
1699  int32_t ulen1,
1700  ulen2;
1701  UChar *uchar1,
1702  *uchar2;
1703 
 /* icu_to_uchar() hands back buffers that are pfree'd below. */
1704  ulen1 = icu_to_uchar(&uchar1, arg1, len1);
1705  ulen2 = icu_to_uchar(&uchar2, arg2, len2);
1706 
1707  result = ucol_strcoll(mylocale->info.icu.ucol,
1708  uchar1, ulen1,
1709  uchar2, ulen2);
1710 
1711  pfree(uchar1);
1712  pfree(uchar2);
1713  }
1714 #else /* not USE_ICU */
1715  /* shouldn't happen */
1716  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1717 #endif /* not USE_ICU */
1718  }
1719  else
1720  {
1721 #ifdef HAVE_LOCALE_T
1722  result = strcoll_l(a1p, a2p, mylocale->info.lt);
1723 #else
1724  /* shouldn't happen */
1725  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1726 #endif
1727  }
1728  }
1729  else
1730  result = strcoll(a1p, a2p);
1731 
1732  /* Break tie if necessary. */
 /*
  * Nondeterministic collations keep result == 0; all other cases fall
  * back to a bytewise strcmp() on the NUL-terminated copies.
  */
1733  if (result == 0 &&
1734  (!mylocale || mylocale->deterministic))
1735  result = strcmp(a1p, a2p);
1736 
1737  if (a1p != a1buf)
1738  pfree(a1p);
1739  if (a2p != a2buf)
1740  pfree(a2p);
1741  }
1742 
1743  return result;
1744 }
1745 
1746 /* text_cmp()
1747  * Internal comparison function for text strings.
1748  * Returns -1, 0 or 1
1749  */
1750 static int
1751 text_cmp(text *arg1, text *arg2, Oid collid)
1752 {
1753  char *a1p,
1754  *a2p;
1755  int len1,
1756  len2;
1757 
1758  a1p = VARDATA_ANY(arg1);
1759  a2p = VARDATA_ANY(arg2);
1760 
1761  len1 = VARSIZE_ANY_EXHDR(arg1);
1762  len2 = VARSIZE_ANY_EXHDR(arg2);
1763 
1764  return varstr_cmp(a1p, len1, a2p, len2, collid);
1765 }
1766 
1767 /*
1768  * Comparison functions for text strings.
1769  *
1770  * Note: btree indexes need these routines not to leak memory; therefore,
1771  * be careful to free working copies of toasted datums. Most places don't
1772  * need to be so careful.
1773  */
1774 
/*
 * Equality test for two text datums (presumably texteq, per the reference in
 * the following function).
 *
 * For the C collation, the default collation, or any deterministic
 * collation, equality is decided bytewise, with a length comparison first.
 * Otherwise the arguments are compared through text_cmp().
 *
 * NOTE(review): the declarator line (function name and argument list) is
 * missing between "Datum" and the opening brace; restore it.
 */
1775 Datum
1777 {
1778  Oid collid = PG_GET_COLLATION();
1779  bool result;
1780 
1781  check_collation_set(collid);
1782 
 /* The locale lookup only runs if the first two tests are both false. */
1783  if (lc_collate_is_c(collid) ||
1784  collid == DEFAULT_COLLATION_OID ||
1785  pg_newlocale_from_collation(collid)->deterministic)
1786  {
1787  Datum arg1 = PG_GETARG_DATUM(0);
1788  Datum arg2 = PG_GETARG_DATUM(1);
1789  Size len1,
1790  len2;
1791 
1792  /*
1793  * Since we only care about equality or not-equality, we can avoid all
1794  * the expense of strcoll() here, and just do bitwise comparison. In
1795  * fact, we don't even have to do a bitwise comparison if we can show
1796  * the lengths of the strings are unequal; which might save us from
1797  * having to detoast one or both values.
1798  */
1799  len1 = toast_raw_datum_size(arg1);
1800  len2 = toast_raw_datum_size(arg2);
1801  if (len1 != len2)
1802  result = false;
1803  else
1804  {
1805  text *targ1 = DatumGetTextPP(arg1);
1806  text *targ2 = DatumGetTextPP(arg2);
1807 
 /*
  * presumably toast_raw_datum_size() counts a VARHDRSZ-byte header, hence
  * the subtraction to get the payload length -- confirm.
  */
1808  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1809  len1 - VARHDRSZ) == 0);
1810 
1811  PG_FREE_IF_COPY(targ1, 0);
1812  PG_FREE_IF_COPY(targ2, 1);
1813  }
1814  }
1815  else
1816  {
1817  text *arg1 = PG_GETARG_TEXT_PP(0);
1818  text *arg2 = PG_GETARG_TEXT_PP(1);
1819 
1820  result = (text_cmp(arg1, arg2, collid) == 0);
1821 
1822  PG_FREE_IF_COPY(arg1, 0);
1823  PG_FREE_IF_COPY(arg2, 1);
1824  }
1825 
1826  PG_RETURN_BOOL(result);
1827 }
1828 
/*
 * Inequality test for two text datums: the same structure as the equality
 * function above it, with the results inverted (different lengths give true;
 * bytewise or text_cmp() difference gives true).
 *
 * NOTE(review): the declarator line (function name and argument list) is
 * missing between "Datum" and the opening brace; restore it.
 */
1829 Datum
1831 {
1832  Oid collid = PG_GET_COLLATION();
1833  bool result;
1834 
1835  check_collation_set(collid);
1836 
1837  if (lc_collate_is_c(collid) ||
1838  collid == DEFAULT_COLLATION_OID ||
1839  pg_newlocale_from_collation(collid)->deterministic)
1840  {
1841  Datum arg1 = PG_GETARG_DATUM(0);
1842  Datum arg2 = PG_GETARG_DATUM(1);
1843  Size len1,
1844  len2;
1845 
1846  /* See comment in texteq() */
1847  len1 = toast_raw_datum_size(arg1);
1848  len2 = toast_raw_datum_size(arg2);
1849  if (len1 != len2)
1850  result = true;
1851  else
1852  {
1853  text *targ1 = DatumGetTextPP(arg1);
1854  text *targ2 = DatumGetTextPP(arg2);
1855 
1856  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1857  len1 - VARHDRSZ) != 0);
1858 
1859  PG_FREE_IF_COPY(targ1, 0);
1860  PG_FREE_IF_COPY(targ2, 1);
1861  }
1862  }
1863  else
1864  {
1865  text *arg1 = PG_GETARG_TEXT_PP(0);
1866  text *arg2 = PG_GETARG_TEXT_PP(1);
1867 
1868  result = (text_cmp(arg1, arg2, collid) != 0);
1869 
1870  PG_FREE_IF_COPY(arg1, 0);
1871  PG_FREE_IF_COPY(arg2, 1);
1872  }
1873 
1874  PG_RETURN_BOOL(result);
1875 }
1876 
/*
 * "Less than" for two text datums: true when text_cmp() under the call's
 * collation returns a negative value.
 *
 * NOTE(review): the declarator line (function name and argument list) is
 * missing between "Datum" and the opening brace; restore it.
 */
1877 Datum
1879 {
1880  text *arg1 = PG_GETARG_TEXT_PP(0);
1881  text *arg2 = PG_GETARG_TEXT_PP(1);
1882  bool result;
1883 
1884  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1885 
 /* Release any working copies made while fetching the arguments. */
1886  PG_FREE_IF_COPY(arg1, 0);
1887  PG_FREE_IF_COPY(arg2, 1);
1888 
1889  PG_RETURN_BOOL(result);
1890 }
1891 
/*
 * "Less than or equal" for two text datums: true when text_cmp() under the
 * call's collation returns a value <= 0.
 *
 * NOTE(review): the declarator line (function name and argument list) is
 * missing between "Datum" and the opening brace; restore it.
 */
1892 Datum
1894 {
1895  text *arg1 = PG_GETARG_TEXT_PP(0);
1896  text *arg2 = PG_GETARG_TEXT_PP(1);
1897  bool result;
1898 
1899  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1900 
 /* Release any working copies made while fetching the arguments. */
1901  PG_FREE_IF_COPY(arg1, 0);
1902  PG_FREE_IF_COPY(arg2, 1);
1903 
1904  PG_RETURN_BOOL(result);
1905 }
1906 
/*
 * "Greater than" for two text datums: true when text_cmp() under the call's
 * collation returns a positive value.
 *
 * NOTE(review): the declarator line (function name and argument list) is
 * missing between "Datum" and the opening brace; restore it.
 */
1907 Datum
1909 {
1910  text *arg1 = PG_GETARG_TEXT_PP(0);
1911  text *arg2 = PG_GETARG_TEXT_PP(1);
1912  bool result;
1913 
1914  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1915 
 /* Release any working copies made while fetching the arguments. */
1916  PG_FREE_IF_COPY(arg1, 0);
1917  PG_FREE_IF_COPY(arg2, 1);
1918 
1919  PG_RETURN_BOOL(result);
1920 }
1921 
/*
 * "Greater than or equal" for two text datums: true when text_cmp() under
 * the call's collation returns a value >= 0.
 *
 * NOTE(review): the declarator line (function name and argument list) is
 * missing between "Datum" and the opening brace; restore it.
 */
1922 Datum
1924 {
1925  text *arg1 = PG_GETARG_TEXT_PP(0);
1926  text *arg2 = PG_GETARG_TEXT_PP(1);
1927  bool result;
1928 
1929  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1930 
 /* Release any working copies made while fetching the arguments. */
1931  PG_FREE_IF_COPY(arg1, 0);
1932  PG_FREE_IF_COPY(arg2, 1);
1933 
1934  PG_RETURN_BOOL(result);
1935 }
1936 
/*
 * Prefix test: returns true when the bytes of argument 1 (the candidate
 * prefix) equal the leading bytes of argument 0.  Nondeterministic
 * collations are rejected with ERRCODE_FEATURE_NOT_SUPPORTED.
 *
 * NOTE(review): the declarator line (function name and argument list) is
 * missing between "Datum" and the opening brace; restore it.
 */
1937 Datum
1939 {
1940  Datum arg1 = PG_GETARG_DATUM(0);
1941  Datum arg2 = PG_GETARG_DATUM(1);
1942  Oid collid = PG_GET_COLLATION();
1943  pg_locale_t mylocale = 0;
1944  bool result;
1945  Size len1,
1946  len2;
1947 
1948  check_collation_set(collid);
1949 
1950  if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1951  mylocale = pg_newlocale_from_collation(collid);
1952 
1953  if (mylocale && !mylocale->deterministic)
1954  ereport(ERROR,
1955  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1956  errmsg("nondeterministic collations are not supported for substring searches")));
1957 
 /* A prefix longer than the whole string can never match. */
1958  len1 = toast_raw_datum_size(arg1);
1959  len2 = toast_raw_datum_size(arg2);
1960  if (len2 > len1)
1961  result = false;
1962  else
1963  {
 /*
  * Only a leading slice of arg1 is fetched.  NOTE(review): len2 is a raw
  * datum size, yet it is passed as the slice length to text_substring();
  * presumably that merely over-requests -- confirm against
  * text_substring()'s contract.
  */
1964  text *targ1 = text_substring(arg1, 1, len2, false);
1965  text *targ2 = DatumGetTextPP(arg2);
1966 
1967  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1968  VARSIZE_ANY_EXHDR(targ2)) == 0);
1969 
1970  PG_FREE_IF_COPY(targ1, 0);
1971  PG_FREE_IF_COPY(targ2, 1);
1972  }
1973 
1974  PG_RETURN_BOOL(result);
1975 }
1976 
/*
 * Three-way comparison of two text datums: returns text_cmp()'s result under
 * the call's collation as an int32.
 *
 * NOTE(review): the declarator line (function name and argument list) is
 * missing between "Datum" and the opening brace; restore it.
 */
1977 Datum
1979 {
1980  text *arg1 = PG_GETARG_TEXT_PP(0);
1981  text *arg2 = PG_GETARG_TEXT_PP(1);
1982  int32 result;
1983 
1984  result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1985 
 /* Release any working copies made while fetching the arguments. */
1986  PG_FREE_IF_COPY(arg1, 0);
1987  PG_FREE_IF_COPY(arg2, 1);
1988 
1989  PG_RETURN_INT32(result);
1990 }
1991 
/*
 * Sort-support setup for type text: runs varstr_sortsupport() with TEXTOID
 * and the collation stored in the SortSupport object.
 *
 * NOTE(review): two lines are missing here -- the declarator (function name
 * and argument list) and the declaration of 'ssup', which the body uses.
 * Restore both.
 */
1992 Datum
1994 {
1996  Oid collid = ssup->ssup_collation;
1997  MemoryContext oldcontext;
1998 
 /*
  * Switch to ssup->ssup_cxt for the duration of the setup, so allocations
  * made by varstr_sortsupport() go there rather than into the caller's
  * current context.
  */
1999  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2000 
2001  /* Use generic string SortSupport */
2002  varstr_sortsupport(ssup, TEXTOID, collid);
2003 
2004  MemoryContextSwitchTo(oldcontext);
2005 
2006  PG_RETURN_VOID();
2007 }
2008 
2009 /*
2010  * Generic sortsupport interface for character type's operator classes.
2011  * Includes locale support, and support for BpChar semantics (i.e. removing
2012  * trailing spaces before comparison).
2013  *
2014  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
2015  * same representation. Callers that always use the C collation (e.g.
2016  * non-collatable type callers like bytea) may have NUL bytes in their strings;
2017  * this will not work with any other collation, though.
2018  *
 * NOTE(review): several lines are missing from this function: the
 * declarator (the body uses ssup, typid and collid), the statements inside
 * the non-C NAMEOID/else branches, and two lines after the assignment of
 * varstrcmp_abbrev.  Restore them.
2018  */
2019 void
2021 {
2022  bool abbreviate = ssup->abbreviate;
2023  bool collate_c = false;
2024  VarStringSortSupport *sss;
2025  pg_locale_t locale = 0;
2026 
2027  check_collation_set(collid);
2028 
2029  /*
2030  * If possible, set ssup->comparator to a function which can be used to
2031  * directly compare two datums. If we can do this, we'll avoid the
2032  * overhead of a trip through the fmgr layer for every comparison, which
2033  * can be substantial.
2034  *
2035  * Most typically, we'll set the comparator to varlenafastcmp_locale,
2036  * which uses strcoll() to perform comparisons. We use that for the
2037  * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2038  * LC_COLLATE = C, we can make things quite a bit faster with
2039  * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2040  * memcmp() rather than strcoll().
2041  */
2042  if (lc_collate_is_c(collid))
2043  {
2044  if (typid == BPCHAROID)
2045  ssup->comparator = bpcharfastcmp_c;
2046  else if (typid == NAMEOID)
2047  {
2048  ssup->comparator = namefastcmp_c;
2049  /* Not supporting abbreviation with type NAME, for now */
2050  abbreviate = false;
2051  }
2052  else
2053  ssup->comparator = varstrfastcmp_c;
2054 
2055  collate_c = true;
2056  }
2057  else
2058  {
2059  /*
2060  * We need a collation-sensitive comparison. To make things faster,
2061  * we'll figure out the collation based on the locale id and cache the
2062  * result.
2063  */
2064  if (collid != DEFAULT_COLLATION_OID)
2065  locale = pg_newlocale_from_collation(collid);
2066 
2067  /*
2068  * There is a further exception on Windows. When the database
2069  * encoding is UTF-8 and we are not using the C collation, complex
2070  * hacks are required. We don't currently have a comparator that
2071  * handles that case, so we fall back on the slow method of having the
2072  * sort code invoke bttextcmp() (in the case of text) via the fmgr
2073  * trampoline. ICU locales work just the same on Windows, however.
2074  */
2075 #ifdef WIN32
 /* Returning here leaves ssup->comparator unset by this function. */
2076  if (GetDatabaseEncoding() == PG_UTF8 &&
2077  !(locale && locale->provider == COLLPROVIDER_ICU))
2078  return;
2079 #endif
2080 
2081  /*
2082  * We use varlenafastcmp_locale except for type NAME.
2083  */
 /*
  * NOTE(review): the comparator assignments for both branches below are
  * missing here, leaving the else branch empty.
  */
2084  if (typid == NAMEOID)
2085  {
2087  /* Not supporting abbreviation with type NAME, for now */
2088  abbreviate = false;
2089  }
2090  else
2092  }
2093 
2094  /*
2095  * Unfortunately, it seems that abbreviation for non-C collations is
2096  * broken on many common platforms; testing of multiple versions of glibc
2097  * reveals that, for many locales, strcoll() and strxfrm() do not return
2098  * consistent results, which is fatal to this optimization. While no
2099  * other libc other than Cygwin has so far been shown to have a problem,
2100  * we take the conservative course of action for right now and disable
2101  * this categorically. (Users who are certain this isn't a problem on
2102  * their system can define TRUST_STRXFRM.)
2103  *
2104  * Even apart from the risk of broken locales, it's possible that there
2105  * are platforms where the use of abbreviated keys should be disabled at
2106  * compile time. Having only 4 byte datums could make worst-case
2107  * performance drastically more likely, for example. Moreover, macOS's
2108  * strxfrm() implementation is known to not effectively concentrate a
2109  * significant amount of entropy from the original string in earlier
2110  * transformed blobs. It's possible that other supported platforms are
2111  * similarly encumbered. So, if we ever get past disabling this
2112  * categorically, we may still want or need to disable it for particular
2113  * platforms.
2114  */
2115 #ifndef TRUST_STRXFRM
 /* Abbreviation survives only for the C collation and for ICU locales. */
2116  if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2117  abbreviate = false;
2118 #endif
2119 
2120  /*
2121  * If we're using abbreviated keys, or if we're using a locale-aware
2122  * comparison, we need to initialize a VarStringSortSupport object. Both
2123  * cases will make use of the temporary buffers we initialize here for
2124  * scratch space (and to detect requirement for BpChar semantics from
2125  * caller), and the abbreviation case requires additional state.
2126  */
2127  if (abbreviate || !collate_c)
2128  {
2129  sss = palloc(sizeof(VarStringSortSupport));
2130  sss->buf1 = palloc(TEXTBUFLEN);
2131  sss->buflen1 = TEXTBUFLEN;
2132  sss->buf2 = palloc(TEXTBUFLEN);
2133  sss->buflen2 = TEXTBUFLEN;
2134  /* Start with invalid values */
2135  sss->last_len1 = -1;
2136  sss->last_len2 = -1;
2137  /* Initialize */
2138  sss->last_returned = 0;
2139  sss->locale = locale;
2140 
2141  /*
2142  * To avoid somehow confusing a strxfrm() blob and an original string,
2143  * constantly keep track of the variety of data that buf1 and buf2
2144  * currently contain.
2145  *
2146  * Comparisons may be interleaved with conversion calls. Frequently,
2147  * conversions and comparisons are batched into two distinct phases,
2148  * but the correctness of caching cannot hinge upon this. For
2149  * comparison caching, buffer state is only trusted if cache_blob is
2150  * found set to false, whereas strxfrm() caching only trusts the state
2151  * when cache_blob is found set to true.
2152  *
2153  * Arbitrarily initialize cache_blob to true.
2154  */
2155  sss->cache_blob = true;
2156  sss->collate_c = collate_c;
2157  sss->typid = typid;
2158  ssup->ssup_extra = sss;
2159 
2160  /*
2161  * If possible, plan to use the abbreviated keys optimization. The
2162  * core code may switch back to authoritative comparator should
2163  * abbreviation be aborted.
2164  */
2165  if (abbreviate)
2166  {
2167  sss->prop_card = 0.20;
2168  initHyperLogLog(&sss->abbr_card, 10);
2169  initHyperLogLog(&sss->full_card, 10);
 /* Keep the full comparator reachable before replacing it. */
2170  ssup->abbrev_full_comparator = ssup->comparator;
2171  ssup->comparator = varstrcmp_abbrev;
 /* NOTE(review): two lines appear to be missing here. */
2174  }
2175  }
2176 }
2177 
2178 /*
2179  * sortsupport comparison func (for C locale case)
2180  *
 * Compares the payload bytes of two datums with memcmp(); when one is a
 * prefix of the other, the shorter one sorts first.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * missing between "static int" and the opening brace; the body uses the
 * datums x and y.  Restore it.
2180  */
2181 static int
2183 {
2184  VarString *arg1 = DatumGetVarStringPP(x);
2185  VarString *arg2 = DatumGetVarStringPP(y);
2186  char *a1p,
2187  *a2p;
2188  int len1,
2189  len2,
2190  result;
2191 
2192  a1p = VARDATA_ANY(arg1);
2193  a2p = VARDATA_ANY(arg2);
2194 
2195  len1 = VARSIZE_ANY_EXHDR(arg1);
2196  len2 = VARSIZE_ANY_EXHDR(arg2);
2197 
2198  result = memcmp(a1p, a2p, Min(len1, len2));
2199  if ((result == 0) && (len1 != len2))
2200  result = (len1 < len2) ? -1 : 1;
2201 
2202  /* We can't afford to leak memory here. */
 /* A pointer that differs from the input datum is a copy made above. */
2203  if (PointerGetDatum(arg1) != x)
2204  pfree(arg1);
2205  if (PointerGetDatum(arg2) != y)
2206  pfree(arg2);
2207 
2208  return result;
2209 }
2210 
2211 /*
2212  * sortsupport comparison func (for BpChar C locale case)
2213  *
2214  * BpChar outsources its sortsupport to this module. Specialization for the
2215  * varstr_sortsupport BpChar case, modeled on
2216  * internal_bpchar_pattern_compare().
2217  *
 * Same bytewise comparison as the plain C-locale comparator, except that
 * the lengths compared are those returned by bpchartruelen().
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * missing between "static int" and the opening brace; the body uses the
 * datums x and y.  Restore it.
2217  */
2218 static int
2220 {
2221  BpChar *arg1 = DatumGetBpCharPP(x);
2222  BpChar *arg2 = DatumGetBpCharPP(y);
2223  char *a1p,
2224  *a2p;
2225  int len1,
2226  len2,
2227  result;
2228 
2229  a1p = VARDATA_ANY(arg1);
2230  a2p = VARDATA_ANY(arg2);
2231 
2232  len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2233  len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2234 
2235  result = memcmp(a1p, a2p, Min(len1, len2));
2236  if ((result == 0) && (len1 != len2))
2237  result = (len1 < len2) ? -1 : 1;
2238 
2239  /* We can't afford to leak memory here. */
 /* A pointer that differs from the input datum is a copy made above. */
2240  if (PointerGetDatum(arg1) != x)
2241  pfree(arg1);
2242  if (PointerGetDatum(arg2) != y)
2243  pfree(arg2);
2244 
2245  return result;
2246 }
2247 
2248 /*
2249  * sortsupport comparison func (for NAME C locale case)
2250  *
 * Compares the two names with strncmp(), bounded by NAMEDATALEN.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * missing between "static int" and the opening brace; the body uses the
 * datums x and y.  Restore it.
2250  */
2251 static int
2253 {
2254  Name arg1 = DatumGetName(x);
2255  Name arg2 = DatumGetName(y);
2256 
2257  return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2258 }
2259 
2260 /*
2261  * sortsupport comparison func (for locale case with all varlena types)
2262  *
 * Unpacks both datums and passes their payload pointers and byte lengths to
 * varstrfastcmp_locale().
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * missing between "static int" and the opening brace; the body uses x, y
 * and ssup.  Restore it.
2262  */
2263 static int
2265 {
2266  VarString *arg1 = DatumGetVarStringPP(x);
2267  VarString *arg2 = DatumGetVarStringPP(y);
2268  char *a1p,
2269  *a2p;
2270  int len1,
2271  len2,
2272  result;
2273 
2274  a1p = VARDATA_ANY(arg1);
2275  a2p = VARDATA_ANY(arg2);
2276 
2277  len1 = VARSIZE_ANY_EXHDR(arg1);
2278  len2 = VARSIZE_ANY_EXHDR(arg2);
2279 
2280  result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2281 
2282  /* We can't afford to leak memory here. */
 /* A pointer that differs from the input datum is a copy made above. */
2283  if (PointerGetDatum(arg1) != x)
2284  pfree(arg1);
2285  if (PointerGetDatum(arg2) != y)
2286  pfree(arg2);
2287 
2288  return result;
2289 }
2290 
2291 /*
2292  * sortsupport comparison func (for locale case with NAME type)
2293  *
 * Passes each name and its strlen() to varstrfastcmp_locale().
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * missing between "static int" and the opening brace; the body uses x, y
 * and ssup.  Restore it.
2293  */
2294 static int
2296 {
2297  Name arg1 = DatumGetName(x);
2298  Name arg2 = DatumGetName(y);
2299 
2300  return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2301  NameStr(*arg2), strlen(NameStr(*arg2)),
2302  ssup);
2303 }
2304 
2305 /*
2306  * sortsupport comparison func for locale cases
 *
 * a1p/len1 and a2p/len2 are the two byte strings to compare.  Bytewise
 * identical inputs return 0 immediately.  Otherwise the inputs are copied
 * into sss->buf1 / sss->buf2 and NUL-terminated there for strcoll() or
 * strcoll_l(); the ICU paths use a1p/a2p with explicit lengths instead.
 * A collation result of 0 is tie-broken with strcmp() unless the locale is
 * marked nondeterministic.  The last result is remembered in
 * sss->last_returned and reused when both buffers already hold the same
 * strings and sss->cache_blob is false.
 *
 * NOTE(review): "sss" is used below but its declaration is not among the
 * lines visible here; presumably it is the sort state reached through ssup
 * -- confirm.
2307  */
2308 static int
2309 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2310 {
2312  int result;
2313  bool arg1_match;
2314 
2315  /* Fast pre-check for equality, as discussed in varstr_cmp() */
2316  if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2317  {
2318  /*
2319  * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2320  * last_len2. Existing contents of buffers might still be used by
2321  * next call.
2322  *
2323  * It's fine to allow the comparison of BpChar padding bytes here,
2324  * even though that implies that the memcmp() will usually be
2325  * performed for BpChar callers (though multibyte characters could
2326  * still prevent that from occurring). The memcmp() is still very
2327  * cheap, and BpChar's funny semantics have us remove trailing spaces
2328  * (not limited to padding), so we need make no distinction between
2329  * padding space characters and "real" space characters.
2330  */
2331  return 0;
2332  }
2333 
2334  if (sss->typid == BPCHAROID)
2335  {
2336  /* Get true number of bytes, ignoring trailing spaces */
2337  len1 = bpchartruelen(a1p, len1);
2338  len2 = bpchartruelen(a2p, len2);
2339  }
2340 
 /* Grow a buffer when it cannot hold the string plus a terminating NUL */
2341  if (len1 >= sss->buflen1)
2342  {
2343  pfree(sss->buf1);
2344  sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2345  sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2346  }
2347  if (len2 >= sss->buflen2)
2348  {
2349  pfree(sss->buf2);
2350  sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2351  sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2352  }
2353 
2354  /*
2355  * We're likely to be asked to compare the same strings repeatedly, and
2356  * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2357  * comparisons, even though in general there is no reason to think that
2358  * that will work out (every string datum may be unique). Caching does
2359  * not slow things down measurably when it doesn't work out, and can speed
2360  * things up by rather a lot when it does. In part, this is because the
2361  * memcmp() compares data from cachelines that are needed in L1 cache even
2362  * when the last comparison's result cannot be reused.
2363  */
2364  arg1_match = true;
2365  if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2366  {
2367  arg1_match = false;
2368  memcpy(sss->buf1, a1p, len1);
2369  sss->buf1[len1] = '\0';
2370  sss->last_len1 = len1;
2371  }
2372 
2373  /*
2374  * If we're comparing the same two strings as last time, we can return the
2375  * same answer without calling strcoll() again. This is more likely than
2376  * it seems (at least with moderate to low cardinality sets), because
2377  * quicksort compares the same pivot against many values.
2378  */
2379  if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2380  {
2381  memcpy(sss->buf2, a2p, len2);
2382  sss->buf2[len2] = '\0';
2383  sss->last_len2 = len2;
2384  }
2385  else if (arg1_match && !sss->cache_blob)
2386  {
2387  /* Use result cached following last actual strcoll() call */
2388  return sss->last_returned;
2389  }
2390 
2391  if (sss->locale)
2392  {
2393  if (sss->locale->provider == COLLPROVIDER_ICU)
2394  {
2395 #ifdef USE_ICU
2396 #ifdef HAVE_UCOL_STRCOLLUTF8
2397  if (GetDatabaseEncoding() == PG_UTF8)
2398  {
2399  UErrorCode status;
2400 
2401  status = U_ZERO_ERROR;
2402  result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
2403  a1p, len1,
2404  a2p, len2,
2405  &status);
2406  if (U_FAILURE(status))
2407  ereport(ERROR,
2408  (errmsg("collation failed: %s", u_errorName(status))));
2409  }
2410  else
2411 #endif
2412  {
2413  int32_t ulen1,
2414  ulen2;
2415  UChar *uchar1,
2416  *uchar2;
2417 
2418  ulen1 = icu_to_uchar(&uchar1, a1p, len1);
2419  ulen2 = icu_to_uchar(&uchar2, a2p, len2);
2420 
2421  result = ucol_strcoll(sss->locale->info.icu.ucol,
2422  uchar1, ulen1,
2423  uchar2, ulen2);
2424 
2425  pfree(uchar1);
2426  pfree(uchar2);
2427  }
2428 #else /* not USE_ICU */
2429  /* shouldn't happen */
2430  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2431 #endif /* not USE_ICU */
2432  }
2433  else
2434  {
2435 #ifdef HAVE_LOCALE_T
2436  result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
2437 #else
2438  /* shouldn't happen */
2439  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2440 #endif
2441  }
2442  }
2443  else
2444  result = strcoll(sss->buf1, sss->buf2);
2445 
2446  /* Break tie if necessary. */
2447  if (result == 0 &&
2448  (!sss->locale || sss->locale->deterministic))
2449  result = strcmp(sss->buf1, sss->buf2);
2450 
2451  /* Cache result, perhaps saving an expensive strcoll() call next time */
2452  sss->cache_blob = false;
2453  sss->last_returned = result;
2454  return result;
2455 }
2456 
2457 /*
2458  * Abbreviated key comparison func
 *
 * Orders the two abbreviated keys by comparing x and y directly, returning
 * 1, 0 or -1.
2459  */
2460 static int
2462 {
2463  /*
2464  * When 0 is returned, the core system will call varstrfastcmp_c()
2465  * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale(). Even a
2466  * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2467  * authoritatively, for the same reason that there is a strcoll()
2468  * tie-breaker call to strcmp() in varstr_cmp().
2469  */
2470  if (x > y)
2471  return 1;
2472  else if (x == y)
2473  return 0;
2474  else
2475  return -1;
2476 }
2477 
2478 /*
2479  * Conversion routine for sortsupport. Converts original to abbreviated key
2480  * representation. Our encoding strategy is simple -- pack the first 8 bytes
2481  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2482  * stored in reverse order), and treat it as an unsigned integer. When the "C"
2483  * locale is used, or in case of bytea, just memcpy() from original instead.
 *
 * Besides producing the key, each call that does not reuse the cached blob
 * adds a hash of the full key to sss->full_card and a hash of the
 * abbreviated key to sss->abbr_card (HyperLogLog state).  "authoritative"
 * is pfree'd before returning when it is not the same pointer as
 * "original".
 *
 * NOTE(review): "sss" is used below but its declaration is not among the
 * lines visible here -- confirm where it comes from.
2484  */
2485 static Datum
2487 {
2489  VarString *authoritative = DatumGetVarStringPP(original);
2490  char *authoritative_data = VARDATA_ANY(authoritative);
2491 
2492  /* working state */
2493  Datum res;
2494  char *pres;
2495  int len;
2496  uint32 hash;
2497 
2498  pres = (char *) &res;
2499  /* memset(), so any non-overwritten bytes are NUL */
2500  memset(pres, 0, sizeof(Datum));
2501  len = VARSIZE_ANY_EXHDR(authoritative);
2502 
2503  /* Get number of bytes, ignoring trailing spaces */
2504  if (sss->typid == BPCHAROID)
2505  len = bpchartruelen(authoritative_data, len);
2506 
2507  /*
2508  * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2509  * abbreviate keys. The full comparator for the C locale is always
2510  * memcmp(). It would be incorrect to allow bytea callers (callers that
2511  * always force the C collation -- bytea isn't a collatable type, but this
2512  * approach is convenient) to use strxfrm(). This is because bytea
2513  * strings may contain NUL bytes. Besides, this should be faster, too.
2514  *
2515  * More generally, it's okay that bytea callers can have NUL bytes in
2516  * strings because varstrcmp_abbrev() need not make a distinction between
2517  * terminating NUL bytes, and NUL bytes representing actual NULs in the
2518  * authoritative representation. Hopefully a comparison at or past one
2519  * abbreviated key's terminating NUL byte will resolve the comparison
2520  * without consulting the authoritative representation; specifically, some
2521  * later non-NUL byte in the longer string can resolve the comparison
2522  * against a subsequent terminating NUL in the shorter string. There will
2523  * usually be what is effectively a "length-wise" resolution there and
2524  * then.
2525  *
2526  * If that doesn't work out -- if all bytes in the longer string
2527  * positioned at or past the offset of the smaller string's (first)
2528  * terminating NUL are actually representative of NUL bytes in the
2529  * authoritative binary string (perhaps with some *terminating* NUL bytes
2530  * towards the end of the longer string iff it happens to still be small)
2531  * -- then an authoritative tie-breaker will happen, and do the right
2532  * thing: explicitly consider string length.
2533  */
2534  if (sss->collate_c)
2535  memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2536  else
2537  {
2538  Size bsize;
2539 #ifdef USE_ICU
2540  int32_t ulen = -1;
2541  UChar *uchar = NULL;
2542 #endif
2543 
2544  /*
2545  * We're not using the C collation, so fall back on strxfrm or ICU
2546  * analogs.
2547  */
2548 
2549  /* By convention, we use buffer 1 to store and NUL-terminate */
2550  if (len >= sss->buflen1)
2551  {
2552  pfree(sss->buf1);
2553  sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2554  sss->buf1 = palloc(sss->buflen1);
2555  }
2556 
2557  /* Might be able to reuse strxfrm() blob from last call */
2558  if (sss->last_len1 == len && sss->cache_blob &&
2559  memcmp(sss->buf1, authoritative_data, len) == 0)
2560  {
2561  memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2562  /* No change affecting cardinality, so no hashing required */
2563  goto done;
2564  }
2565 
2566  memcpy(sss->buf1, authoritative_data, len);
2567 
2568  /*
2569  * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2570  * necessary for ICU, but doesn't hurt.
2571  */
2572  sss->buf1[len] = '\0';
2573  sss->last_len1 = len;
2574 
2575 #ifdef USE_ICU
2576  /* When using ICU and not UTF8, convert string to UChar. */
2577  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2579  ulen = icu_to_uchar(&uchar, sss->buf1, len);
2580 #endif
2581 
2582  /*
2583  * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2584  * and try again. Both of these functions have the result buffer
2585  * content undefined if the result did not fit, so we need to retry
2586  * until everything fits, even though we only need the first few bytes
2587  * in the end. When using ucol_nextSortKeyPart(), however, we only
2588  * ask for as many bytes as we actually need.
2589  */
2590  for (;;)
2591  {
2592 #ifdef USE_ICU
2593  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2594  {
2595  /*
2596  * When using UTF8, use the iteration interface so we only
2597  * need to produce as many bytes as we actually need.
2598  */
2599  if (GetDatabaseEncoding() == PG_UTF8)
2600  {
2601  UCharIterator iter;
2602  uint32_t state[2];
2603  UErrorCode status;
2604 
2605  uiter_setUTF8(&iter, sss->buf1, len);
2606  state[0] = state[1] = 0; /* won't need that again */
2607  status = U_ZERO_ERROR;
2608  bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2609  &iter,
2610  state,
2611  (uint8_t *) sss->buf2,
2612  Min(sizeof(Datum), sss->buflen2),
2613  &status);
2614  if (U_FAILURE(status))
2615  ereport(ERROR,
2616  (errmsg("sort key generation failed: %s",
2617  u_errorName(status))));
2618  }
2619  else
2620  bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2621  uchar, ulen,
2622  (uint8_t *) sss->buf2, sss->buflen2);
2623  }
2624  else
2625 #endif
2626 #ifdef HAVE_LOCALE_T
2627  if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2628  bsize = strxfrm_l(sss->buf2, sss->buf1,
2629  sss->buflen2, sss->locale->info.lt);
2630  else
2631 #endif
2632  bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2633 
2634  sss->last_len2 = bsize;
2635  if (bsize < sss->buflen2)
2636  break;
2637 
2638  /*
2639  * Grow buffer and retry.
2640  */
2641  pfree(sss->buf2);
2642  sss->buflen2 = Max(bsize + 1,
2643  Min(sss->buflen2 * 2, MaxAllocSize));
2644  sss->buf2 = palloc(sss->buflen2);
2645  }
2646 
2647  /*
2648  * Every Datum byte is always compared. This is safe because the
2649  * strxfrm() blob is itself NUL terminated, leaving no danger of
2650  * misinterpreting any NUL bytes not intended to be interpreted as
2651  * logically representing termination.
2652  *
2653  * (Actually, even if there were NUL bytes in the blob it would be
2654  * okay. See remarks on bytea case above.)
2655  */
2656  memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2657 
2658 #ifdef USE_ICU
2659  if (uchar)
2660  pfree(uchar);
2661 #endif
2662  }
2663 
2664  /*
2665  * Maintain approximate cardinality of both abbreviated keys and original,
2666  * authoritative keys using HyperLogLog. Used as cheap insurance against
2667  * the worst case, where we do many string transformations for no saving
2668  * in full strcoll()-based comparisons. These statistics are used by
2669  * varstr_abbrev_abort().
2670  *
2671  * First, Hash key proper, or a significant fraction of it. Mix in length
2672  * in order to compensate for cases where differences are past
2673  * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2674  */
2675  hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2676  Min(len, PG_CACHE_LINE_SIZE)));
2677 
2678  if (len > PG_CACHE_LINE_SIZE)
2679  hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2680 
2681  addHyperLogLog(&sss->full_card, hash);
2682 
2683  /* Hash abbreviated key */
2684 #if SIZEOF_DATUM == 8
2685  {
2686  uint32 lohalf,
2687  hihalf;
2688 
2689  lohalf = (uint32) res;
2690  hihalf = (uint32) (res >> 32);
2691  hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2692  }
2693 #else /* SIZEOF_DATUM != 8 */
2694  hash = DatumGetUInt32(hash_uint32((uint32) res));
2695 #endif
2696 
2697  addHyperLogLog(&sss->abbr_card, hash);
2698 
2699  /* Cache result, perhaps saving an expensive strxfrm() call next time */
2700  sss->cache_blob = true;
2701 done:
2702 
2703  /*
2704  * Byteswap on little-endian machines.
2705  *
2706  * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2707  * comparator) works correctly on all platforms. If we didn't do this,
2708  * the comparator would have to call memcmp() with a pair of pointers to
2709  * the first byte of each abbreviated key, which is slower.
2710  */
2711  res = DatumBigEndianToNative(res);
2712 
2713  /* Don't leak memory here */
2714  if (PointerGetDatum(authoritative) != original)
2715  pfree(authoritative);
2716 
2717  return res;
2718 }
2719 
2720 /*
2721  * Callback for estimating effectiveness of abbreviated key optimization, using
2722  * heuristic rules. Returns value indicating if the abbreviation optimization
2723  * should be aborted, based on its projected effectiveness.
 *
 * Never aborts while memtupcount is below 100.  After that it compares the
 * HyperLogLog estimates (each clamped to at least 1.0): abbreviation
 * continues (false) while abbrev_distinct > key_distinct * sss->prop_card,
 * with prop_card multiplied by 0.65 on each such call once memtupcount
 * exceeds 10000; otherwise true is returned to abort.
 *
 * NOTE(review): "sss" is used below but its declaration is not among the
 * lines visible here -- confirm where it comes from.
2724  */
2725 static bool
2726 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2727 {
2729  double abbrev_distinct,
2730  key_distinct;
2731 
2732  Assert(ssup->abbreviate);
2733 
2734  /* Have a little patience */
2735  if (memtupcount < 100)
2736  return false;
2737 
2738  abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2739  key_distinct = estimateHyperLogLog(&sss->full_card);
2740 
2741  /*
2742  * Clamp cardinality estimates to at least one distinct value. While
2743  * NULLs are generally disregarded, if only NULL values were seen so far,
2744  * that might misrepresent costs if we failed to clamp.
2745  */
2746  if (abbrev_distinct <= 1.0)
2747  abbrev_distinct = 1.0;
2748 
2749  if (key_distinct <= 1.0)
2750  key_distinct = 1.0;
2751 
2752  /*
2753  * In the worst case all abbreviated keys are identical, while at the same
2754  * time there are differences within full key strings not captured in
2755  * abbreviations.
2756  */
2757 #ifdef TRACE_SORT
2758  if (trace_sort)
2759  {
2760  double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2761 
2762  elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2763  "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2764  memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2765  sss->prop_card);
2766  }
2767 #endif
2768 
2769  /*
2770  * If the number of distinct abbreviated keys approximately matches the
2771  * number of distinct authoritative original keys, that's reason enough to
2772  * proceed. We can win even with a very low cardinality set if most
2773  * tie-breakers only memcmp(). This is by far the most important
2774  * consideration.
2775  *
2776  * While comparisons that are resolved at the abbreviated key level are
2777  * considerably cheaper than tie-breakers resolved with memcmp(), both of
2778  * those two outcomes are so much cheaper than a full strcoll() once
2779  * sorting is underway that it doesn't seem worth it to weigh abbreviated
2780  * cardinality against the overall size of the set in order to more
2781  * accurately model costs. Assume that an abbreviated comparison, and an
2782  * abbreviated comparison with a cheap memcmp()-based authoritative
2783  * resolution are equivalent.
2784  */
2785  if (abbrev_distinct > key_distinct * sss->prop_card)
2786  {
2787  /*
2788  * When we have exceeded 10,000 tuples, decay required cardinality
2789  * aggressively for next call.
2790  *
2791  * This is useful because the number of comparisons required on
2792  * average increases at a linearithmic rate, and at roughly 10,000
2793  * tuples that factor will start to dominate over the linear costs of
2794  * string transformation (this is a conservative estimate). The decay
2795  * rate is chosen to be a little less aggressive than halving -- which
2796  * (since we're called at points at which memtupcount has doubled)
2797  * would never see the cost model actually abort past the first call
2798  * following a decay. This decay rate is mostly a precaution against
2799  * a sudden, violent swing in how well abbreviated cardinality tracks
2800  * full key cardinality. The decay also serves to prevent a marginal
2801  * case from being aborted too late, when too much has already been
2802  * invested in string transformation.
2803  *
2804  * It's possible for sets of several million distinct strings with
2805  * mere tens of thousands of distinct abbreviated keys to still
2806  * benefit very significantly. This will generally occur provided
2807  * each abbreviated key is a proxy for a roughly uniform number of the
2808  * set's full keys. If it isn't so, we hope to catch that early and
2809  * abort. If it isn't caught early, by the time the problem is
2810  * apparent it's probably not worth aborting.
2811  */
2812  if (memtupcount > 10000)
2813  sss->prop_card *= 0.65;
2814 
2815  return false;
2816  }
2817 
2818  /*
2819  * Abort abbreviation strategy.
2820  *
2821  * The worst case, where all abbreviated keys are identical while all
2822  * original strings differ will typically only see a regression of about
2823  * 10% in execution time for small to medium sized lists of strings.
2824  * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2825  * often expect very large improvements, particularly with sets of strings
2826  * of moderately high to high abbreviated cardinality. There is little to
2827  * lose but much to gain, which our strategy reflects.
2828  */
2829 #ifdef TRACE_SORT
2830  if (trace_sort)
2831  elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2832  "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2833  memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2834 #endif
2835 
2836  return true;
2837 }
2838 
2839 /*
2840  * Generic equalimage support function for character type's operator classes.
2841  * Disables the use of deduplication with nondeterministic collations.
 *
 * Uses the call's collation (PG_GET_COLLATION()), after
 * check_collation_set(), and returns a boolean.
 *
 * NOTE(review): the last operand of the condition below is not among the
 * lines visible here -- confirm against the full source.
2842  */
2843 Datum
2845 {
2846  /* Oid opcintype = PG_GETARG_OID(0); */
2847  Oid collid = PG_GET_COLLATION();
2848 
2849  check_collation_set(collid);
2850 
2851  if (lc_collate_is_c(collid) ||
2852  collid == DEFAULT_COLLATION_OID ||
2854  PG_RETURN_BOOL(true);
2855  else
2856  PG_RETURN_BOOL(false);
2857 }
2858 
/*
 * Returns argument 0 when text_cmp() under the call's collation reports it
 * greater than argument 1; otherwise returns argument 1.
 */
2859 Datum
2861 {
2862  text *arg1 = PG_GETARG_TEXT_PP(0);
2863  text *arg2 = PG_GETARG_TEXT_PP(1);
2864  text *result;
2865 
2866  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2867 
2868  PG_RETURN_TEXT_P(result);
2869 }
2870 
/*
 * Returns argument 0 when text_cmp() under the call's collation reports it
 * less than argument 1; otherwise returns argument 1.
 */
2871 Datum
2873 {
2874  text *arg1 = PG_GETARG_TEXT_PP(0);
2875  text *arg2 = PG_GETARG_TEXT_PP(1);
2876  text *result;
2877 
2878  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2879 
2880  PG_RETURN_TEXT_P(result);
2881 }
2882 
2883 
2884 /*
2885  * Cross-type comparison functions for types text and name.
2886  */
2887 
/*
 * Equality test between a name (argument 0) and a text (argument 1).
 *
 * With C_COLLATION_OID the values are equal iff their lengths match and
 * memcmp() finds no difference; with any other collation the answer is
 * whether varstr_cmp() returns 0.
 */
2888 Datum
2890 {
2891  Name arg1 = PG_GETARG_NAME(0);
2892  text *arg2 = PG_GETARG_TEXT_PP(1);
2893  size_t len1 = strlen(NameStr(*arg1));
2894  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2895  Oid collid = PG_GET_COLLATION();
2896  bool result;
2897 
2898  check_collation_set(collid);
2899 
2900  if (collid == C_COLLATION_OID)
2901  result = (len1 == len2 &&
2902  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2903  else
2904  result = (varstr_cmp(NameStr(*arg1), len1,
2905  VARDATA_ANY(arg2), len2,
2906  collid) == 0);
2907 
2908  PG_FREE_IF_COPY(arg2, 1);
2909 
2910  PG_RETURN_BOOL(result);
2911 }
2912 
/*
 * Equality test between a text (argument 0) and a name (argument 1).
 *
 * With C_COLLATION_OID the values are equal iff their lengths match and
 * memcmp() finds no difference; with any other collation the answer is
 * whether varstr_cmp() returns 0.
 */
2913 Datum
2915 {
2916  text *arg1 = PG_GETARG_TEXT_PP(0);
2917  Name arg2 = PG_GETARG_NAME(1);
2918  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2919  size_t len2 = strlen(NameStr(*arg2));
2920  Oid collid = PG_GET_COLLATION();
2921  bool result;
2922 
2923  check_collation_set(collid);
2924 
2925  if (collid == C_COLLATION_OID)
2926  result = (len1 == len2 &&
2927  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2928  else
2929  result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2930  NameStr(*arg2), len2,
2931  collid) == 0);
2932 
2933  PG_FREE_IF_COPY(arg1, 0);
2934 
2935  PG_RETURN_BOOL(result);
2936 }
2937 
/*
 * Inequality test between a name (argument 0) and a text (argument 1).
 *
 * The result is the negation of the equality test: with C_COLLATION_OID,
 * lengths differ or memcmp() finds a difference; with any other collation,
 * varstr_cmp() returns nonzero.
 */
2938 Datum
2940 {
2941  Name arg1 = PG_GETARG_NAME(0);
2942  text *arg2 = PG_GETARG_TEXT_PP(1);
2943  size_t len1 = strlen(NameStr(*arg1));
2944  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2945  Oid collid = PG_GET_COLLATION();
2946  bool result;
2947 
2948  check_collation_set(collid);
2949 
2950  if (collid == C_COLLATION_OID)
2951  result = !(len1 == len2 &&
2952  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2953  else
2954  result = !(varstr_cmp(NameStr(*arg1), len1,
2955  VARDATA_ANY(arg2), len2,
2956  collid) == 0);
2957 
2958  PG_FREE_IF_COPY(arg2, 1);
2959 
2960  PG_RETURN_BOOL(result);
2961 }
2962 
/*
 * Inequality test between a text (argument 0) and a name (argument 1).
 *
 * The result is the negation of the equality test: with C_COLLATION_OID,
 * lengths differ or memcmp() finds a difference; with any other collation,
 * varstr_cmp() returns nonzero.
 */
2963 Datum
2965 {
2966  text *arg1 = PG_GETARG_TEXT_PP(0);
2967  Name arg2 = PG_GETARG_NAME(1);
2968  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2969  size_t len2 = strlen(NameStr(*arg2));
2970  Oid collid = PG_GET_COLLATION();
2971  bool result;
2972 
2973  check_collation_set(collid);
2974 
2975  if (collid == C_COLLATION_OID)
2976  result = !(len1 == len2 &&
2977  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2978  else
2979  result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2980  NameStr(*arg2), len2,
2981  collid) == 0);
2982 
2983  PG_FREE_IF_COPY(arg1, 0);
2984 
2985  PG_RETURN_BOOL(result);
2986 }
2987 
/*
 * Three-way comparison of a name (argument 0) against a text (argument 1):
 * returns the int32 result of varstr_cmp() under the call's collation.
 */
2988 Datum
2990 {
2991  Name arg1 = PG_GETARG_NAME(0);
2992  text *arg2 = PG_GETARG_TEXT_PP(1);
2993  int32 result;
2994 
2995  result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2996  VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2997  PG_GET_COLLATION());
2998 
2999  PG_FREE_IF_COPY(arg2, 1);
3000 
3001  PG_RETURN_INT32(result);
3002 }
3003 
/*
 * Three-way comparison of a text (argument 0) against a name (argument 1):
 * returns the int32 result of varstr_cmp() under the call's collation.
 */
3004 Datum
3006 {
3007  text *arg1 = PG_GETARG_TEXT_PP(0);
3008  Name arg2 = PG_GETARG_NAME(1);
3009  int32 result;
3010 
3011  result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
3012  NameStr(*arg2), strlen(NameStr(*arg2)),
3013  PG_GET_COLLATION());
3014 
3015  PG_FREE_IF_COPY(arg1, 0);
3016 
3017  PG_RETURN_INT32(result);
3018 }
3019 
/*
 * CmpCall(cmpfunc): invoke the given comparison function through
 * DirectFunctionCall2Coll(), forwarding this call's collation and its first
 * two argument datums, and yield the result as an int32.  The macro is
 * #undef'd after the functions that follow.
 */
3020 #define CmpCall(cmpfunc) \
3021  DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
3022  PG_GET_COLLATION(), \
3023  PG_GETARG_DATUM(0), \
3024  PG_GETARG_DATUM(1)))
3025 
3026 Datum
3028 {
3030 }
3031 
3032 Datum
3034 {
3036 }
3037 
3038 Datum
3040 {
3042 }
3043 
3044 Datum
3046 {
3048 }
3049 
3050 Datum
3052 {
3054 }
3055 
3056 Datum
3058 {
3060 }
3061 
3062 Datum
3064 {
3066 }
3067 
3068 Datum
3070 {
3072 }
3073 
3074 #undef CmpCall
3075 
3076 
3077 /*
3078  * The following operators support character-by-character comparison
3079  * of text datums, to allow building indexes suitable for LIKE clauses.
3080  * Note that the regular texteq/textne comparison operators, and regular
3081  * support functions 1 and 2 with "C" collation are assumed to be
3082  * compatible with these!
3083  */
3084 
3085 static int
3087 {
3088  int result;
3089  int len1,
3090  len2;
3091 
3092  len1 = VARSIZE_ANY_EXHDR(arg1);
3093  len2 = VARSIZE_ANY_EXHDR(arg2);
3094 
3095  result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3096  if (result != 0)
3097  return result;
3098  else if (len1 < len2)
3099  return -1;
3100  else if (len1 > len2)
3101  return 1;
3102  else
3103  return 0;
3104 }
3105 
3106 
/*
 * Returns true when internal_text_pattern_compare() on arguments 0 and 1
 * yields a negative value.
 */
3107 Datum
3109 {
3110  text *arg1 = PG_GETARG_TEXT_PP(0);
3111  text *arg2 = PG_GETARG_TEXT_PP(1);
3112  int result;
3113 
3114  result = internal_text_pattern_compare(arg1, arg2);
3115 
3116  PG_FREE_IF_COPY(arg1, 0);
3117  PG_FREE_IF_COPY(arg2, 1);
3118 
3119  PG_RETURN_BOOL(result < 0);
3120 }
3121 
3122 
/*
 * Returns true when internal_text_pattern_compare() on arguments 0 and 1
 * yields a value less than or equal to zero.
 */
3123 Datum
3125 {
3126  text *arg1 = PG_GETARG_TEXT_PP(0);
3127  text *arg2 = PG_GETARG_TEXT_PP(1);
3128  int result;
3129 
3130  result = internal_text_pattern_compare(arg1, arg2);
3131 
3132  PG_FREE_IF_COPY(arg1, 0);
3133  PG_FREE_IF_COPY(arg2, 1);
3134 
3135  PG_RETURN_BOOL(result <= 0);
3136 }
3137 
3138 
/*
 * Returns true when internal_text_pattern_compare() on arguments 0 and 1
 * yields a value greater than or equal to zero.
 */
3139 Datum
3141 {
3142  text *arg1 = PG_GETARG_TEXT_PP(0);
3143  text *arg2 = PG_GETARG_TEXT_PP(1);
3144  int result;
3145 
3146  result = internal_text_pattern_compare(arg1, arg2);
3147 
3148  PG_FREE_IF_COPY(arg1, 0);
3149  PG_FREE_IF_COPY(arg2, 1);
3150 
3151  PG_RETURN_BOOL(result >= 0);
3152 }
3153 
3154 
/*
 * Returns true when internal_text_pattern_compare() on arguments 0 and 1
 * yields a positive value.
 */
3155 Datum
3157 {
3158  text *arg1 = PG_GETARG_TEXT_PP(0);
3159  text *arg2 = PG_GETARG_TEXT_PP(1);
3160  int result;
3161 
3162  result = internal_text_pattern_compare(arg1, arg2);
3163 
3164  PG_FREE_IF_COPY(arg1, 0);
3165  PG_FREE_IF_COPY(arg2, 1);
3166 
3167  PG_RETURN_BOOL(result > 0);
3168 }
3169 
3170 
/*
 * Returns the result of internal_text_pattern_compare() on arguments 0 and
 * 1 as an int32.
 */
3171 Datum
3173 {
3174  text *arg1 = PG_GETARG_TEXT_PP(0);
3175  text *arg2 = PG_GETARG_TEXT_PP(1);
3176  int result;
3177 
3178  result = internal_text_pattern_compare(arg1, arg2);
3179 
3180  PG_FREE_IF_COPY(arg1, 0);
3181  PG_FREE_IF_COPY(arg2, 1);
3182 
3183  PG_RETURN_INT32(result);
3184 }
3185 
3186 
/*
 * Sets up sort support by calling varstr_sortsupport() for TEXTOID with
 * C_COLLATION_OID.  The call is made with ssup->ssup_cxt as the current
 * memory context; the previous context is restored afterwards.
 *
 * NOTE(review): the declaration of "ssup" is not among the lines visible
 * here -- confirm where it comes from.
 */
3187 Datum
3189 {
3191  MemoryContext oldcontext;
3192 
3193  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3194 
3195  /* Use generic string SortSupport, forcing "C" collation */
3196  varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3197 
3198  MemoryContextSwitchTo(oldcontext);
3199 
3200  PG_RETURN_VOID();
3201 }
3202 
3203 
3204 /*-------------------------------------------------------------
3205  * byteaoctetlen
3206  *
3207  * get the number of bytes contained in an instance of type 'bytea'
 *
 * The argument is taken as a raw Datum with PG_GETARG_DATUM(), i.e. it is
 * not detoasted here.
3208  *-------------------------------------------------------------
3209  */
3210 Datum
3212 {
3213  Datum str = PG_GETARG_DATUM(0);
3214 
3215  /* We need not detoast the input at all */
3217 }
3218 
3219 /*
3220  * byteacat -
3221  * takes two bytea* and returns a bytea* that is the concatenation of
3222  * the two.
3223  *
3224  * Cloned from textcat and modified as required.
 *
 * Both arguments are fetched with PG_GETARG_BYTEA_PP().
3225  */
3226 Datum
3228 {
3229  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3230  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3231 
3233 }
3234 
3235 /*
3236  * bytea_catenate
3237  * Guts of byteacat(), broken out so it can be used by other functions
3238  *
3239  * Arguments can be in short-header form, but not compressed or out-of-line
3240  */
3241 static bytea *
3243 {
3244  bytea *result;
3245  int len1,
3246  len2,
3247  len;
3248  char *ptr;
3249 
3250  len1 = VARSIZE_ANY_EXHDR(t1);
3251  len2 = VARSIZE_ANY_EXHDR(t2);
3252 
3253  /* paranoia ... probably should throw error instead? */
3254  if (len1 < 0)
3255  len1 = 0;
3256  if (len2 < 0)
3257  len2 = 0;
3258 
3259  len = len1 + len2 + VARHDRSZ;
3260  result = (bytea *) palloc(len);
3261 
3262  /* Set size of result string... */
3263  SET_VARSIZE(result, len);
3264 
3265  /* Fill data field of result string... */
3266  ptr = VARDATA(result);
3267  if (len1 > 0)
3268  memcpy(ptr, VARDATA_ANY(t1), len1);
3269  if (len2 > 0)
3270  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3271 
3272  return result;
3273 }
3274 
/* Convert a C string to a bytea pointer by calling byteain() on it */
3275 #define PG_STR_GET_BYTEA(str_) \
3276  DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3277 
3278 /*
3279  * bytea_substr()
3280  * Return a substring starting at the specified position.
3281  * Cloned from text_substr and modified as required.
3282  *
3283  * Input:
3284  * - string
3285  * - starting position (is one-based)
3286  * - string length (optional)
3287  *
3288  * If the starting position is zero or less, then return from the start of the string
3289  * adjusting the length to be consistent with the "negative start" per SQL.
3290  * If the length is less than zero, an ERROR is thrown. If no third argument
3291  * (length) is provided, the length to the end of the string is assumed.
 *
 * This entry point forwards arguments 1 and 2 as int32 values and passes
 * false as the final ("length not specified") argument.
3292  */
3293 Datum
3295 {
3297  PG_GETARG_INT32(1),
3298  PG_GETARG_INT32(2),
3299  false));
3300 }
3301 
3302 /*
3303  * bytea_substr_no_len -
3304  * Wrapper to avoid opr_sanity failure due to
3305  * one function accepting a different number of args.
 *
 * Forwards argument 1 as the int32 start position, with -1 as the length
 * and true as the final ("length not specified") argument.
3306  */
3307 Datum
3309 {
3311  PG_GETARG_INT32(1),
3312  -1,
3313  true));
3314 }
3315 
3316 static bytea *
3318  int S,
3319  int L,
3320  bool length_not_specified)
3321 {
3322  int32 S1; /* adjusted start position */
3323  int32 L1; /* adjusted substring length */
3324  int32 E; /* end position */
3325 
3326  /*
3327  * The logic here should generally match text_substring().
3328  */
3329  S1 = Max(S, 1);
3330 
3331  if (length_not_specified)
3332  {
3333  /*
3334  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3335  * end of the string if we pass it a negative value for length.
3336  */
3337  L1 = -1;
3338  }
3339  else if (L < 0)
3340  {
3341  /* SQL99 says to throw an error for E < S, i.e., negative length */
3342  ereport(ERROR,
3343  (errcode(ERRCODE_SUBSTRING_ERROR),
3344  errmsg("negative substring length not allowed")));
3345  L1 = -1; /* silence stupider compilers */
3346  }
3347  else if (pg_add_s32_overflow(S, L, &E))
3348  {
3349  /*
3350  * L could be large enough for S + L to overflow, in which case the
3351  * substring must run to end of string.
3352  */
3353  L1 = -1;
3354  }
3355  else
3356  {
3357  /*
3358  * A zero or negative value for the end position can happen if the
3359  * start was negative or one. SQL99 says to return a zero-length
3360  * string.
3361  */
3362  if (E < 1)
3363  return PG_STR_GET_BYTEA("");
3364 
3365  L1 = E - S1;
3366  }
3367 
3368  /*
3369  * If the start position is past the end of the string, SQL99 says to
3370  * return a zero-length string -- DatumGetByteaPSlice() will do that for
3371  * us. We need only convert S1 to zero-based starting position.
3372  */
3373  return DatumGetByteaPSlice(str, S1 - 1, L1);
3374 }
3375 
3376 /*
3377  * byteaoverlay
3378  * Replace specified substring of first string with second
3379  *
3380  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3381  * This code is a direct implementation of what the standard says.
 *
 * Arguments 2 and 3 supply the start position and length of the substring
 * to replace; the work is done by bytea_overlay().
3382  */
3383 Datum
3385 {
3386  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3387  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3388  int sp = PG_GETARG_INT32(2); /* substring start position */
3389  int sl = PG_GETARG_INT32(3); /* substring length */
3390 
3391  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3392 }
3393 
/*
 * Variant of the overlay entry point without an explicit length argument:
 * the length of the replaced substring is taken to be the payload length of
 * the second argument, then bytea_overlay() does the work.
 */
3394 Datum
3396 {
3397  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3398  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3399  int sp = PG_GETARG_INT32(2); /* substring start position */
3400  int sl;
3401 
3402  sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3403  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3404 }
3405 
3406 static bytea *
3407 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3408 {
3409  bytea *result;
3410  bytea *s1;
3411  bytea *s2;
3412  int sp_pl_sl;
3413 
3414  /*
3415  * Check for possible integer-overflow cases. For negative sp, throw a
3416  * "substring length" error because that's what should be expected
3417  * according to the spec's definition of OVERLAY().
3418  */
3419  if (sp <= 0)
3420  ereport(ERROR,
3421  (errcode(ERRCODE_SUBSTRING_ERROR),
3422  errmsg("negative substring length not allowed")));
3423  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3424  ereport(ERROR,
3425  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3426  errmsg("integer out of range")));
3427 
3428  s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3429  s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3430  result = bytea_catenate(s1, t2);
3431  result = bytea_catenate(result, s2);
3432 
3433  return result;
3434 }
3435 
3436 /*
3437  * bit_count
3438  */
3439 Datum
3441 {
3442  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3443 
3445 }
3446 
3447 /*
3448  * byteapos -
3449  * Return the position of the specified substring.
3450  * Implements the SQL POSITION() function.
3451  * Cloned from textpos and modified as required.
3452  */
3453 Datum
3455 {
3456  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3457  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3458  int pos;
3459  int px,
3460  p;
3461  int len1,
3462  len2;
3463  char *p1,
3464  *p2;
3465 
3466  len1 = VARSIZE_ANY_EXHDR(t1);
3467  len2 = VARSIZE_ANY_EXHDR(t2);
3468 
3469  if (len2 <= 0)
3470  PG_RETURN_INT32(1); /* result for empty pattern */
3471 
3472  p1 = VARDATA_ANY(t1);
3473  p2 = VARDATA_ANY(t2);
3474 
3475  pos = 0;
3476  px = (len1 - len2);
3477  for (p = 0; p <= px; p++)
3478  {
3479  if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3480  {
3481  pos = p + 1;
3482  break;
3483  };
3484  p1++;
3485  };
3486 
3487  PG_RETURN_INT32(pos);
3488 }
3489 
3490 /*-------------------------------------------------------------
3491  * byteaGetByte
3492  *
3493  * this routine treats "bytea" as an array of bytes.
3494  * It returns the Nth byte (a number between 0 and 255).
3495  *-------------------------------------------------------------
3496  */
3497 Datum
3499 {
3500  bytea *v = PG_GETARG_BYTEA_PP(0);
3501  int32 n = PG_GETARG_INT32(1);
3502  int len;
3503  int byte;
3504 
3505  len = VARSIZE_ANY_EXHDR(v);
3506 
3507  if (n < 0 || n >= len)
3508  ereport(ERROR,
3509  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3510  errmsg("index %d out of valid range, 0..%d",
3511  n, len - 1)));
3512 
3513  byte = ((unsigned char *) VARDATA_ANY(v))[n];
3514 
3515  PG_RETURN_INT32(byte);
3516 }
3517 
3518 /*-------------------------------------------------------------
3519  * byteaGetBit
3520  *
3521  * This routine treats a "bytea" type like an array of bits.
3522  * It returns the value of the Nth bit (0 or 1).
3523  *
3524  *-------------------------------------------------------------
3525  */
3526 Datum
3528 {
3529  bytea *v = PG_GETARG_BYTEA_PP(0);
3530  int64 n = PG_GETARG_INT64(1);
3531  int byteNo,
3532  bitNo;
3533  int len;
3534  int byte;
3535 
3536  len = VARSIZE_ANY_EXHDR(v);
3537 
3538  if (n < 0 || n >= (int64) len * 8)
3539  ereport(ERROR,
3540  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3541  errmsg("index %lld out of valid range, 0..%lld",
3542  (long long) n, (long long) len * 8 - 1)));
3543 
3544  /* n/8 is now known < len, so safe to cast to int */
3545  byteNo = (int) (n / 8);
3546  bitNo = (int) (n % 8);
3547 
3548  byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3549 
3550  if (byte & (1 << bitNo))
3551  PG_RETURN_INT32(1);
3552  else
3553  PG_RETURN_INT32(0);
3554 }
3555 
3556 /*-------------------------------------------------------------
3557  * byteaSetByte
3558  *
3559  * Given an instance of type 'bytea' creates a new one with
3560  * the Nth byte set to the given value.
3561  *
3562  *-------------------------------------------------------------
3563  */
3564 Datum
3566 {
3567  bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3568  int32 n = PG_GETARG_INT32(1);
3569  int32 newByte = PG_GETARG_INT32(2);
3570  int len;
3571 
3572  len = VARSIZE(res) - VARHDRSZ;
3573 
3574  if (n < 0 || n >= len)
3575  ereport(ERROR,
3576  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3577  errmsg("index %d out of valid range, 0..%d",
3578  n, len - 1)));
3579 
3580  /*
3581  * Now set the byte.
3582  */
3583  ((unsigned char *) VARDATA(res))[n] = newByte;
3584 
3585  PG_RETURN_BYTEA_P(res);
3586 }
3587 
3588 /*-------------------------------------------------------------
3589  * byteaSetBit
3590  *
3591  * Given an instance of type 'bytea' creates a new one with
3592  * the Nth bit set to the given value.
3593  *
3594  *-------------------------------------------------------------
3595  */
3596 Datum
3598 {
3599  bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3600  int64 n = PG_GETARG_INT64(1);
3601  int32 newBit = PG_GETARG_INT32(2);
3602  int len;
3603  int oldByte,
3604  newByte;
3605  int byteNo,
3606  bitNo;
3607 
3608  len = VARSIZE(res) - VARHDRSZ;
3609 
3610  if (n < 0 || n >= (int64) len * 8)
3611  ereport(ERROR,
3612  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3613  errmsg("index %lld out of valid range, 0..%lld",
3614  (long long) n, (long long) len * 8 - 1)));
3615 
3616  /* n/8 is now known < len, so safe to cast to int */
3617  byteNo = (int) (n / 8);
3618  bitNo = (int) (n % 8);
3619 
3620  /*
3621  * sanity check!
3622  */
3623  if (newBit != 0 && newBit != 1)
3624  ereport(ERROR,
3625  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3626  errmsg("new bit must be 0 or 1")));
3627 
3628  /*
3629  * Update the byte.
3630  */
3631  oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3632 
3633  if (newBit == 0)
3634  newByte = oldByte & (~(1 << bitNo));
3635  else
3636  newByte = oldByte | (1 << bitNo);
3637 
3638  ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3639 
3640  PG_RETURN_BYTEA_P(res);
3641 }
3642 
3643 
3644 /* text_name()
3645  * Converts a text type to a Name type.
3646  */
3647 Datum
3649 {
3650  text *s = PG_GETARG_TEXT_PP(0);
3651  Name result;
3652  int len;
3653 
3654  len = VARSIZE_ANY_EXHDR(s);
3655 
3656  /* Truncate oversize input */
3657  if (len >= NAMEDATALEN)
3658  len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3659 
3660  /* We use palloc0 here to ensure result is zero-padded */
3661  result = (Name) palloc0(NAMEDATALEN);
3662  memcpy(NameStr(*result), VARDATA_ANY(s), len);
3663 
3664  PG_RETURN_NAME(result);
3665 }
3666 
3667 /* name_text()
3668  * Converts a Name type to a text type.
3669  */
3670 Datum
3672 {
3673  Name s = PG_GETARG_NAME(0);
3674 
3676 }
3677 
3678 
3679 /*
3680  * textToQualifiedNameList - convert a text object to list of names
3681  *
3682  * This implements the input parsing needed by nextval() and other
3683  * functions that take a text parameter representing a qualified name.
3684  * We split the name at dots, downcase if not double-quoted, and
3685  * truncate names if they're too long.
3686  */
3687 List *
3689 {
3690  char *rawname;
3691  List *result = NIL;
3692  List *namelist;
3693  ListCell *l;
3694 
3695  /* Convert to C string (handles possible detoasting). */
3696  /* Note we rely on being able to modify rawname below. */
3697  rawname = text_to_cstring(textval);
3698 
3699  if (!SplitIdentifierString(rawname, '.', &namelist))
3700  ereport(ERROR,
3701  (errcode(ERRCODE_INVALID_NAME),
3702  errmsg("invalid name syntax")));
3703 
3704  if (namelist == NIL)
3705  ereport(ERROR,
3706  (errcode(ERRCODE_INVALID_NAME),
3707  errmsg("invalid name syntax")));
3708 
3709  foreach(l, namelist)
3710  {
3711  char *curname = (char *) lfirst(l);
3712 
3713  result = lappend(result, makeString(pstrdup(curname)));
3714  }
3715 
3716  pfree(rawname);
3717  list_free(namelist);
3718 
3719  return result;
3720 }
3721 
3722 /*
3723  * SplitIdentifierString --- parse a string containing identifiers
3724  *
3725  * This is the guts of textToQualifiedNameList, and is exported for use in
3726  * other situations such as parsing GUC variables. In the GUC case, it's
3727  * important to avoid memory leaks, so the API is designed to minimize the
3728  * amount of stuff that needs to be allocated and freed.
3729  *
3730  * Inputs:
3731  * rawstring: the input string; must be overwritable! On return, it's
3732  * been modified to contain the separated identifiers.
3733  * separator: the separator punctuation expected between identifiers
3734  * (typically '.' or ','). Whitespace may also appear around
3735  * identifiers.
3736  * Outputs:
3737  * namelist: filled with a palloc'd list of pointers to identifiers within
3738  * rawstring. Caller should list_free() this even on error return.
3739  *
3740  * Returns true if okay, false if there is a syntax error in the string.
3741  *
3742  * Note that an empty string is considered okay here, though not in
3743  * textToQualifiedNameList.
3744  */
3745 bool
3746 SplitIdentifierString(char *rawstring, char separator,
3747  List **namelist)
3748 {
3749  char *nextp = rawstring;
3750  bool done = false;
3751 
3752  *namelist = NIL;
3753 
3754  while (scanner_isspace(*nextp))
3755  nextp++; /* skip leading whitespace */
3756 
3757  if (*nextp == '\0')
3758  return true; /* allow empty string */
3759 
3760  /* At the top of the loop, we are at start of a new identifier. */
3761  do
3762  {
3763  char *curname;
3764  char *endp;
3765 
3766  if (*nextp == '"')
3767  {
3768  /* Quoted name --- collapse quote-quote pairs, no downcasing */
3769  curname = nextp + 1;
3770  for (;;)
3771  {
3772  endp = strchr(nextp + 1, '"');
3773  if (endp == NULL)
3774  return false; /* mismatched quotes */
3775  if (endp[1] != '"')
3776  break; /* found end of quoted name */
3777  /* Collapse adjacent quotes into one quote, and look again */
3778  memmove(endp, endp + 1, strlen(endp));
3779  nextp = endp;
3780  }
3781  /* endp now points at the terminating quote */
3782  nextp = endp + 1;
3783  }
3784  else
3785  {
3786  /* Unquoted name --- extends to separator or whitespace */
3787  char *downname;
3788  int len;
3789 
3790  curname = nextp;
3791  while (*nextp && *nextp != separator &&
3792  !scanner_isspace(*nextp))
3793  nextp++;
3794  endp = nextp;
3795  if (curname == nextp)
3796  return false; /* empty unquoted name not allowed */
3797 
3798  /*
3799  * Downcase the identifier, using same code as main lexer does.
3800  *
3801  * XXX because we want to overwrite the input in-place, we cannot
3802  * support a downcasing transformation that increases the string
3803  * length. This is not a problem given the current implementation
3804  * of downcase_truncate_identifier, but we'll probably have to do
3805  * something about this someday.
3806  */
3807  len = endp - curname;
3808  downname = downcase_truncate_identifier(curname, len, false);
3809  Assert(strlen(downname) <= len);
3810  strncpy(curname, downname, len); /* strncpy is required here */
3811  pfree(downname);
3812  }
3813 
3814  while (scanner_isspace(*nextp))
3815  nextp++; /* skip trailing whitespace */
3816 
3817  if (*nextp == separator)
3818  {
3819  nextp++;
3820  while (scanner_isspace(*nextp))
3821  nextp++; /* skip leading whitespace for next */
3822  /* we expect another name, so done remains false */
3823  }
3824  else if (*nextp == '\0')
3825  done = true;
3826  else
3827  return false; /* invalid syntax */
3828 
3829  /* Now safe to overwrite separator with a null */
3830  *endp = '\0';
3831 
3832  /* Truncate name if it's overlength */
3833  truncate_identifier(curname, strlen(curname), false);
3834 
3835  /*
3836  * Finished isolating current name --- add it to list
3837  */
3838  *namelist = lappend(*namelist, curname);
3839 
3840  /* Loop back if we didn't reach end of string */
3841  } while (!done);
3842 
3843  return true;
3844 }
3845 
3846 
3847 /*
3848  * SplitDirectoriesString --- parse a string containing file/directory names
3849  *
3850  * This works fine on file names too; the function name is historical.
3851  *
3852  * This is similar to SplitIdentifierString, except that the parsing
3853  * rules are meant to handle pathnames instead of identifiers: there is
3854  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3855  * and we apply canonicalize_path() to each extracted string. Because of the
3856  * last, the returned strings are separately palloc'd rather than being
3857  * pointers into rawstring --- but we still scribble on rawstring.
3858  *
3859  * Inputs:
3860  * rawstring: the input string; must be modifiable!
3861  * separator: the separator punctuation expected between directories
3862  * (typically ',' or ';'). Whitespace may also appear around
3863  * directories.
3864  * Outputs:
3865  * namelist: filled with a palloc'd list of directory names.
3866  * Caller should list_free_deep() this even on error return.
3867  *
3868  * Returns true if okay, false if there is a syntax error in the string.
3869  *
3870  * Note that an empty string is considered okay here.
3871  */
3872 bool
3873 SplitDirectoriesString(char *rawstring, char separator,
3874  List **namelist)
3875 {
3876  char *nextp = rawstring;
3877  bool done = false;
3878 
3879  *namelist = NIL;
3880 
3881  while (scanner_isspace(*nextp))
3882  nextp++; /* skip leading whitespace */
3883 
3884  if (*nextp == '\0')
3885  return true; /* allow empty string */
3886 
3887  /* At the top of the loop, we are at start of a new directory. */
3888  do
3889  {
3890  char *curname;
3891  char *endp;
3892 
3893  if (*nextp == '"')
3894  {
3895  /* Quoted name --- collapse quote-quote pairs */
3896  curname = nextp + 1;
3897  for (;;)
3898  {
3899  endp = strchr(nextp + 1, '"');
3900  if (endp == NULL)
3901  return false; /* mismatched quotes */
3902  if (endp[1] != '"')
3903  break; /* found end of quoted name */
3904  /* Collapse adjacent quotes into one quote, and look again */
3905  memmove(endp, endp + 1, strlen(endp));
3906  nextp = endp;
3907  }
3908  /* endp now points at the terminating quote */
3909  nextp = endp + 1;
3910  }
3911  else
3912  {
3913  /* Unquoted name --- extends to separator or end of string */
3914  curname = endp = nextp;
3915  while (*nextp && *nextp != separator)
3916  {
3917  /* trailing whitespace should not be included in name */
3918  if (!scanner_isspace(*nextp))
3919  endp = nextp + 1;
3920  nextp++;
3921  }
3922  if (curname == endp)
3923  return false; /* empty unquoted name not allowed */
3924  }
3925 
3926  while (scanner_isspace(*nextp))
3927  nextp++; /* skip trailing whitespace */
3928 
3929  if (*nextp == separator)
3930  {
3931  nextp++;
3932  while (scanner_isspace(*nextp))
3933  nextp++; /* skip leading whitespace for next */
3934  /* we expect another name, so done remains false */
3935  }
3936  else if (*nextp == '\0')
3937  done = true;
3938  else
3939  return false; /* invalid syntax */
3940 
3941  /* Now safe to overwrite separator with a null */
3942  *endp = '\0';
3943 
3944  /* Truncate path if it's overlength */
3945  if (strlen(curname) >= MAXPGPATH)
3946  curname[MAXPGPATH - 1] = '\0';
3947 
3948  /*
3949  * Finished isolating current name --- add it to list
3950  */
3951  curname = pstrdup(curname);
3952  canonicalize_path(curname);
3953  *namelist = lappend(*namelist, curname);
3954 
3955  /* Loop back if we didn't reach end of string */
3956  } while (!done);
3957 
3958  return true;
3959 }
3960 
3961 
3962 /*
3963  * SplitGUCList --- parse a string containing identifiers or file names
3964  *
3965  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3966  * presuming whether the elements will be taken as identifiers or file names.
3967  * We assume the input has already been through flatten_set_variable_args(),
3968  * so that we need never downcase (if appropriate, that was done already).
3969  * Nor do we ever truncate, since we don't know the correct max length.
3970  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3971  * because any embedded whitespace should have led to double-quoting).
3972  * Otherwise the API is identical to SplitIdentifierString.
3973  *
3974  * XXX it's annoying to have so many copies of this string-splitting logic.
3975  * However, it's not clear that having one function with a bunch of option
3976  * flags would be much better.
3977  *
3978  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3979  * Be sure to update that if you have to change this.
3980  *
3981  * Inputs:
3982  * rawstring: the input string; must be overwritable! On return, it's
3983  * been modified to contain the separated identifiers.
3984  * separator: the separator punctuation expected between identifiers
3985  * (typically '.' or ','). Whitespace may also appear around
3986  * identifiers.
3987  * Outputs:
3988  * namelist: filled with a palloc'd list of pointers to identifiers within
3989  * rawstring. Caller should list_free() this even on error return.
3990  *
3991  * Returns true if okay, false if there is a syntax error in the string.
3992  */
3993 bool
3994 SplitGUCList(char *rawstring, char separator,
3995  List **namelist)
3996 {
3997  char *nextp = rawstring;
3998  bool done = false;
3999 
4000  *namelist = NIL;
4001 
4002  while (scanner_isspace(*nextp))
4003  nextp++; /* skip leading whitespace */
4004 
4005  if (*nextp == '\0')
4006  return true; /* allow empty string */
4007 
4008  /* At the top of the loop, we are at start of a new identifier. */
4009  do
4010  {
4011  char *curname;
4012  char *endp;
4013 
4014  if (*nextp == '"')
4015  {
4016  /* Quoted name --- collapse quote-quote pairs */
4017  curname = nextp + 1;
4018  for (;;)
4019  {
4020  endp = strchr(nextp + 1, '"');
4021  if (endp == NULL)
4022  return false; /* mismatched quotes */
4023  if (endp[1] != '"')
4024  break; /* found end of quoted name */
4025  /* Collapse adjacent quotes into one quote, and look again */
4026  memmove(endp, endp + 1, strlen(endp));
4027  nextp = endp;
4028  }
4029  /* endp now points at the terminating quote */
4030  nextp = endp + 1;
4031  }
4032  else
4033  {
4034  /* Unquoted name --- extends to separator or whitespace */
4035  curname = nextp;
4036  while (*nextp && *nextp != separator &&
4037  !scanner_isspace(*nextp))
4038  nextp++;
4039  endp = nextp;
4040  if (curname == nextp)
4041  return false; /* empty unquoted name not allowed */
4042  }
4043 
4044  while (scanner_isspace(*nextp))
4045  nextp++; /* skip trailing whitespace */
4046 
4047  if (*nextp == separator)
4048  {
4049  nextp++;
4050  while (scanner_isspace(*nextp))
4051  nextp++; /* skip leading whitespace for next */
4052  /* we expect another name, so done remains false */
4053  }
4054  else if (*nextp == '\0')
4055  done = true;
4056  else
4057  return false; /* invalid syntax */
4058 
4059  /* Now safe to overwrite separator with a null */
4060  *endp = '\0';
4061 
4062  /*
4063  * Finished isolating current name --- add it to list
4064  */
4065  *namelist = lappend(*namelist, curname);
4066 
4067  /* Loop back if we didn't reach end of string */
4068  } while (!done);
4069 
4070  return true;
4071 }
4072 
4073 
4074 /*****************************************************************************
4075  * Comparison Functions used for bytea
4076  *
4077  * Note: btree indexes need these routines not to leak memory; therefore,
4078  * be careful to free working copies of toasted datums. Most places don't
4079  * need to be so careful.
4080  *****************************************************************************/
4081 
4082 Datum
4084 {
4085  Datum arg1 = PG_GETARG_DATUM(0);
4086  Datum arg2 = PG_GETARG_DATUM(1);
4087  bool result;
4088  Size len1,
4089  len2;
4090 
4091  /*
4092  * We can use a fast path for unequal lengths, which might save us from
4093  * having to detoast one or both values.
4094  */
4095  len1 = toast_raw_datum_size(arg1);
4096  len2 = toast_raw_datum_size(arg2);
4097  if (len1 != len2)
4098  result = false;
4099  else
4100  {
4101  bytea *barg1 = DatumGetByteaPP(arg1);
4102  bytea *barg2 = DatumGetByteaPP(arg2);
4103 
4104  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4105  len1 - VARHDRSZ) == 0);
4106 
4107  PG_FREE_IF_COPY(barg1, 0);
4108  PG_FREE_IF_COPY(barg2, 1);
4109  }
4110 
4111  PG_RETURN_BOOL(result);
4112 }
4113 
4114 Datum
4116 {
4117  Datum arg1 = PG_GETARG_DATUM(0);
4118  Datum arg2 = PG_GETARG_DATUM(1);
4119  bool result;
4120  Size len1,
4121  len2;
4122 
4123  /*
4124  * We can use a fast path for unequal lengths, which might save us from
4125  * having to detoast one or both values.
4126  */
4127  len1 = toast_raw_datum_size(arg1);
4128  len2 = toast_raw_datum_size(arg2);
4129  if (len1 != len2)
4130  result = true;
4131  else
4132  {
4133  bytea *barg1 = DatumGetByteaPP(arg1);
4134  bytea *barg2 = DatumGetByteaPP(arg2);
4135 
4136  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4137  len1 - VARHDRSZ) != 0);
4138 
4139  PG_FREE_IF_COPY(barg1, 0);
4140  PG_FREE_IF_COPY(barg2, 1);
4141  }
4142 
4143  PG_RETURN_BOOL(result);
4144 }
4145 
4146 Datum
4148 {
4149  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4150  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4151  int len1,
4152  len2;
4153  int cmp;
4154 
4155  len1 = VARSIZE_ANY_EXHDR(arg1);
4156  len2 = VARSIZE_ANY_EXHDR(arg2);
4157 
4158  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4159 
4160  PG_FREE_IF_COPY(arg1, 0);
4161  PG_FREE_IF_COPY(arg2, 1);
4162 
4163  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4164 }
4165 
4166 Datum
4168 {
4169  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4170  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4171  int len1,
4172  len2;
4173  int cmp;
4174 
4175  len1 = VARSIZE_ANY_EXHDR(arg1);
4176  len2 = VARSIZE_ANY_EXHDR(arg2);
4177 
4178  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4179 
4180  PG_FREE_IF_COPY(arg1, 0);
4181  PG_FREE_IF_COPY(arg2, 1);
4182 
4183  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4184 }
4185 
4186 Datum
4188 {
4189  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4190  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4191  int len1,
4192  len2;
4193  int cmp;
4194 
4195  len1 = VARSIZE_ANY_EXHDR(arg1);
4196  len2 = VARSIZE_ANY_EXHDR(arg2);
4197 
4198  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4199 
4200  PG_FREE_IF_COPY(arg1, 0);
4201  PG_FREE_IF_COPY(arg2, 1);
4202 
4203  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4204 }
4205 
4206 Datum
4208 {
4209  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4210  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4211  int len1,
4212  len2;
4213  int cmp;
4214 
4215  len1 = VARSIZE_ANY_EXHDR(arg1);
4216  len2 = VARSIZE_ANY_EXHDR(arg2);
4217 
4218  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4219 
4220  PG_FREE_IF_COPY(arg1, 0);
4221  PG_FREE_IF_COPY(arg2, 1);
4222 
4223  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4224 }
4225 
4226 Datum
4228 {
4229  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4230  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4231  int len1,
4232  len2;
4233  int cmp;
4234 
4235  len1 = VARSIZE_ANY_EXHDR(arg1);
4236  len2 = VARSIZE_ANY_EXHDR(arg2);
4237 
4238  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4239  if ((cmp == 0) && (len1 != len2))
4240  cmp = (len1 < len2) ? -1 : 1;
4241 
4242  PG_FREE_IF_COPY(arg1, 0);
4243  PG_FREE_IF_COPY(arg2, 1);
4244 
4245  PG_RETURN_INT32(cmp);
4246 }
4247 
4248 Datum
4250 {
4252  MemoryContext oldcontext;
4253 
4254  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4255 
4256  /* Use generic string SortSupport, forcing "C" collation */
4257  varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4258 
4259  MemoryContextSwitchTo(oldcontext);
4260 
4261  PG_RETURN_VOID();
4262 }
4263 
4264 /*
4265  * appendStringInfoText
4266  *
4267  * Append a text to str.
4268  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4269  */
4270 static void
4272 {
4274 }
4275 
4276 /*
4277  * replace_text
4278  * replace all occurrences of 'old_sub_str' in 'orig_str'
4279  * with 'new_sub_str' to form 'new_str'
4280  *
4281  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4282  * otherwise returns 'new_str'
4283  */
4284 Datum
4286 {
4287  text *src_text = PG_GETARG_TEXT_PP(0);
4288  text *from_sub_text = PG_GETARG_TEXT_PP(1);
4289  text *to_sub_text = PG_GETARG_TEXT_PP(2);
4290  int src_text_len;
4291  int from_sub_text_len;
4293  text *ret_text;
4294  int chunk_len;
4295  char *curr_ptr;
4296  char *start_ptr;
4298  bool found;
4299 
4300  src_text_len = VARSIZE_ANY_EXHDR(src_text);
4301  from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4302 
4303  /* Return unmodified source string if empty source or pattern */
4304  if (src_text_len < 1 || from_sub_text_len < 1)
4305  {
4306  PG_RETURN_TEXT_P(src_text);
4307  }
4308 
4309  text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4310 
4311  found = text_position_next(&state);
4312 
4313  /* When the from_sub_text is not found, there is nothing to do. */
4314  if (!found)
4315  {
4316  text_position_cleanup(&state);
4317  PG_RETURN_TEXT_P(src_text);
4318  }
4319  curr_ptr = text_position_get_match_ptr(&state);
4320  start_ptr = VARDATA_ANY(src_text);
4321 
4322  initStringInfo(&str);
4323 
4324  do
4325  {
4327 
4328  /* copy the data skipped over by last text_position_next() */
4329  chunk_len = curr_ptr - start_ptr;
4330  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4331 
4332  appendStringInfoText(&str, to_sub_text);
4333 
4334  start_ptr = curr_ptr + from_sub_text_len;
4335 
4336  found = text_position_next(&state);
4337  if (found)
4338  curr_ptr = text_position_get_match_ptr(&state);
4339  }
4340  while (found);
4341 
4342  /* copy trailing data */
4343  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4344  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4345 
4346  text_position_cleanup(&state);
4347 
4348  ret_text = cstring_to_text_with_len(str.data, str.len);
4349  pfree(str.data);
4350 
4351  PG_RETURN_TEXT_P(ret_text);
4352 }
4353 
4354 /*
4355  * check_replace_text_has_escape
4356  *
4357  * Returns 0 if text contains no backslashes that need processing.
4358  * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4359  * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4360  */
4361 static int
4363 {
4364  int result = 0;
4365  const char *p = VARDATA_ANY(replace_text);
4366  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4367 
4368  while (p < p_end)
4369  {
4370  /* Find next escape char, if any. */
4371  p = memchr(p, '\\', p_end - p);
4372  if (p == NULL)
4373  break;
4374  p++;
4375  /* Note: a backslash at the end doesn't require extra processing. */
4376  if (p < p_end)
4377  {
4378  if (*p >= '1' && *p <= '9')
4379  return 2; /* Found a submatch specifier, so done */
4380  result = 1; /* Found some other sequence, keep looking */
4381  p++;
4382  }
4383  }
4384  return result;
4385 }
4386 
4387 /*
4388  * appendStringInfoRegexpSubstr
4389  *
4390  * Append replace_text to str, substituting regexp back references for
4391  * \n escapes. start_ptr is the start of the match in the source string,
4392  * at logical character position data_pos.
4393  */
4394 static void
4396  regmatch_t *pmatch,
4397  char *start_ptr, int data_pos)
4398 {
4399  const char *p = VARDATA_ANY(replace_text);
4400  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4401 
4402  while (p < p_end)
4403  {
4404  const char *chunk_start = p;
4405  int so;
4406  int eo;
4407 
4408  /* Find next escape char, if any. */
4409  p = memchr(p, '\\', p_end - p);
4410  if (p == NULL)
4411  p = p_end;
4412 
4413  /* Copy the text we just scanned over, if any. */
4414  if (p > chunk_start)
4415  appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4416 
4417  /* Done if at end of string, else advance over escape char. */
4418  if (p >= p_end)
4419  break;
4420  p++;
4421 
4422  if (p >= p_end)
4423  {
4424  /* Escape at very end of input. Treat same as unexpected char */
4425  appendStringInfoChar(str, '\\');
4426  break;
4427  }
4428 
4429  if (*p >= '1' && *p <= '9')
4430  {
4431  /* Use the back reference of regexp. */
4432  int idx = *p - '0';
4433 
4434  so = pmatch[idx].rm_so;
4435  eo = pmatch[idx].rm_eo;
4436  p++;
4437  }
4438  else if (*p == '&')
4439  {
4440  /* Use the entire matched string. */
4441  so = pmatch[0].rm_so;
4442  eo = pmatch[0].rm_eo;
4443  p++;
4444  }
4445  else if (*p == '\\')
4446  {
4447  /* \\ means transfer one \ to output. */
4448  appendStringInfoChar(str, '\\');
4449  p++;
4450  continue;
4451  }
4452  else
4453  {
4454  /*
4455  * If escape char is not followed by any expected char, just treat
4456  * it as ordinary data to copy. (XXX would it be better to throw
4457  * an error?)
4458  */
4459  appendStringInfoChar(str, '\\');
4460  continue;
4461  }
4462 
4463  if (so >= 0 && eo >= 0)
4464  {
4465  /*
4466  * Copy the text that is back reference of regexp. Note so and eo
4467  * are counted in characters not bytes.
4468  */
4469  char *chunk_start;
4470  int chunk_len;
4471 
4472  Assert(so >= data_pos);
4473  chunk_start = start_ptr;
4474  chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4475  chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4476  appendBinaryStringInfo(str, chunk_start, chunk_len);
4477  }
4478  }
4479 }
4480 
4481 /*
4482  * replace_text_regexp
4483  *
4484  * replace substring(s) in src_text that match pattern with replace_text.
4485  * The replace_text can contain backslash markers to substitute
4486  * (parts of) the matched text.
4487  *
4488  * cflags: regexp compile flags.
4489  * collation: collation to use.
4490  * search_start: the character (not byte) offset in src_text at which to
4491  * begin searching.
4492  * n: if 0, replace all matches; if > 0, replace only the N'th match.
4493  */
4494 text *
4495 replace_text_regexp(text *src_text, text *pattern_text,
4496  text *replace_text,
4497  int cflags, Oid collation,
4498  int search_start, int n)
4499 {
4500  text *ret_text;
4501  regex_t *re;
4502  int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4503  int nmatches = 0;
4505  regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
4506  int nmatch = lengthof(pmatch);
4507  pg_wchar *data;
4508  size_t data_len;
4509  int data_pos;
4510  char *start_ptr;
4511  int escape_status;
4512 
4513  initStringInfo(&buf);
4514 
4515  /* Convert data string to wide characters. */
4516  data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4517  data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4518 
4519  /* Check whether replace_text has escapes, especially regexp submatches. */
4520  escape_status = check_replace_text_has_escape(replace_text);
4521 
4522  /* If no regexp submatches, we can use REG_NOSUB. */
4523  if (escape_status < 2)
4524  {
4525  cflags |= REG_NOSUB;
4526  /* Also tell pg_regexec we only want the whole-match location. */
4527  nmatch = 1;
4528  }
4529 
4530  /* Prepare the regexp. */
4531  re = RE_compile_and_cache(pattern_text, cflags, collation);
4532 
4533  /* start_ptr points to the data_pos'th character of src_text */
4534  start_ptr = (char *) VARDATA_ANY(src_text);
4535  data_pos = 0;
4536 
4537  while (search_start <= data_len)
4538  {
4539  int regexec_result;
4540 
4542 
4543  regexec_result = pg_regexec(re,
4544  data,
4545  data_len,
4546  search_start,
4547  NULL, /* no details */
4548  nmatch,
4549  pmatch,
4550  0);
4551 
4552  if (regexec_result == REG_NOMATCH)
4553  break;
4554 
4555  if (regexec_result != REG_OKAY)
4556  {
4557  char errMsg[100];
4558 
4560  pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4561  ereport(ERROR,
4562  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4563  errmsg("regular expression failed: %s", errMsg)));
4564  }
4565 
4566  /*
4567  * Count matches, and decide whether to replace this match.
4568  */
4569  nmatches++;
4570  if (n > 0 && nmatches != n)
4571  {
4572  /*
4573  * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4574  * we treat the matched text as if it weren't matched, and copy it
4575  * to the output later.)
4576  */
4577  search_start = pmatch[0].rm_eo;
4578  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4579  search_start++;
4580  continue;
4581  }
4582 
4583  /*
4584  * Copy the text to the left of the match position. Note we are given
4585  * character not byte indexes.
4586  */
4587  if (pmatch[0].rm_so - data_pos > 0)
4588  {
4589  int chunk_len;
4590 
4591  chunk_len = charlen_to_bytelen(start_ptr,
4592  pmatch[0].rm_so - data_pos);
4593  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4594 
4595  /*
4596  * Advance start_ptr over that text, to avoid multiple rescans of
4597  * it if the replace_text contains multiple back-references.
4598  */
4599  start_ptr += chunk_len;
4600  data_pos = pmatch[0].rm_so;
4601  }
4602 
4603  /*
4604  * Copy the replace_text, processing escapes if any are present.
4605  */
4606  if (escape_status > 0)
4607  appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4608  start_ptr, data_pos);
4609  else
4610  appendStringInfoText(&buf, replace_text);
4611 
4612  /* Advance start_ptr and data_pos over the matched text. */
4613  start_ptr += charlen_to_bytelen(start_ptr,
4614  pmatch[0].rm_eo - data_pos);
4615  data_pos = pmatch[0].rm_eo;
4616 
4617  /*
4618  * If we only want to replace one occurrence, we're done.
4619  */
4620  if (n > 0)
4621  break;
4622 
4623  /*
4624  * Advance search position. Normally we start the next search at the
4625  * end of the previous match; but if the match was of zero length, we
4626  * have to advance by one character, or we'd just find the same match
4627  * again.
4628  */
4629  search_start = data_pos;
4630  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4631  search_start++;
4632  }
4633 
4634  /*
4635  * Copy the text to the right of the last match.
4636  */
4637  if (data_pos < data_len)
4638  {
4639  int chunk_len;
4640 
4641  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4642  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4643  }
4644 
4645  ret_text = cstring_to_text_with_len(buf.data, buf.len);
4646  pfree(buf.data);
4647  pfree(data);
4648 
4649  return ret_text;
4650 }
4651 
4652 /*
4653  * split_part
4654  * parse input string based on provided field separator
4655  * return N'th item (1 based, negative counts from end)
4656  */
4657 Datum
4659 {
4660  text *inputstring = PG_GETARG_TEXT_PP(0);
4661  text *fldsep = PG_GETARG_TEXT_PP(1);
4662  int fldnum = PG_GETARG_INT32(2);
4663  int inputstring_len;
4664  int fldsep_len;
4666  char *start_ptr;
4667  char *end_ptr;
4668  text *result_text;
4669  bool found;
4670 
4671  /* field number is 1 based */
4672  if (fldnum == 0)
4673  ereport(ERROR,
4674  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4675  errmsg("field position must not be zero")));
4676 
4677  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4678  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4679 
4680  /* return empty string for empty input string */
4681  if (inputstring_len < 1)
4683 
4684  /* handle empty field separator */
4685  if (fldsep_len < 1)
4686  {
4687  /* if first or last field, return input string, else empty string */
4688  if (fldnum == 1 || fldnum == -1)
4689  PG_RETURN_TEXT_P(inputstring);
4690  else
4692  }
4693 
4694  /* find the first field separator */
4695  text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4696 
4697  found = text_position_next(&state);
4698 
4699  /* special case if fldsep not found at all */
4700  if (!found)
4701  {
4702  text_position_cleanup(&state);
4703  /* if first or last field, return input string, else empty string */
4704  if (fldnum == 1 || fldnum == -1)
4705  PG_RETURN_TEXT_P(inputstring);
4706  else
4708  }
4709 
4710  /*
4711  * take care of a negative field number (i.e. count from the right) by
4712  * converting to a positive field number; we need total number of fields
4713  */
4714  if (fldnum < 0)
4715  {
4716  /* we found a fldsep, so there are at least two fields */
4717  int numfields = 2;
4718 
4719  while (text_position_next(&state))
4720  numfields++;
4721 
4722  /* special case of last field does not require an extra pass */
4723  if (fldnum == -1)
4724  {
4725  start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4726  end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4727  text_position_cleanup(&state);
4729  end_ptr - start_ptr));
4730  }
4731 
4732  /* else, convert fldnum to positive notation */
4733  fldnum += numfields + 1;
4734 
4735  /* if nonexistent field, return empty string */
4736  if (fldnum <= 0)
4737  {
4738  text_position_cleanup(&state);
4740  }
4741 
4742  /* reset to pointing at first match, but now with positive fldnum */
4743  text_position_reset(&state);
4744  found = text_position_next(&state);
4745  Assert(found);
4746  }
4747 
4748  /* identify bounds of first field */
4749  start_ptr = VARDATA_ANY(inputstring);
4750  end_ptr = text_position_get_match_ptr(&state);
4751 
4752  while (found && --fldnum > 0)
4753  {
4754  /* identify bounds of next field */
4755  start_ptr = end_ptr + fldsep_len;
4756  found = text_position_next(&state);
4757  if (found)
4758  end_ptr = text_position_get_match_ptr(&state);
4759  }
4760 
4761  text_position_cleanup(&state);
4762 
4763  if (fldnum > 0)
4764  {
4765  /* N'th field separator not found */
4766  /* if last field requested, return it, else empty string */
4767  if (fldnum == 1)
4768  {
4769  int last_len = start_ptr - VARDATA_ANY(inputstring);
4770 
4771  result_text = cstring_to_text_with_len(start_ptr,
4772  inputstring_len - last_len);
4773  }
4774  else
4775  result_text = cstring_to_text("");
4776  }
4777  else
4778  {
4779  /* non-last field requested */
4780  result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4781  }
4782 
4783  PG_RETURN_TEXT_P(result_text);
4784 }
4785 
4786 /*
4787  * Convenience function to return true when two text params are equal.
4788  */
4789 static bool
4790 text_isequal(text *txt1, text *txt2, Oid collid)
4791 {
4793  collid,
4794  PointerGetDatum(txt1),
4795  PointerGetDatum(txt2)));
4796 }
4797 
4798 /*
4799  * text_to_array
4800  * parse input string and return text array of elements,
4801  * based on provided field separator
4802  */
4803 Datum
4805 {
4806  SplitTextOutputData tstate;
4807 
4808  /* For array output, tstate should start as all zeroes */
4809  memset(&tstate, 0, sizeof(tstate));
4810 
4811  if (!split_text(fcinfo, &tstate))
4812  PG_RETURN_NULL();
4813 
4814  if (tstate.astate == NULL)
4816 
4819 }
4820 
4821 /*
4822  * text_to_array_null
4823  * parse input string and return text array of elements,
4824  * based on provided field separator and null string
4825  *
4826  * This is a separate entry point only to prevent the regression tests from
4827  * complaining about different argument sets for the same internal function.
4828  */
4829 Datum
4831 {
4832  return text_to_array(fcinfo);
4833 }
4834 
4835 /*
4836  * text_to_table
4837  * parse input string and return table of elements,
4838  * based on provided field separator
4839  */
4840 Datum
4842 {
4843  ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4844  SplitTextOutputData tstate;
4845  MemoryContext old_cxt;
4846 
4847  /* check to see if caller supports us returning a tuplestore */
4848  if (rsi == NULL || !IsA(rsi, ReturnSetInfo))
4849  ereport(ERROR,
4850  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4851  errmsg("set-valued function called in context that cannot accept a set")));
4852  if (!(rsi->allowedModes & SFRM_Materialize))
4853  ereport(ERROR,
4854  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4855  errmsg("materialize mode required, but it is not allowed in this context")));
4856 
4857  /* OK, prepare tuplestore in per-query memory */
4859 
4860  tstate.astate = NULL;
4861  tstate.tupdesc = CreateTupleDescCopy(rsi->expectedDesc);
4862  tstate.tupstore = tuplestore_begin_heap(true, false, work_mem);
4863 
4864  MemoryContextSwitchTo(old_cxt);
4865 
4866  (void) split_text(fcinfo, &tstate);
4867 
4868  tuplestore_donestoring(tstate.tupstore);
4869 
4871  rsi->setResult = tstate.tupstore;
4872  rsi->setDesc = tstate.tupdesc;
4873 
4874  return (Datum) 0;
4875 }
4876 
4877 /*
4878  * text_to_table_null
4879  * parse input string and return table of elements,
4880  * based on provided field separator and null string
4881  *
4882  * This is a separate entry point only to prevent the regression tests from
4883  * complaining about different argument sets for the same internal function.
4884  */
4885 Datum
4887 {
4888  return text_to_table(fcinfo);
4889 }
4890 
4891 /*
4892  * Common code for text_to_array, text_to_array_null, text_to_table
4893  * and text_to_table_null functions.
4894  *
4895  * These are not strict so we have to test for null inputs explicitly.
4896  * Returns false if result is to be null, else returns true.
4897  *
4898  * Note that if the result is valid but empty (zero elements), we return
4899  * without changing *tstate --- caller must handle that case, too.
4900  */
4901 static bool
4903 {
4904  text *inputstring;
4905  text *fldsep;
4906  text *null_string;
4907  Oid collation = PG_GET_COLLATION();
4908  int inputstring_len;
4909  int fldsep_len;
4910  char *start_ptr;
4911  text *result_text;
4912 
4913  /* when input string is NULL, then result is NULL too */
4914  if (PG_ARGISNULL(0))
4915  return false;
4916 
4917  inputstring = PG_GETARG_TEXT_PP(0);
4918 
4919  /* fldsep can be NULL */
4920  if (!PG_ARGISNULL(1))
4921  fldsep = PG_GETARG_TEXT_PP(1);
4922  else
4923  fldsep = NULL;
4924 
4925  /* null_string can be NULL or omitted */
4926  if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4927  null_string = PG_GETARG_TEXT_PP(2);
4928  else
4929  null_string = NULL;
4930 
4931  if (fldsep != NULL)
4932  {
4933  /*
4934  * Normal case with non-null fldsep. Use the text_position machinery
4935  * to search for occurrences of fldsep.
4936  */
4938 
4939  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4940  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4941 
4942  /* return empty set for empty input string */
4943  if (inputstring_len < 1)
4944  return true;
4945 
4946  /* empty field separator: return input string as a one-element set */
4947  if (fldsep_len < 1)
4948  {
4949  split_text_accum_result(tstate, inputstring,
4950  null_string, collation);
4951  return true;
4952  }
4953 
4954  text_position_setup(inputstring, fldsep, collation, &state);
4955 
4956  start_ptr = VARDATA_ANY(inputstring);
4957 
4958  for (;;)
4959  {
4960  bool found;
4961  char *end_ptr;
4962  int chunk_len;
4963 
4965 
4966  found = text_position_next(&state);
4967  if (!found)
4968  {
4969  /* fetch last field */
4970  chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4971  end_ptr = NULL; /* not used, but some compilers complain */
4972  }
4973  else
4974  {
4975  /* fetch non-last field */
4976  end_ptr = text_position_get_match_ptr(&state);
4977  chunk_len = end_ptr - start_ptr;
4978  }
4979 
4980  /* build a temp text datum to pass to split_text_accum_result */
4981  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4982 
4983  /* stash away this field */
4984  split_text_accum_result(tstate, result_text,
4985  null_string, collation);
4986 
4987  pfree(result_text);
4988 
4989  if (!found)
4990  break;
4991 
4992  start_ptr = end_ptr + fldsep_len;
4993  }
4994 
4995  text_position_cleanup(&state);
4996  }
4997  else
4998  {
4999  /*
5000  * When fldsep is NULL, each character in the input string becomes a
5001  * separate element in the result set. The separator is effectively
5002  * the space between characters.
5003  */
5004  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
5005 
5006  start_ptr = VARDATA_ANY(inputstring);
5007 
5008  while (inputstring_len > 0)
5009  {
5010  int chunk_len = pg_mblen(start_ptr);
5011 
5013 
5014  /* build a temp text datum to pass to split_text_accum_result */
5015  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
5016 
5017  /* stash away this field */
5018  split_text_accum_result(tstate, result_text,
5019  null_string, collation);
5020 
5021  pfree(result_text);
5022 
5023  start_ptr += chunk_len;
5024  inputstring_len -= chunk_len;
5025  }
5026  }
5027 
5028  return true;
5029 }
5030 
5031 /*
5032  * Add text item to result set (table or array).
5033  *
5034  * This is also responsible for checking to see if the item matches
5035  * the null_string, in which case we should emit NULL instead.
5036  */
5037 static void
5039  text *field_value,
5040  text *null_string,
5041  Oid collation)
5042 {
5043  bool is_null = false;
5044 
5045  if (null_string && text_isequal(field_value, null_string, collation))
5046  is_null = true;
5047 
5048  if (tstate->tupstore)
5049  {
5050  Datum values[1];
5051  bool nulls[1];
5052 
5053  values[0] = PointerGetDatum(field_value);
5054  nulls[0] = is_null;
5055 
5057  tstate->tupdesc,
5058  values,
5059  nulls);
5060  }
5061  else
5062  {
5063  tstate->astate = accumArrayResult(tstate->astate,
5064  PointerGetDatum(field_value),
5065  is_null,
5066  TEXTOID,
5068  }
5069 }
5070 
5071 /*
5072  * array_to_text
5073  * concatenate Cstring representation of input array elements
5074  * using provided field separator
5075  */
5076 Datum
5078 {
5080  char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5081 
5082  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
5083 }
5084 
5085 /*
5086  * array_to_text_null
5087  * concatenate Cstring representation of input array elements
5088  * using provided field separator and null string
5089  *
5090  * This version is not strict so we have to test for null inputs explicitly.
5091  */
5092 Datum
5094 {
5095  ArrayType *v;
5096  char *fldsep;
5097  char *null_string;
5098 
5099  /* returns NULL when first or second parameter is NULL */
5100  if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
5101  PG_RETURN_NULL();
5102 
5103  v = PG_GETARG_ARRAYTYPE_P(0);
5104  fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5105 
5106  /* NULL null string is passed through as a null pointer */
5107  if (!PG_ARGISNULL(2))
5108  null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5109  else
5110  null_string = NULL;
5111 
5112  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5113 }
5114 
5115 /*
5116  * common code for array_to_text and array_to_text_null functions
5117  */
5118 static text *
5120  const char *fldsep, const char *null_string)
5121 {
5122  text *result;
5123  int nitems,
5124  *dims,
5125  ndims;
5126  Oid element_type;
5127  int typlen;
5128  bool typbyval;
5129  char typalign;
5131  bool printed = false;
5132  char *p;
5133  bits8 *bitmap;
5134  int bitmask;
5135  int i;
5136  ArrayMetaState *my_extra;
5137 
5138  ndims = ARR_NDIM(v);
5139  dims = ARR_DIMS(v);
5140  nitems = ArrayGetNItems(ndims, dims);
5141 
5142  /* if there are no elements, return an empty string */
5143  if (nitems == 0)
5144  return cstring_to_text_with_len("", 0);
5145 
5146  element_type = ARR_ELEMTYPE(v);
5147  initStringInfo(&buf);
5148 
5149  /*
5150  * We arrange to look up info about element type, including its output
5151  * conversion proc, only once per series of calls, assuming the element
5152  * type doesn't change underneath us.
5153  */
5154  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5155  if (my_extra == NULL)
5156  {
5157  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5158  sizeof(ArrayMetaState));
5159  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5160  my_extra->element_type = ~element_type;
5161  }
5162 
5163  if (my_extra->element_type != element_type)
5164  {
5165  /*
5166  * Get info about element type, including its output conversion proc
5167  */
5168  get_type_io_data(element_type, IOFunc_output,
5169  &my_extra->typlen, &my_extra->typbyval,
5170  &my_extra->typalign, &my_extra->typdelim,
5171  &my_extra->typioparam, &my_extra->typiofunc);
5172  fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5173  fcinfo->flinfo->fn_mcxt);
5174  my_extra->element_type = element_type;
5175  }
5176  typlen = my_extra->typlen;
5177  typbyval = my_extra->typbyval;
5178  typalign = my_extra->typalign;
5179 
5180  p = ARR_DATA_PTR(v);
5181  bitmap = ARR_NULLBITMAP(v);
5182  bitmask = 1;
5183 
5184  for (i = 0; i < nitems; i++)
5185  {
5186  Datum itemvalue;
5187  char *value;
5188 
5189  /* Get source element, checking for NULL */
5190  if (bitmap && (*bitmap & bitmask) == 0)
5191  {
5192  /* if null_string is NULL, we just ignore null elements */
5193  if (null_string != NULL)
5194  {
5195  if (printed)
5196  appendStringInfo(&buf, "%s%s", fldsep, null_string);
5197  else
5198  appendStringInfoString(&buf, null_string);
5199  printed = true;
5200  }
5201  }
5202  else
5203  {
5204  itemvalue = fetch_att(p, typbyval, typlen);
5205 
5206  value = OutputFunctionCall(&my_extra->proc, itemvalue);
5207 
5208  if (printed)
5209  appendStringInfo(&buf, "%s%s", fldsep, value);
5210  else
5211  appendStringInfoString(&buf, value);
5212  printed = true;
5213 
5214  p = att_addlength_pointer(p, typlen, p);
5215  p = (char *) att_align_nominal(p, typalign);
5216  }
5217 
5218  /* advance bitmap pointer if any */
5219  if (bitmap)
5220  {
5221  bitmask <<= 1;
5222  if (bitmask == 0x100)
5223  {
5224  bitmap++;
5225  bitmask = 1;
5226  }
5227  }
5228  }
5229 
5230  result = cstring_to_text_with_len(buf.data, buf.len);
5231  pfree(buf.data);
5232 
5233  return result;
5234 }
5235 
5236 #define HEXBASE 16
5237 /*
5238  * Convert an int32 to a string containing a base 16 (hex) representation of
5239  * the number.
5240  */
5241 Datum
5243 {
5245  char *ptr;
5246  const char *digits = "0123456789abcdef";
5247  char buf[32]; /* bigger than needed, but reasonable */
5248 
5249  ptr = buf + sizeof(buf) - 1;
5250  *ptr = '\0';
5251 
5252  do
5253  {
5254  *--ptr = digits[value % HEXBASE];
5255  value /= HEXBASE;
5256  } while (ptr > buf && value);
5257 
5259 }
5260 
5261 /*
5262  * Convert an int64 to a string containing a base 16 (hex) representation of
5263  * the number.
5264  */
5265 Datum
5267 {
5268  uint64 value = (uint64) PG_GETARG_INT64(0);
5269  char *ptr;
5270  const char *digits = "0123456789abcdef";
5271  char buf[32]; /* bigger than needed, but reasonable */
5272 
5273  ptr = buf + sizeof(buf) - 1;
5274  *ptr = '\0';
5275 
5276  do
5277  {
5278  *--ptr = digits[value % HEXBASE];
5279  value /= HEXBASE;
5280  } while (ptr > buf && value);
5281 
5283 }
5284 
5285 /*
5286  * Return the size of a datum, possibly compressed
5287  *
5288  * Works on any data type
5289  */
5290 Datum
5292 {
5294  int32 result;
5295  int typlen;
5296 
5297  /* On first call, get the input type's typlen, and save at *fn_extra */
5298  if (fcinfo->flinfo->fn_extra == NULL)
5299  {
5300  /* Lookup the datatype of the supplied argument */
5301  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5302 
5303  typlen = get_typlen(argtypeid);
5304  if (typlen == 0) /* should not happen */
5305  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5306 
5307  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5308  sizeof(int));
5309  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5310  }
5311  else
5312  typlen = *((int *) fcinfo->flinfo->fn_extra);
5313 
5314  if (typlen == -1)
5315  {
5316  /* varlena type, possibly toasted */
5317  result = toast_datum_size(value);
5318  }
5319  else if (typlen == -2)
5320  {
5321  /* cstring */
5322  result = strlen(DatumGetCString(value)) + 1;
5323  }
5324  else
5325  {
5326  /* ordinary fixed-width type */
5327  result = typlen;
5328  }
5329 
5330  PG_RETURN_INT32(result);
5331 }
5332 
5333 /*
5334  * Return the compression method stored in the compressed attribute. Return
5335  * NULL for non varlena type or uncompressed data.
5336  */
5337 Datum
5339 {
5340  int typlen;
5341  char *result;
5342  ToastCompressionId cmid;
5343 
5344  /* On first call, get the input type's typlen, and save at *fn_extra */
5345  if (fcinfo->flinfo->fn_extra == NULL)
5346  {
5347  /* Lookup the datatype of the supplied argument */
5348  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5349 
5350  typlen = get_typlen(argtypeid);
5351  if (typlen == 0) /* should not happen */
5352  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5353 
5354  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5355  sizeof(int));
5356  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5357  }
5358  else
5359  typlen = *((int *) fcinfo->flinfo->fn_extra);
5360 
5361  if (typlen != -1)
5362  PG_RETURN_NULL();
5363 
5364  /* get the compression method id stored in the compressed varlena */
5365  cmid = toast_get_compression_id((struct varlena *)
5367  if (cmid == TOAST_INVALID_COMPRESSION_ID)
5368  PG_RETURN_NULL();
5369 
5370  /* convert compression method id to compression method name */
5371  switch (cmid)
5372  {
5374  result = "pglz";
5375  break;
5377  result = "lz4";
5378  break;
5379  default:
5380  elog(ERROR, "invalid compression method id %d", cmid);
5381  }
5382 
5384 }
5385 
5386 /*
5387  * string_agg - Concatenates values and returns string.
5388  *
5389  * Syntax: string_agg(value text, delimiter text) RETURNS text
5390  *
5391  * Note: Any NULL values are ignored. The first-call delimiter isn't
5392  * actually used at all, and on subsequent calls the delimiter precedes
5393  * the associated value.
5394  */
5395 
5396 /* subroutine to initialize state */
5397 static StringInfo
5399 {
5400  StringInfo state;
5401  MemoryContext aggcontext;
5402  MemoryContext oldcontext;
5403 
5404  if (!AggCheckCallContext(fcinfo, &aggcontext))
5405  {
5406  /* cannot be called directly because of internal-type argument */
5407  elog(ERROR, "string_agg_transfn called in non-aggregate context");
5408  }
5409 
5410  /*
5411  * Create state in aggregate context. It'll stay there across subsequent
5412  * calls.
5413  */
5414  oldcontext = MemoryContextSwitchTo(aggcontext);
5415  state = makeStringInfo();
5416  MemoryContextSwitchTo(oldcontext);
5417 
5418  return state;
5419 }
5420 
5421 Datum
5423 {
5424  StringInfo state;
5425 
5426  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5427 
5428  /* Append the value unless null. */
5429  if (!PG_ARGISNULL(1))
5430  {
5431  /* On the first time through, we ignore the delimiter. */
5432  if (state == NULL)
5433  state = makeStringAggState(fcinfo);
5434  else if (!PG_ARGISNULL(2))
5435  appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
5436 
5437  appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
5438  }
5439 
5440  /*
5441  * The transition type for string_agg() is declared to be "internal",
5442  * which is a pass-by-value type the same size as a pointer.
5443  */
5444  PG_RETURN_POINTER(state);
5445 }
5446 
5447 Datum
5449 {
5450  StringInfo state;
5451 
5452  /* cannot be called directly because of internal-type argument */
5453  Assert(AggCheckCallContext(fcinfo, NULL));
5454 
5455  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5456 
5457  if (state != NULL)
5459  else
5460  PG_RETURN_NULL();
5461 }
5462 
5463 /*
5464  * Prepare cache with fmgr info for the output functions of the datatypes of
5465  * the arguments of a concat-like function, beginning with argument "argidx".
5466  * (Arguments before that will have corresponding slots in the resulting
5467  * FmgrInfo array, but we don't fill those slots.)
5468  */
5469 static FmgrInfo *
5471 {
5472  FmgrInfo *foutcache;
5473  int i;
5474 
5475  /* We keep the info in fn_mcxt so it survives across calls */
5476  foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5477  PG_NARGS() * sizeof(FmgrInfo));
5478 
5479  for (i = argidx; i < PG_NARGS(); i++)
5480  {
5481  Oid valtype;
5482  Oid typOutput;
5483  bool typIsVarlena;
5484 
5485  valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5486  if (!OidIsValid(valtype))
5487  elog(ERROR, "could not determine data type of concat() input");
5488 
5489  getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5490  fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5491  }
5492 
5493  fcinfo->flinfo->fn_extra = foutcache;
5494 
5495  return foutcache;
5496 }
5497 
5498 /*
5499  * Implementation of both concat() and concat_ws().
5500  *
5501  * sepstr is the separator string to place between values.
5502  * argidx identifies the first argument to concatenate (counting from zero);
5503  * note that this must be constant across any one series of calls.
5504  *
5505  * Returns NULL if result should be NULL, else text value.
5506  */
5507 static text *
5508 concat_internal(const char *sepstr, int argidx,
5509  FunctionCallInfo fcinfo)
5510 {
5511  text *result;
5513  FmgrInfo *foutcache;
5514  bool first_arg = true;
5515  int i;
5516 
5517  /*
5518  * concat(VARIADIC some-array) is essentially equivalent to
5519  * array_to_text(), ie concat the array elements with the given separator.
5520  * So we just pass the case off to that code.
5521  */
5522  if (get_fn_expr_variadic(fcinfo->flinfo))
5523  {
5524  ArrayType *arr;
5525 
5526  /* Should have just the one argument */
5527  Assert(argidx == PG_NARGS() - 1);
5528 
5529  /* concat(VARIADIC NULL) is defined as NULL */
5530  if (PG_ARGISNULL(argidx))
5531  return NULL;
5532 
5533  /*
5534  * Non-null argument had better be an array. We assume that any call
5535  * context that could let get_fn_expr_variadic return true will have
5536  * checked that a VARIADIC-labeled parameter actually is an array. So
5537  * it should be okay to just Assert that it's an array rather than
5538  * doing a full-fledged error check.
5539  */
5541 
5542  /* OK, safe to fetch the array value */
5543  arr = PG_GETARG_ARRAYTYPE_P(argidx);
5544 
5545  /*
5546  * And serialize the array. We tell array_to_text to ignore null
5547  * elements, which matches the behavior of the loop below.
5548  */
5549  return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5550  }
5551 
5552  /* Normal case without explicit VARIADIC marker */
5553  initStringInfo(&str);
5554 
5555  /* Get output function info, building it if first time through */
5556  foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5557  if (foutcache == NULL)
5558  foutcache = build_concat_foutcache(fcinfo, argidx);
5559 
5560  for (i = argidx; i < PG_NARGS(); i++)
5561  {
5562  if (!PG_ARGISNULL(i))
5563  {
5565 
5566  /* add separator if appropriate */
5567  if (first_arg)
5568  first_arg = false;
5569  else
5570  appendStringInfoString(&str, sepstr);
5571 
5572  /* call the appropriate type output function, append the result */
5574  OutputFunctionCall(&foutcache[i], value));
5575  }
5576  }
5577 
5578  result = cstring_to_text_with_len(str.data, str.len);
5579  pfree(str.data);
5580 
5581  return result;
5582 }
5583 
5584 /*
5585  * Concatenate all arguments. NULL arguments are ignored.
5586  */
5587 Datum
5589 {
5590  text *result;
5591 
5592  result = concat_internal("", 0, fcinfo);
5593  if (result == NULL)
5594  PG_RETURN_NULL();
5595  PG_RETURN_TEXT_P(result);
5596 }
5597 
5598 /*
5599  * Concatenate all but first argument value with separators. The first
5600  * parameter is used as the separator. NULL arguments are ignored.
5601  */
5602 Datum
5604 {
5605  char *sep;
5606  text *result;
5607 
5608  /* return NULL when separator is NULL */
5609  if (PG_ARGISNULL(0))
5610  PG_RETURN_NULL();
5612 
5613  result = concat_internal(sep, 1, fcinfo);
5614  if (result == NULL)
5615  PG_RETURN_NULL();
5616  PG_RETURN_TEXT_P(result);
5617 }
5618 
5619 /*
5620  * Return first n characters in the string. When n is negative,
5621  * return all but last |n| characters.
5622  */
5623 Datum
5625 {
5626  int n = PG_GETARG_INT32(1);
5627 
5628  if (n < 0)
5629  {
5630  text *str = PG_GETARG_TEXT_PP(0);
5631  const char *p = VARDATA_ANY(str);
5632  int len = VARSIZE_ANY_EXHDR(str);
5633  int rlen;
5634 
5635  n = pg_mbstrlen_with_len(p, len) + n;
5636  rlen = pg_mbcharcliplen(p, len, n);
5638  }
5639  else
5641 }
5642 
5643 /*
5644  * Return last n characters in the string. When n is negative,
5645  * return all but first |n| characters.
5646  */
5647 Datum
5649 {
5650  text *str = PG_GETARG_TEXT_PP(0);
5651  const char *p = VARDATA_ANY(str);
5652  int len = VARSIZE_ANY_EXHDR(str);
5653  int n = PG_GETARG_INT32(1);
5654  int off;
5655 
5656  if (n < 0)
5657  n = -n;
5658  else
5659  n = pg_mbstrlen_with_len(p, len) - n;
5660  off = pg_mbcharcliplen(p, len, n);
5661 
5662  PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5663 }
5664 
5665 /*
5666  * Return reversed string
5667  */
5668 Datum
5670 {
5671  text *str = PG_GETARG_TEXT_PP(0);
5672  const char *p = VARDATA_ANY(str);
5673  int len = VARSIZE_ANY_EXHDR(str);
5674  const char *endp = p + len;
5675  text *result;
5676  char *dst;
5677 
5678  result = palloc(len + VARHDRSZ);
5679  dst = (char *) VARDATA(result) + len;
5680  SET_VARSIZE(result, len + VARHDRSZ);
5681 
5683  {
5684  /* multibyte version */
5685  while (p < endp)
5686  {
5687  int sz;
5688 
5689  sz = pg_mblen(p);
5690  dst -= sz;
5691  memcpy(dst, p, sz);
5692  p += sz;
5693  }
5694  }
5695  else
5696  {
5697  /* single byte version */
5698  while (p < endp)
5699  *(--dst) = *p++;
5700  }
5701 
5702  PG_RETURN_TEXT_P(result);
5703 }
5704 
5705 
/*
 * Support macros for text_format()
 */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * Step "ptr" forward by one; if that reaches or passes "end_ptr", raise
 * an "unterminated format() type specifier" error.  "ptr" is modified in
 * place, so it must be an lvalue.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5719 
5720 /*
5721  * Returns a formatted string
5722  */
5723 Datum
5725 {
5726  text *fmt;
5728  const char *cp;
5729  const char *start_ptr;
5730  const char *end_ptr;
5731  text *result;
5732  int arg;
5733  bool funcvariadic;
5734  int nargs;
5735  Datum *elements = NULL;
5736  bool *nulls = NULL;
5737  Oid element_type = InvalidOid;
5738  Oid prev_type = InvalidOid;
5739  Oid prev_width_type = InvalidOid;
5740  FmgrInfo typoutputfinfo;
5741  FmgrInfo typoutputinfo_width;
5742 
5743  /* When format string is null, immediately return null */
5744  if (PG_ARGISNULL(0))
5745  PG_RETURN_NULL();
5746 
5747  /* If argument is marked VARIADIC, expand array into elements */
5748  if (get_fn_expr_variadic(fcinfo->flinfo))
5749  {
5750  ArrayType *arr;
5751  int16 elmlen;
5752  bool elmbyval;
5753  char elmalign;
5754  int nitems;
5755 
5756  /* Should have just the one argument */
5757  Assert(PG_NARGS() == 2);
5758 
5759  /* If argument is NULL, we treat it as zero-length array */
5760  if (PG_ARGISNULL(1))
5761  nitems = 0;
5762  else
5763  {
5764  /*
5765  * Non-null argument had better be an array. We assume that any
5766  * call context that could let get_fn_expr_variadic return true
5767  * will have checked that a VARIADIC-labeled parameter actually is
5768  * an array. So it should be okay to just Assert that it's an
5769  * array rather than doing a full-fledged error check.
5770  */
5772 
5773  /* OK, safe to fetch the array value */
5774  arr = PG_GETARG_ARRAYTYPE_P(1);
5775 
5776  /* Get info about array element type */
5777  element_type = ARR_ELEMTYPE(arr);
5778  get_typlenbyvalalign(element_type,
5779  &elmlen, &elmbyval, &elmalign);
5780 
5781  /* Extract all array elements */
5782  deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5783  &elements, &nulls, &nitems);
5784  }
5785 
5786  nargs = nitems + 1;
5787  funcvariadic = true;
5788  }
5789  else
5790  {
5791  /* Non-variadic case, we'll process the arguments individually */
5792  nargs = PG_NARGS();
5793  funcvariadic = false;
5794  }
5795 
5796  /* Setup for main loop. */
5797  fmt = PG_GETARG_TEXT_PP(0);
5798  start_ptr = VARDATA_ANY(fmt);
5799  end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5800  initStringInfo(&str);
5801  arg = 1; /* next argument position to print */
5802 
5803  /* Scan format string, looking for conversion specifiers. */
5804  for (cp = start_ptr; cp < end_ptr; cp++)
5805  {
5806  int argpos;
5807  int widthpos;
5808  int flags;
5809  int width;
5810  Datum value;
5811  bool isNull;
5812  Oid typid;
5813 
5814  /*
5815  * If it's not the start of a conversion specifier, just copy it to
5816  * the output buffer.
5817  */
5818  if (*cp != '%')
5819  {
5820  appendStringInfoCharMacro(&str, *cp);
5821  continue;
5822  }
5823 
5824  ADVANCE_PARSE_POINTER(cp, end_ptr);
5825 
5826  /* Easy case: %% outputs a single % */
5827  if (*cp == '%')
5828  {
5829  appendStringInfoCharMacro(&str, *cp);
5830  continue;
5831  }
5832 
5833  /* Parse the optional portions of the format specifier */
5834  cp = text_format_parse_format(cp, end_ptr,
5835  &argpos, &widthpos,
5836  &flags, &width);
5837 
5838  /*
5839  * Next we should see the main conversion specifier. Whether or not
5840  * an argument position was present, it's known that at least one
5841  * character remains in the string at this point. Experience suggests
5842  * that it's worth checking that that character is one of the expected
5843  * ones before we try to fetch arguments, so as to produce the least
5844  * confusing response to a mis-formatted specifier.
5845  */
5846  if (strchr("sIL", *cp) == NULL)
5847  ereport(ERROR,
5848  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5849  errmsg("unrecognized format() type specifier \"%.*s\"",
5850  pg_mblen(cp), cp),
5851  errhint("For a single \"%%\" use \"%%%%\".")));
5852 
5853  /* If indirect width was specified, get its value */
5854  if (widthpos >= 0)
5855  {
5856  /* Collect the specified or next argument position */
5857  if (widthpos > 0)
5858  arg = widthpos;
5859  if (arg >= nargs)
5860  ereport(ERROR,
5861  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5862  errmsg("too few arguments for format()")));
5863 
5864  /* Get the value and type of the selected argument */
5865  if (!funcvariadic)
5866  {
5867  value = PG_GETARG_DATUM(arg);
5868  isNull = PG_ARGISNULL(arg);
5869  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5870  }
5871  else
5872  {
5873  value = elements[arg - 1];
5874  isNull = nulls[arg - 1];
5875  typid = element_type;
5876  }
5877  if (!OidIsValid(typid))
5878  elog(ERROR, "could not determine data type of format() input");
5879 
5880  arg++;
5881 
5882  /* We can treat NULL width the same as zero */
5883  if (isNull)
5884  width = 0;
5885  else if (typid == INT4OID)
5886  width = DatumGetInt32(value);
5887  else if (typid == INT2OID)
5888  width = DatumGetInt16(value);
5889  else
5890  {
5891  /* For less-usual datatypes, convert to text then to int */
5892  char *str;
5893 
5894  if (typid != prev_width_type)
5895  {
5896  Oid typoutputfunc;
5897  bool typIsVarlena;
5898 
5899  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5900  fmgr_info(typoutputfunc, &typoutputinfo_width);
5901  prev_width_type = typid;
5902  }
5903 
5904  str = OutputFunctionCall(&typoutputinfo_width, value);
5905 
5906  /* pg_strtoint32 will complain about bad data or overflow */
5907  width = pg_strtoint32(str);
5908 
5909  pfree(str);
5910  }
5911  }
5912 
5913  /* Collect the specified or next argument position */
5914  if (argpos > 0)
5915  arg = argpos;
5916  if (arg >= nargs)
5917  ereport(ERROR,
5918  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5919  errmsg("too few arguments for format()")));
5920 
5921  /* Get the value and type of the selected argument */
5922  if (!funcvariadic)
5923  {
5924  value = PG_GETARG_DATUM(arg);
5925  isNull = PG_ARGISNULL(arg);
5926  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5927  }
5928  else
5929  {
5930  value = elements[arg - 1];
5931  isNull = nulls[arg - 1];
5932  typid = element_type;
5933  }
5934  if (!OidIsValid(typid))
5935  elog(ERROR, "could not determine data type of format() input");
5936 
5937  arg++;
5938 
5939  /*
5940  * Get the appropriate typOutput function, reusing previous one if
5941  * same type as previous argument. That's particularly useful in the
5942  * variadic-array case, but often saves work even for ordinary calls.
5943  */
5944  if (typid != prev_type)
5945  {
5946  Oid typoutputfunc;
5947  bool typIsVarlena;
5948 
5949  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5950  fmgr_info(typoutputfunc, &typoutputfinfo);
5951  prev_type = typid;
5952  }
5953 
5954  /*
5955  * And now we can format the value.
5956  */
5957  switch (*cp)
5958  {
5959  case 's':
5960  case 'I':
5961  case 'L':
5962  text_format_string_conversion(&str, *cp, &typoutputfinfo,
5963  value, isNull,
5964  flags, width);
5965  break;
5966  default:
5967  /* should not get here, because of previous check */
5968  ereport(ERROR,
5969  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5970  errmsg("unrecognized format() type specifier \"%.*s\"",
5971  pg_mblen(cp), cp),
5972  errhint("For a single \"%%\" use \"%%%%\".")));
5973  break;
5974  }
5975  }
5976 
5977  /* Don't need deconstruct_array results anymore. */
5978  if (elements != NULL)
5979  pfree(elements);
5980  if (nulls != NULL)
5981  pfree(nulls);
5982 
5983  /* Generate results. */
5984  result = cstring_to_text_with_len(str.data, str.len);
5985  pfree(str.data);
5986 
5987  PG_RETURN_TEXT_P(result);
5988 }
5989 
5990 /*
5991  * Parse contiguous digits as a decimal number.
5992  *
5993  * Returns true if some digits could be parsed.
5994  * The value is returned into *value, and *ptr is advanced to the next
5995  * character to be parsed.
5996  *
5997  * Note parsing invariant: at least one character is known available before
5998  * string end (end_ptr) at entry, and this is still true at exit.
5999  */
6000 static bool
6001 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
6002 {
6003  bool found = false;
6004  const char *cp = *ptr;
6005  int val = 0;
6006 
6007  while (*cp >= '0' && *cp <= '9')
6008  {
6009  int8 digit = (*cp - '0');
6010 
6011  if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
6012  unlikely(pg_add_s32_overflow(val, digit, &val)))
6013  ereport(ERROR,
6014  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6015  errmsg("number is out of range")));
6016  ADVANCE_PARSE_POINTER(cp, end_ptr);
6017  found = true;
6018  }
6019 
6020  *ptr = cp;
6021  *value = val;
6022 
6023  return found;
6024 }
6025 
6026 /*
6027  * Parse a format specifier (generally following the SUS printf spec).
6028  *
6029  * We have already advanced over the initial '%', and we are looking for
6030  * [argpos][flags][width]type (but the type character is not consumed here).
6031  *
6032  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
6033  * Output parameters:
6034  * argpos: argument position for value to be printed. -1 means unspecified.
6035  * widthpos: argument position for width. Zero means the argument position
6036  * was unspecified (ie, take the next arg) and -1 means no width
6037  * argument (width was omitted or specified as a constant).
6038  * flags: bitmask of flags.
6039  * width: directly-specified width value. Zero means the width was omitted
6040  * (note it's not necessary to distinguish this case from an explicit
6041  * zero width value).
6042  *
6043  * The function result is the next character position to be parsed, ie, the
6044  * location where the type character is/should be.
6045  *
6046  * Note parsing invariant: at least one character is known available before
6047  * string end (end_ptr) at entry, and this is still true at exit.
6048  */
6049 static const char *
6050 text_format_parse_format(const char *start_ptr, const char *end_ptr,
6051  int *argpos, int *widthpos,
6052  int *flags, int *width)
6053 {
6054  const char *cp = start_ptr;
6055  int n;
6056 
6057  /* set defaults for output parameters */
6058  *argpos = -1;
6059  *widthpos = -1;
6060  *flags = 0;
6061  *width = 0;
6062 
6063  /* try to identify first number */
6064  if (text_format_parse_digits(&cp, end_ptr, &n))
6065  {
6066  if (*cp != '$')
6067  {
6068  /* Must be just a width and a type, so we're done */
6069  *width = n;
6070  return cp;
6071  }
6072  /* The number was argument position */
6073  *argpos = n;
6074  /* Explicit 0 for argument index is immediately refused */
6075  if (n == 0)
6076  ereport(ERROR,
6077  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6078  errmsg("format specifies argument 0, but arguments are numbered from 1")));
6079  ADVANCE_PARSE_POINTER(cp, end_ptr);
6080  }
6081 
6082  /* Handle flags (only minus is supported now) */
6083  while (*cp == '-')
6084  {
6085  *flags |= TEXT_FORMAT_FLAG_MINUS;
6086  ADVANCE_PARSE_POINTER(cp, end_ptr);
6087  }
6088 
6089  if (*cp == '*')
6090  {
6091  /* Handle indirect width */
6092  ADVANCE_PARSE_POINTER(cp, end_ptr);
6093  if (text_format_parse_digits(&cp, end_ptr, &n))
6094  {
6095  /* number in this position must be closed by $ */
6096  if (*cp != '$')
6097  ereport(ERROR,
6098  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6099  errmsg("width argument position must be ended by \"$\"")));
6100  /* The number was width argument position */
6101  *widthpos = n;
6102  /* Explicit 0 for argument index is immediately refused */
6103  if (n == 0)
6104  ereport(ERROR,
6105  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6106  errmsg("format specifies argument 0, but arguments are numbered from 1")));
6107  ADVANCE_PARSE_POINTER(cp, end_ptr);
6108  }
6109  else
6110  *widthpos = 0; /* width's argument position is unspecified */
6111  }
6112  else
6113  {
6114  /* Check for direct width specification */
6115  if (text_format_parse_digits(&cp, end_ptr, &n))
6116  *width = n;
6117  }
6118 
6119  /* cp should now be pointing at type character */
6120  return cp;
6121 }
6122 
6123 /*
6124  * Format a %s, %I, or %L conversion
6125  */
6126 static void
6128  FmgrInfo *typOutputInfo,
6129  Datum value, bool isNull,
6130  int flags, int width)
6131 {
6132  char *str;
6133 
6134  /* Handle NULL arguments before trying to stringify the value. */
6135  if (isNull)
6136  {
6137  if (conversion == 's')
6138  text_format_append_string(buf, "", flags, width);
6139  else if (conversion == 'L')
6140  text_format_append_string(buf, "NULL", flags, width);
6141  else if (conversion == 'I')
6142  ereport(ERROR,
6143  (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6144  errmsg("null values cannot be formatted as an SQL identifier")));
6145  return;
6146  }
6147 
6148  /* Stringify. */
6149  str = OutputFunctionCall(typOutputInfo, value);
6150 
6151  /* Escape. */
6152  if (conversion == 'I')
6153  {
6154  /* quote_identifier may or may not allocate a new string. */
6155  text_format_append_string(buf, quote_identifier(str), flags, width);
6156  }
6157  else if (conversion == 'L')
6158  {
6159  char *qstr = quote_literal_cstr(str);
6160 
6161  text_format_append_string(buf, qstr, flags, width);
6162  /* quote_literal_cstr() always allocates a new string */
6163  pfree(qstr);
6164  }
6165  else
6166  text_format_append_string(buf, str, flags, width);
6167 
6168  /* Cleanup. */
6169  pfree(str);
6170 }
6171 
6172 /*
6173  * Append str to buf, padding as directed by flags/width
6174  */
6175 static void
6177  int flags, int width)
6178 {
6179  bool align_to_left = false;
6180  int len;
6181 
6182  /* fast path for typical easy case */
6183  if (width == 0)
6184  {
6185  appendStringInfoString(buf, str);
6186  return;
6187  }
6188 
6189  if (width < 0)
6190  {
6191  /* Negative width: implicit '-' flag, then take absolute value */
6192  align_to_left = true;
6193  /* -INT_MIN is undefined */
6194  if (width <= INT_MIN)
6195  ereport(ERROR,
6196  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6197  errmsg("number is out of range")));
6198  width = -width;
6199  }
6200  else if (flags & TEXT_FORMAT_FLAG_MINUS)
6201  align_to_left = true;
6202 
6203  len = pg_mbstrlen(str);
6204  if (align_to_left)
6205  {
6206  /* left justify */
6207  appendStringInfoString(buf, str);
6208  if (len < width)
6209  appendStringInfoSpaces(buf, width - len);
6210  }
6211  else
6212  {
6213  /* right justify */
6214  if (len < width)
6215  appendStringInfoSpaces(buf, width - len);
6216  appendStringInfoString(buf, str);
6217  }
6218 }
6219 
6220 /*
6221  * text_format_nv - nonvariadic wrapper for text_format function.
6222  *
6223  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6224  * which checks that all built-in functions that share the implementing C
6225  * function take the same number of arguments.
6226  */
6227 Datum
6229 {
6230  return text_format(fcinfo);
6231 }
6232 
/*
 * Helper function for Levenshtein distance functions.
 *
 * Reports whether the first 'len' bytes of s1 and s2 are identical, checking
 * from the last byte back towards the first.  A len of zero or less compares
 * nothing and yields true.  Faster than memcmp() for this use case.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			remaining;

	for (remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
6248 
6249 /* Expand each Levenshtein distance variant */
6250 #include "levenshtein.c"
6251 #define LEVENSHTEIN_LESS_EQUAL
6252 #include "levenshtein.c"
6253 
6254 
6255 /*
6256  * Unicode support
6257  */
6258