PostgreSQL Source Code  git master
varlena.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  * Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/detoast.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "common/hashfn.h"
25 #include "common/int.h"
26 #include "common/unicode_norm.h"
27 #include "funcapi.h"
28 #include "lib/hyperloglog.h"
29 #include "libpq/pqformat.h"
30 #include "miscadmin.h"
31 #include "nodes/execnodes.h"
32 #include "parser/scansup.h"
33 #include "port/pg_bswap.h"
34 #include "regex/regex.h"
35 #include "utils/builtins.h"
36 #include "utils/bytea.h"
37 #include "utils/lsyscache.h"
38 #include "utils/memutils.h"
39 #include "utils/pg_locale.h"
40 #include "utils/sortsupport.h"
41 #include "utils/varlena.h"
42 
43 
44 /* GUC variable */
46 
47 typedef struct varlena unknown;
48 typedef struct varlena VarString;
49 
50 /*
51  * State for text_position_* functions.
52  */
53 typedef struct
54 {
55  bool is_multibyte_char_in_char; /* need to check char boundaries? */
56 
57  char *str1; /* haystack string */
58  char *str2; /* needle string */
59  int len1; /* string lengths in bytes */
60  int len2;
61 
62  /* Skip table for Boyer-Moore-Horspool search algorithm: */
63  int skiptablemask; /* mask for ANDing with skiptable subscripts */
64  int skiptable[256]; /* skip distance for given mismatched char */
65 
66  char *last_match; /* pointer to last match in 'str1' */
67 
68  /*
69  * Sometimes we need to convert the byte position of a match to a
70  * character position. These store the last position that was converted,
71  * so that on the next call, we can continue from that point, rather than
72  * count characters from the very beginning.
73  */
74  char *refpoint; /* pointer within original haystack string */
75  int refpos; /* 0-based character offset of the same point */
77 
78 typedef struct
79 {
80  char *buf1; /* 1st string, or abbreviation original string
81  * buf */
82  char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
83  int buflen1;
84  int buflen2;
85  int last_len1; /* Length of last buf1 string/strxfrm() input */
86  int last_len2; /* Length of last buf2 string/strxfrm() blob */
87  int last_returned; /* Last comparison result (cache) */
88  bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
89  bool collate_c;
90  Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
91  hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
92  hyperLogLogState full_card; /* Full key cardinality state */
93  double prop_card; /* Required cardinality proportion */
96 
97 /*
98  * Output data for split_text(): we output either to an array or a table.
99  * tupstore and tupdesc must be set up in advance to output to a table.
100  */
101 typedef struct
102 {
107 
108 /*
109  * This should be large enough that most strings will fit, but small enough
110  * that we feel comfortable putting it on the stack
111  */
112 #define TEXTBUFLEN 1024
113 
114 #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X))
115 #define DatumGetUnknownPCopy(X) ((unknown *) PG_DETOAST_DATUM_COPY(X))
116 #define PG_GETARG_UNKNOWN_P(n) DatumGetUnknownP(PG_GETARG_DATUM(n))
117 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
118 #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x)
119 
120 #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
121 #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
122 
123 static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
124 static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
125 static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
126 static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
127 static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
128 static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
129 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
130 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
131 static int32 text_length(Datum str);
132 static text *text_catenate(text *t1, text *t2);
133 static text *text_substring(Datum str,
134  int32 start,
135  int32 length,
136  bool length_not_specified);
137 static text *text_overlay(text *t1, text *t2, int sp, int sl);
138 static int text_position(text *t1, text *t2, Oid collid);
139 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
141 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
145 static void check_collation_set(Oid collid);
146 static int text_cmp(text *arg1, text *arg2, Oid collid);
147 static bytea *bytea_catenate(bytea *t1, bytea *t2);
149  int S,
150  int L,
151  bool length_not_specified);
152 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
153 static void appendStringInfoText(StringInfo str, const text *t);
154 static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
155 static void split_text_accum_result(SplitTextOutputData *tstate,
156  text *field_value,
157  text *null_string,
158  Oid collation);
160  const char *fldsep, const char *null_string);
162 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
163  int *value);
164 static const char *text_format_parse_format(const char *start_ptr,
165  const char *end_ptr,
166  int *argpos, int *widthpos,
167  int *flags, int *width);
168 static void text_format_string_conversion(StringInfo buf, char conversion,
169  FmgrInfo *typOutputInfo,
170  Datum value, bool isNull,
171  int flags, int width);
172 static void text_format_append_string(StringInfo buf, const char *str,
173  int flags, int width);
174 
175 
176 /*****************************************************************************
177  * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
178  *****************************************************************************/
179 
180 /*
181  * cstring_to_text
182  *
183  * Create a text value from a null-terminated C string.
184  *
185  * The new text value is freshly palloc'd with a full-size VARHDR.
186  */
187 text *
188 cstring_to_text(const char *s)
189 {
190  return cstring_to_text_with_len(s, strlen(s));
191 }
192 
193 /*
194  * cstring_to_text_with_len
195  *
196  * Same as cstring_to_text except the caller specifies the string length;
197  * the string need not be null_terminated.
198  */
199 text *
200 cstring_to_text_with_len(const char *s, int len)
201 {
202  text *result = (text *) palloc(len + VARHDRSZ);
203 
204  SET_VARSIZE(result, len + VARHDRSZ);
205  memcpy(VARDATA(result), s, len);
206 
207  return result;
208 }
209 
210 /*
211  * text_to_cstring
212  *
213  * Create a palloc'd, null-terminated C string from a text value.
214  *
215  * We support being passed a compressed or toasted text value.
216  * This is a bit bogus since such values shouldn't really be referred to as
217  * "text *", but it seems useful for robustness. If we didn't handle that
218  * case here, we'd need another routine that did, anyway.
219  */
220 char *
222 {
223  /* must cast away the const, unfortunately */
224  text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
225  int len = VARSIZE_ANY_EXHDR(tunpacked);
226  char *result;
227 
228  result = (char *) palloc(len + 1);
229  memcpy(result, VARDATA_ANY(tunpacked), len);
230  result[len] = '\0';
231 
232  if (tunpacked != t)
233  pfree(tunpacked);
234 
235  return result;
236 }
237 
238 /*
239  * text_to_cstring_buffer
240  *
241  * Copy a text value into a caller-supplied buffer of size dst_len.
242  *
243  * The text string is truncated if necessary to fit. The result is
244  * guaranteed null-terminated (unless dst_len == 0).
245  *
246  * We support being passed a compressed or toasted text value.
247  * This is a bit bogus since such values shouldn't really be referred to as
248  * "text *", but it seems useful for robustness. If we didn't handle that
249  * case here, we'd need another routine that did, anyway.
250  */
251 void
252 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
253 {
254  /* must cast away the const, unfortunately */
255  text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
256  size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
257 
258  if (dst_len > 0)
259  {
260  dst_len--;
261  if (dst_len >= src_len)
262  dst_len = src_len;
263  else /* ensure truncation is encoding-safe */
264  dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
265  memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
266  dst[dst_len] = '\0';
267  }
268 
269  if (srcunpacked != src)
270  pfree(srcunpacked);
271 }
272 
273 
274 /*****************************************************************************
275  * USER I/O ROUTINES *
276  *****************************************************************************/
277 
278 
/* VAL: ASCII digit character -> numeric value; DIG: numeric value -> digit */
#define VAL(CH)			((CH) - '0')
#define DIG(N)			((N) + '0')
281 
282 /*
283  * byteain - converts from printable representation of byte array
284  *
285  * Non-printable characters must be passed as '\nnn' (octal) and are
286  * converted to internal form. '\' must be passed as '\\'.
287  * ereport(ERROR, ...) if bad form.
288  *
289  * BUGS:
290  * The input is scanned twice.
291  * The error checking of input is minimal.
292  */
293 Datum
295 {
296  char *inputText = PG_GETARG_CSTRING(0);
297  char *tp;
298  char *rp;
299  int bc;
300  bytea *result;
301 
302  /* Recognize hex input */
303  if (inputText[0] == '\\' && inputText[1] == 'x')
304  {
305  size_t len = strlen(inputText);
306 
307  bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
308  result = palloc(bc);
309  bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
310  SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
311 
312  PG_RETURN_BYTEA_P(result);
313  }
314 
315  /* Else, it's the traditional escaped style */
316  for (bc = 0, tp = inputText; *tp != '\0'; bc++)
317  {
318  if (tp[0] != '\\')
319  tp++;
320  else if ((tp[0] == '\\') &&
321  (tp[1] >= '0' && tp[1] <= '3') &&
322  (tp[2] >= '0' && tp[2] <= '7') &&
323  (tp[3] >= '0' && tp[3] <= '7'))
324  tp += 4;
325  else if ((tp[0] == '\\') &&
326  (tp[1] == '\\'))
327  tp += 2;
328  else
329  {
330  /*
331  * one backslash, not followed by another or ### valid octal
332  */
333  ereport(ERROR,
334  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
335  errmsg("invalid input syntax for type %s", "bytea")));
336  }
337  }
338 
339  bc += VARHDRSZ;
340 
341  result = (bytea *) palloc(bc);
342  SET_VARSIZE(result, bc);
343 
344  tp = inputText;
345  rp = VARDATA(result);
346  while (*tp != '\0')
347  {
348  if (tp[0] != '\\')
349  *rp++ = *tp++;
350  else if ((tp[0] == '\\') &&
351  (tp[1] >= '0' && tp[1] <= '3') &&
352  (tp[2] >= '0' && tp[2] <= '7') &&
353  (tp[3] >= '0' && tp[3] <= '7'))
354  {
355  bc = VAL(tp[1]);
356  bc <<= 3;
357  bc += VAL(tp[2]);
358  bc <<= 3;
359  *rp++ = bc + VAL(tp[3]);
360 
361  tp += 4;
362  }
363  else if ((tp[0] == '\\') &&
364  (tp[1] == '\\'))
365  {
366  *rp++ = '\\';
367  tp += 2;
368  }
369  else
370  {
371  /*
372  * We should never get here. The first pass should not allow it.
373  */
374  ereport(ERROR,
375  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
376  errmsg("invalid input syntax for type %s", "bytea")));
377  }
378  }
379 
380  PG_RETURN_BYTEA_P(result);
381 }
382 
383 /*
384  * byteaout - converts to printable representation of byte array
385  *
386  * In the traditional escaped format, non-printable characters are
387  * printed as '\nnn' (octal) and '\' as '\\'.
388  */
389 Datum
391 {
392  bytea *vlena = PG_GETARG_BYTEA_PP(0);
393  char *result;
394  char *rp;
395 
397  {
398  /* Print hex format */
399  rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
400  *rp++ = '\\';
401  *rp++ = 'x';
402  rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
403  }
404  else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
405  {
406  /* Print traditional escaped format */
407  char *vp;
408  uint64 len;
409  int i;
410 
411  len = 1; /* empty string has 1 char */
412  vp = VARDATA_ANY(vlena);
413  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
414  {
415  if (*vp == '\\')
416  len += 2;
417  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
418  len += 4;
419  else
420  len++;
421  }
422 
423  /*
424  * In principle len can't overflow uint32 if the input fit in 1GB, but
425  * for safety let's check rather than relying on palloc's internal
426  * check.
427  */
428  if (len > MaxAllocSize)
429  ereport(ERROR,
430  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
431  errmsg_internal("result of bytea output conversion is too large")));
432  rp = result = (char *) palloc(len);
433 
434  vp = VARDATA_ANY(vlena);
435  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
436  {
437  if (*vp == '\\')
438  {
439  *rp++ = '\\';
440  *rp++ = '\\';
441  }
442  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
443  {
444  int val; /* holds unprintable chars */
445 
446  val = *vp;
447  rp[0] = '\\';
448  rp[3] = DIG(val & 07);
449  val >>= 3;
450  rp[2] = DIG(val & 07);
451  val >>= 3;
452  rp[1] = DIG(val & 03);
453  rp += 4;
454  }
455  else
456  *rp++ = *vp;
457  }
458  }
459  else
460  {
461  elog(ERROR, "unrecognized bytea_output setting: %d",
462  bytea_output);
463  rp = result = NULL; /* keep compiler quiet */
464  }
465  *rp = '\0';
466  PG_RETURN_CSTRING(result);
467 }
468 
469 /*
470  * bytearecv - converts external binary format to bytea
471  */
472 Datum
474 {
476  bytea *result;
477  int nbytes;
478 
479  nbytes = buf->len - buf->cursor;
480  result = (bytea *) palloc(nbytes + VARHDRSZ);
481  SET_VARSIZE(result, nbytes + VARHDRSZ);
482  pq_copymsgbytes(buf, VARDATA(result), nbytes);
483  PG_RETURN_BYTEA_P(result);
484 }
485 
486 /*
487  * byteasend - converts bytea to binary format
488  *
489  * This is a special case: just copy the input...
490  */
491 Datum
493 {
494  bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
495 
496  PG_RETURN_BYTEA_P(vlena);
497 }
498 
499 Datum
501 {
503 
504  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
505 
506  /* Append the value unless null. */
507  if (!PG_ARGISNULL(1))
508  {
510 
511  /* On the first time through, we ignore the delimiter. */
512  if (state == NULL)
513  state = makeStringAggState(fcinfo);
514  else if (!PG_ARGISNULL(2))
515  {
516  bytea *delim = PG_GETARG_BYTEA_PP(2);
517 
519  }
520 
522  }
523 
524  /*
525  * The transition type for string_agg() is declared to be "internal",
526  * which is a pass-by-value type the same size as a pointer.
527  */
529 }
530 
531 Datum
533 {
535 
536  /* cannot be called directly because of internal-type argument */
537  Assert(AggCheckCallContext(fcinfo, NULL));
538 
539  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
540 
541  if (state != NULL)
542  {
543  bytea *result;
544 
545  result = (bytea *) palloc(state->len + VARHDRSZ);
546  SET_VARSIZE(result, state->len + VARHDRSZ);
547  memcpy(VARDATA(result), state->data, state->len);
548  PG_RETURN_BYTEA_P(result);
549  }
550  else
551  PG_RETURN_NULL();
552 }
553 
554 /*
555  * textin - converts "..." to internal representation
556  */
557 Datum
559 {
560  char *inputText = PG_GETARG_CSTRING(0);
561 
562  PG_RETURN_TEXT_P(cstring_to_text(inputText));
563 }
564 
565 /*
566  * textout - converts internal representation to "..."
567  */
568 Datum
570 {
571  Datum txt = PG_GETARG_DATUM(0);
572 
574 }
575 
576 /*
577  * textrecv - converts external binary format to text
578  */
579 Datum
581 {
583  text *result;
584  char *str;
585  int nbytes;
586 
587  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
588 
589  result = cstring_to_text_with_len(str, nbytes);
590  pfree(str);
591  PG_RETURN_TEXT_P(result);
592 }
593 
594 /*
595  * textsend - converts text to binary format
596  */
597 Datum
599 {
600  text *t = PG_GETARG_TEXT_PP(0);
602 
606 }
607 
608 
609 /*
610  * unknownin - converts "..." to internal representation
611  */
612 Datum
614 {
615  char *str = PG_GETARG_CSTRING(0);
616 
617  /* representation is same as cstring */
619 }
620 
621 /*
622  * unknownout - converts internal representation to "..."
623  */
624 Datum
626 {
627  /* representation is same as cstring */
628  char *str = PG_GETARG_CSTRING(0);
629 
631 }
632 
633 /*
634  * unknownrecv - converts external binary format to unknown
635  */
636 Datum
638 {
640  char *str;
641  int nbytes;
642 
643  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
644  /* representation is same as cstring */
646 }
647 
648 /*
649  * unknownsend - converts unknown to binary format
650  */
651 Datum
653 {
654  /* representation is same as cstring */
655  char *str = PG_GETARG_CSTRING(0);
657 
659  pq_sendtext(&buf, str, strlen(str));
661 }
662 
663 
664 /* ========== PUBLIC ROUTINES ========== */
665 
666 /*
667  * textlen -
668  * returns the logical length of a text*
669  * (which is less than the VARSIZE of the text*)
670  */
671 Datum
673 {
675 
676  /* try to avoid decompressing argument */
678 }
679 
680 /*
681  * text_length -
682  * Does the real work for textlen()
683  *
684  * This is broken out so it can be called directly by other string processing
685  * functions. Note that the argument is passed as a Datum, to indicate that
686  * it may still be in compressed form. We can avoid decompressing it at all
687  * in some cases.
688  */
689 static int32
691 {
692  /* fastpath when max encoding length is one */
695  else
696  {
697  text *t = DatumGetTextPP(str);
698 
700  VARSIZE_ANY_EXHDR(t)));
701  }
702 }
703 
704 /*
705  * textoctetlen -
706  * returns the physical length of a text*
707  * (which is less than the VARSIZE of the text*)
708  */
709 Datum
711 {
713 
714  /* We need not detoast the input at all */
716 }
717 
718 /*
719  * textcat -
720  * takes two text* and returns a text* that is the concatenation of
721  * the two.
722  *
723  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
724  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
725  * Allocate space for output in all cases.
726  * XXX - thomas 1997-07-10
727  */
728 Datum
730 {
731  text *t1 = PG_GETARG_TEXT_PP(0);
732  text *t2 = PG_GETARG_TEXT_PP(1);
733 
735 }
736 
737 /*
738  * text_catenate
739  * Guts of textcat(), broken out so it can be used by other functions
740  *
741  * Arguments can be in short-header form, but not compressed or out-of-line
742  */
743 static text *
745 {
746  text *result;
747  int len1,
748  len2,
749  len;
750  char *ptr;
751 
752  len1 = VARSIZE_ANY_EXHDR(t1);
753  len2 = VARSIZE_ANY_EXHDR(t2);
754 
755  /* paranoia ... probably should throw error instead? */
756  if (len1 < 0)
757  len1 = 0;
758  if (len2 < 0)
759  len2 = 0;
760 
761  len = len1 + len2 + VARHDRSZ;
762  result = (text *) palloc(len);
763 
764  /* Set size of result string... */
765  SET_VARSIZE(result, len);
766 
767  /* Fill data field of result string... */
768  ptr = VARDATA(result);
769  if (len1 > 0)
770  memcpy(ptr, VARDATA_ANY(t1), len1);
771  if (len2 > 0)
772  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
773 
774  return result;
775 }
776 
777 /*
778  * charlen_to_bytelen()
779  * Compute the number of bytes occupied by n characters starting at *p
780  *
781  * It is caller's responsibility that there actually are n characters;
782  * the string need not be null-terminated.
783  */
784 static int
785 charlen_to_bytelen(const char *p, int n)
786 {
788  {
789  /* Optimization for single-byte encodings */
790  return n;
791  }
792  else
793  {
794  const char *s;
795 
796  for (s = p; n > 0; n--)
797  s += pg_mblen(s);
798 
799  return s - p;
800  }
801 }
802 
803 /*
804  * text_substr()
805  * Return a substring starting at the specified position.
806  * - thomas 1997-12-31
807  *
808  * Input:
809  * - string
810  * - starting position (is one-based)
811  * - string length
812  *
813  * If the starting position is zero or less, then return from the start of the string
814  * adjusting the length to be consistent with the "negative start" per SQL.
815  * If the length is less than zero, return the remaining string.
816  *
817  * Added multibyte support.
818  * - Tatsuo Ishii 1998-4-21
819  * Changed behavior if starting position is less than one to conform to SQL behavior.
820  * Formerly returned the entire string; now returns a portion.
821  * - Thomas Lockhart 1998-12-10
822  * Now uses faster TOAST-slicing interface
823  * - John Gray 2002-02-22
824  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
825  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
826  * error; if E < 1, return '', not entire string). Fixed MB related bug when
827  * S > LC and < LC + 4 sometimes garbage characters are returned.
828  * - Joe Conway 2002-08-10
829  */
830 Datum
832 {
834  PG_GETARG_INT32(1),
835  PG_GETARG_INT32(2),
836  false));
837 }
838 
839 /*
840  * text_substr_no_len -
841  * Wrapper to avoid opr_sanity failure due to
842  * one function accepting a different number of args.
843  */
844 Datum
846 {
848  PG_GETARG_INT32(1),
849  -1, true));
850 }
851 
852 /*
853  * text_substring -
854  * Does the real work for text_substr() and text_substr_no_len()
855  *
856  * This is broken out so it can be called directly by other string processing
857  * functions. Note that the argument is passed as a Datum, to indicate that
858  * it may still be in compressed/toasted form. We can avoid detoasting all
859  * of it in some cases.
860  *
861  * The result is always a freshly palloc'd datum.
862  */
863 static text *
864 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
865 {
867  int32 S = start; /* start position */
868  int32 S1; /* adjusted start position */
869  int32 L1; /* adjusted substring length */
870  int32 E; /* end position */
871 
872  /*
873  * SQL99 says S can be zero or negative, but we still must fetch from the
874  * start of the string.
875  */
876  S1 = Max(S, 1);
877 
878  /* life is easy if the encoding max length is 1 */
879  if (eml == 1)
880  {
881  if (length_not_specified) /* special case - get length to end of
882  * string */
883  L1 = -1;
884  else if (length < 0)
885  {
886  /* SQL99 says to throw an error for E < S, i.e., negative length */
887  ereport(ERROR,
888  (errcode(ERRCODE_SUBSTRING_ERROR),
889  errmsg("negative substring length not allowed")));
890  L1 = -1; /* silence stupider compilers */
891  }
892  else if (pg_add_s32_overflow(S, length, &E))
893  {
894  /*
895  * L could be large enough for S + L to overflow, in which case
896  * the substring must run to end of string.
897  */
898  L1 = -1;
899  }
900  else
901  {
902  /*
903  * A zero or negative value for the end position can happen if the
904  * start was negative or one. SQL99 says to return a zero-length
905  * string.
906  */
907  if (E < 1)
908  return cstring_to_text("");
909 
910  L1 = E - S1;
911  }
912 
913  /*
914  * If the start position is past the end of the string, SQL99 says to
915  * return a zero-length string -- DatumGetTextPSlice() will do that
916  * for us. We need only convert S1 to zero-based starting position.
917  */
918  return DatumGetTextPSlice(str, S1 - 1, L1);
919  }
920  else if (eml > 1)
921  {
922  /*
923  * When encoding max length is > 1, we can't get LC without
924  * detoasting, so we'll grab a conservatively large slice now and go
925  * back later to do the right thing
926  */
927  int32 slice_start;
928  int32 slice_size;
929  int32 slice_strlen;
930  text *slice;
931  int32 E1;
932  int32 i;
933  char *p;
934  char *s;
935  text *ret;
936 
937  /*
938  * We need to start at position zero because there is no way to know
939  * in advance which byte offset corresponds to the supplied start
940  * position.
941  */
942  slice_start = 0;
943 
944  if (length_not_specified) /* special case - get length to end of
945  * string */
946  slice_size = L1 = -1;
947  else if (length < 0)
948  {
949  /* SQL99 says to throw an error for E < S, i.e., negative length */
950  ereport(ERROR,
951  (errcode(ERRCODE_SUBSTRING_ERROR),
952  errmsg("negative substring length not allowed")));
953  slice_size = L1 = -1; /* silence stupider compilers */
954  }
955  else if (pg_add_s32_overflow(S, length, &E))
956  {
957  /*
958  * L could be large enough for S + L to overflow, in which case
959  * the substring must run to end of string.
960  */
961  slice_size = L1 = -1;
962  }
963  else
964  {
965  /*
966  * A zero or negative value for the end position can happen if the
967  * start was negative or one. SQL99 says to return a zero-length
968  * string.
969  */
970  if (E < 1)
971  return cstring_to_text("");
972 
973  /*
974  * if E is past the end of the string, the tuple toaster will
975  * truncate the length for us
976  */
977  L1 = E - S1;
978 
979  /*
980  * Total slice size in bytes can't be any longer than the start
981  * position plus substring length times the encoding max length.
982  * If that overflows, we can just use -1.
983  */
984  if (pg_mul_s32_overflow(E, eml, &slice_size))
985  slice_size = -1;
986  }
987 
988  /*
989  * If we're working with an untoasted source, no need to do an extra
990  * copying step.
991  */
994  slice = DatumGetTextPSlice(str, slice_start, slice_size);
995  else
996  slice = (text *) DatumGetPointer(str);
997 
998  /* see if we got back an empty string */
999  if (VARSIZE_ANY_EXHDR(slice) == 0)
1000  {
1001  if (slice != (text *) DatumGetPointer(str))
1002  pfree(slice);
1003  return cstring_to_text("");
1004  }
1005 
1006  /* Now we can get the actual length of the slice in MB characters */
1007  slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1008  VARSIZE_ANY_EXHDR(slice));
1009 
1010  /*
1011  * Check that the start position wasn't > slice_strlen. If so, SQL99
1012  * says to return a zero-length string.
1013  */
1014  if (S1 > slice_strlen)
1015  {
1016  if (slice != (text *) DatumGetPointer(str))
1017  pfree(slice);
1018  return cstring_to_text("");
1019  }
1020 
1021  /*
1022  * Adjust L1 and E1 now that we know the slice string length. Again
1023  * remember that S1 is one based, and slice_start is zero based.
1024  */
1025  if (L1 > -1)
1026  E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1027  else
1028  E1 = slice_start + 1 + slice_strlen;
1029 
1030  /*
1031  * Find the start position in the slice; remember S1 is not zero based
1032  */
1033  p = VARDATA_ANY(slice);
1034  for (i = 0; i < S1 - 1; i++)
1035  p += pg_mblen(p);
1036 
1037  /* hang onto a pointer to our start position */
1038  s = p;
1039 
1040  /*
1041  * Count the actual bytes used by the substring of the requested
1042  * length.
1043  */
1044  for (i = S1; i < E1; i++)
1045  p += pg_mblen(p);
1046 
1047  ret = (text *) palloc(VARHDRSZ + (p - s));
1048  SET_VARSIZE(ret, VARHDRSZ + (p - s));
1049  memcpy(VARDATA(ret), s, (p - s));
1050 
1051  if (slice != (text *) DatumGetPointer(str))
1052  pfree(slice);
1053 
1054  return ret;
1055  }
1056  else
1057  elog(ERROR, "invalid backend encoding: encoding max length < 1");
1058 
1059  /* not reached: suppress compiler warning */
1060  return NULL;
1061 }
1062 
1063 /*
1064  * textoverlay
1065  * Replace specified substring of first string with second
1066  *
1067  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1068  * This code is a direct implementation of what the standard says.
1069  */
1070 Datum
1072 {
1073  text *t1 = PG_GETARG_TEXT_PP(0);
1074  text *t2 = PG_GETARG_TEXT_PP(1);
1075  int sp = PG_GETARG_INT32(2); /* substring start position */
1076  int sl = PG_GETARG_INT32(3); /* substring length */
1077 
1078  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1079 }
1080 
1081 Datum
1083 {
1084  text *t1 = PG_GETARG_TEXT_PP(0);
1085  text *t2 = PG_GETARG_TEXT_PP(1);
1086  int sp = PG_GETARG_INT32(2); /* substring start position */
1087  int sl;
1088 
1089  sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1090  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1091 }
1092 
1093 static text *
1094 text_overlay(text *t1, text *t2, int sp, int sl)
1095 {
1096  text *result;
1097  text *s1;
1098  text *s2;
1099  int sp_pl_sl;
1100 
1101  /*
1102  * Check for possible integer-overflow cases. For negative sp, throw a
1103  * "substring length" error because that's what should be expected
1104  * according to the spec's definition of OVERLAY().
1105  */
1106  if (sp <= 0)
1107  ereport(ERROR,
1108  (errcode(ERRCODE_SUBSTRING_ERROR),
1109  errmsg("negative substring length not allowed")));
1110  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1111  ereport(ERROR,
1112  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1113  errmsg("integer out of range")));
1114 
1115  s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1116  s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1117  result = text_catenate(s1, t2);
1118  result = text_catenate(result, s2);
1119 
1120  return result;
1121 }
1122 
1123 /*
1124  * textpos -
1125  * Return the position of the specified substring.
1126  * Implements the SQL POSITION() function.
1127  * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1128  * - thomas 1997-07-27
1129  */
1130 Datum
1132 {
1133  text *str = PG_GETARG_TEXT_PP(0);
1134  text *search_str = PG_GETARG_TEXT_PP(1);
1135 
1137 }
1138 
1139 /*
1140  * text_position -
1141  * Does the real work for textpos()
1142  *
1143  * Inputs:
1144  * t1 - string to be searched
1145  * t2 - pattern to match within t1
1146  * Result:
1147  * Character index of the first matched char, starting from 1,
1148  * or 0 if no match.
1149  *
1150  * This is broken out so it can be called directly by other string processing
1151  * functions.
1152  */
1153 static int
1154 text_position(text *t1, text *t2, Oid collid)
1155 {
1157  int result;
1158 
1159  /* Empty needle always matches at position 1 */
1160  if (VARSIZE_ANY_EXHDR(t2) < 1)
1161  return 1;
1162 
1163  /* Otherwise, can't match if haystack is shorter than needle */
1164  if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1165  return 0;
1166 
1167  text_position_setup(t1, t2, collid, &state);
1168  if (!text_position_next(&state))
1169  result = 0;
1170  else
1173  return result;
1174 }
1175 
1176 
1177 /*
1178  * text_position_setup, text_position_next, text_position_cleanup -
1179  * Component steps of text_position()
1180  *
1181  * These are broken out so that a string can be efficiently searched for
1182  * multiple occurrences of the same pattern. text_position_next may be
1183  * called multiple times, and it advances to the next match on each call.
1184  * text_position_get_match_ptr() and text_position_get_match_pos() return
1185  * a pointer or 1-based character position of the last match, respectively.
1186  *
1187  * The "state" variable is normally just a local variable in the caller.
1188  *
1189  * NOTE: text_position_next skips over the matched portion. For example,
1190  * searching for "xx" in "xxx" returns only one match, not two.
1191  */
1192 
1193 static void
1195 {
1196  int len1 = VARSIZE_ANY_EXHDR(t1);
1197  int len2 = VARSIZE_ANY_EXHDR(t2);
1198  pg_locale_t mylocale = 0;
1199 
1200  check_collation_set(collid);
1201 
1202  if (!lc_collate_is_c(collid))
1203  mylocale = pg_newlocale_from_collation(collid);
1204 
1205  if (mylocale && !mylocale->deterministic)
1206  ereport(ERROR,
1207  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1208  errmsg("nondeterministic collations are not supported for substring searches")));
1209 
1210  Assert(len1 > 0);
1211  Assert(len2 > 0);
1212 
1213  /*
1214  * Even with a multi-byte encoding, we perform the search using the raw
1215  * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1216  * because in UTF-8 the byte sequence of one character cannot contain
1217  * another character. For other multi-byte encodings, we do the search
1218  * initially as a simple byte search, ignoring multibyte issues, but
1219  * verify afterwards that the match we found is at a character boundary,
1220  * and continue the search if it was a false match.
1221  */
1223  state->is_multibyte_char_in_char = false;
1224  else if (GetDatabaseEncoding() == PG_UTF8)
1225  state->is_multibyte_char_in_char = false;
1226  else
1227  state->is_multibyte_char_in_char = true;
1228 
1229  state->str1 = VARDATA_ANY(t1);
1230  state->str2 = VARDATA_ANY(t2);
1231  state->len1 = len1;
1232  state->len2 = len2;
1233  state->last_match = NULL;
1234  state->refpoint = state->str1;
1235  state->refpos = 0;
1236 
1237  /*
1238  * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1239  * notes we use the terminology that the "haystack" is the string to be
1240  * searched (t1) and the "needle" is the pattern being sought (t2).
1241  *
1242  * If the needle is empty or bigger than the haystack then there is no
1243  * point in wasting cycles initializing the table. We also choose not to
1244  * use B-M-H for needles of length 1, since the skip table can't possibly
1245  * save anything in that case.
1246  */
1247  if (len1 >= len2 && len2 > 1)
1248  {
1249  int searchlength = len1 - len2;
1250  int skiptablemask;
1251  int last;
1252  int i;
1253  const char *str2 = state->str2;
1254 
1255  /*
1256  * First we must determine how much of the skip table to use. The
1257  * declaration of TextPositionState allows up to 256 elements, but for
1258  * short search problems we don't really want to have to initialize so
1259  * many elements --- it would take too long in comparison to the
1260  * actual search time. So we choose a useful skip table size based on
1261  * the haystack length minus the needle length. The closer the needle
1262  * length is to the haystack length the less useful skipping becomes.
1263  *
1264  * Note: since we use bit-masking to select table elements, the skip
1265  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1266  */
1267  if (searchlength < 16)
1268  skiptablemask = 3;
1269  else if (searchlength < 64)
1270  skiptablemask = 7;
1271  else if (searchlength < 128)
1272  skiptablemask = 15;
1273  else if (searchlength < 512)
1274  skiptablemask = 31;
1275  else if (searchlength < 2048)
1276  skiptablemask = 63;
1277  else if (searchlength < 4096)
1278  skiptablemask = 127;
1279  else
1280  skiptablemask = 255;
1281  state->skiptablemask = skiptablemask;
1282 
1283  /*
1284  * Initialize the skip table. We set all elements to the needle
1285  * length, since this is the correct skip distance for any character
1286  * not found in the needle.
1287  */
1288  for (i = 0; i <= skiptablemask; i++)
1289  state->skiptable[i] = len2;
1290 
1291  /*
1292  * Now examine the needle. For each character except the last one,
1293  * set the corresponding table element to the appropriate skip
1294  * distance. Note that when two characters share the same skip table
1295  * entry, the one later in the needle must determine the skip
1296  * distance.
1297  */
1298  last = len2 - 1;
1299 
1300  for (i = 0; i < last; i++)
1301  state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1302  }
1303 }
1304 
1305 /*
1306  * Advance to the next match, starting from the end of the previous match
1307  * (or the beginning of the string, on first call). Returns true if a match
1308  * is found.
1309  *
1310  * Note that this refuses to match an empty-string needle. Most callers
1311  * will have handled that case specially and we'll never see it here.
1312  */
1313 static bool
1315 {
1316  int needle_len = state->len2;
1317  char *start_ptr;
1318  char *matchptr;
1319 
1320  if (needle_len <= 0)
1321  return false; /* result for empty pattern */
1322 
1323  /* Start from the point right after the previous match. */
1324  if (state->last_match)
1325  start_ptr = state->last_match + needle_len;
1326  else
1327  start_ptr = state->str1;
1328 
1329 retry:
1330  matchptr = text_position_next_internal(start_ptr, state);
1331 
1332  if (!matchptr)
1333  return false;
1334 
1335  /*
1336  * Found a match for the byte sequence. If this is a multibyte encoding,
1337  * where one character's byte sequence can appear inside a longer
1338  * multi-byte character, we need to verify that the match was at a
1339  * character boundary, not in the middle of a multi-byte character.
1340  */
1341  if (state->is_multibyte_char_in_char)
1342  {
1343  /* Walk one character at a time, until we reach the match. */
1344 
1345  /* the search should never move backwards. */
1346  Assert(state->refpoint <= matchptr);
1347 
1348  while (state->refpoint < matchptr)
1349  {
1350  /* step to next character. */
1351  state->refpoint += pg_mblen(state->refpoint);
1352  state->refpos++;
1353 
1354  /*
1355  * If we stepped over the match's start position, then it was a
1356  * false positive, where the byte sequence appeared in the middle
1357  * of a multi-byte character. Skip it, and continue the search at
1358  * the next character boundary.
1359  */
1360  if (state->refpoint > matchptr)
1361  {
1362  start_ptr = state->refpoint;
1363  goto retry;
1364  }
1365  }
1366  }
1367 
1368  state->last_match = matchptr;
1369  return true;
1370 }
1371 
1372 /*
1373  * Subroutine of text_position_next(). This searches for the raw byte
1374  * sequence, ignoring any multi-byte encoding issues. Returns the first
1375  * match starting at 'start_ptr', or NULL if no match is found.
1376  */
1377 static char *
1379 {
1380  int haystack_len = state->len1;
1381  int needle_len = state->len2;
1382  int skiptablemask = state->skiptablemask;
1383  const char *haystack = state->str1;
1384  const char *needle = state->str2;
1385  const char *haystack_end = &haystack[haystack_len];
1386  const char *hptr;
1387 
1388  Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1389 
1390  if (needle_len == 1)
1391  {
1392  /* No point in using B-M-H for a one-character needle */
1393  char nchar = *needle;
1394 
1395  hptr = start_ptr;
1396  while (hptr < haystack_end)
1397  {
1398  if (*hptr == nchar)
1399  return (char *) hptr;
1400  hptr++;
1401  }
1402  }
1403  else
1404  {
1405  const char *needle_last = &needle[needle_len - 1];
1406 
1407  /* Start at startpos plus the length of the needle */
1408  hptr = start_ptr + needle_len - 1;
1409  while (hptr < haystack_end)
1410  {
1411  /* Match the needle scanning *backward* */
1412  const char *nptr;
1413  const char *p;
1414 
1415  nptr = needle_last;
1416  p = hptr;
1417  while (*nptr == *p)
1418  {
1419  /* Matched it all? If so, return 1-based position */
1420  if (nptr == needle)
1421  return (char *) p;
1422  nptr--, p--;
1423  }
1424 
1425  /*
1426  * No match, so use the haystack char at hptr to decide how far to
1427  * advance. If the needle had any occurrence of that character
1428  * (or more precisely, one sharing the same skiptable entry)
1429  * before its last character, then we advance far enough to align
1430  * the last such needle character with that haystack position.
1431  * Otherwise we can advance by the whole needle length.
1432  */
1433  hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1434  }
1435  }
1436 
1437  return 0; /* not found */
1438 }
1439 
1440 /*
1441  * Return a pointer to the current match.
1442  *
1443  * The returned pointer points into the original haystack string.
1444  */
1445 static char *
1447 {
1448  return state->last_match;
1449 }
1450 
1451 /*
1452  * Return the offset of the current match.
1453  *
1454  * The offset is in characters, 1-based.
1455  */
1456 static int
1458 {
1459  /* Convert the byte position to char position. */
1460  state->refpos += pg_mbstrlen_with_len(state->refpoint,
1461  state->last_match - state->refpoint);
1462  state->refpoint = state->last_match;
1463  return state->refpos + 1;
1464 }
1465 
1466 /*
1467  * Reset search state to the initial state installed by text_position_setup.
1468  *
1469  * The next call to text_position_next will search from the beginning
1470  * of the string.
1471  */
1472 static void
1474 {
1475  state->last_match = NULL;
1476  state->refpoint = state->str1;
1477  state->refpos = 0;
1478 }
1479 
1480 static void
1482 {
1483  /* no cleanup needed */
1484 }
1485 
1486 
1487 static void
1489 {
1490  if (!OidIsValid(collid))
1491  {
1492  /*
1493  * This typically means that the parser could not resolve a conflict
1494  * of implicit collations, so report it that way.
1495  */
1496  ereport(ERROR,
1497  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1498  errmsg("could not determine which collation to use for string comparison"),
1499  errhint("Use the COLLATE clause to set the collation explicitly.")));
1500  }
1501 }
1502 
1503 /* varstr_cmp()
1504  * Comparison function for text strings with given lengths.
1505  * Includes locale support, but must copy strings to temporary memory
1506  * to allow null-termination for inputs to strcoll().
1507  * Returns an integer less than, equal to, or greater than zero, indicating
1508  * whether arg1 is less than, equal to, or greater than arg2.
1509  *
1510  * Note: many functions that depend on this are marked leakproof; therefore,
1511  * avoid reporting the actual contents of the input when throwing errors.
1512  * All errors herein should be things that can't happen except on corrupt
1513  * data, anyway; otherwise we will have trouble with indexing strings that
1514  * would cause them.
1515  */
1516 int
1517 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1518 {
1519  int result;
1520 
1521  check_collation_set(collid);
1522 
1523  /*
1524  * Unfortunately, there is no strncoll(), so in the non-C locale case we
1525  * have to do some memory copying. This turns out to be significantly
1526  * slower, so we optimize the case where LC_COLLATE is C. We also try to
1527  * optimize relatively-short strings by avoiding palloc/pfree overhead.
1528  */
1529  if (lc_collate_is_c(collid))
1530  {
1531  result = memcmp(arg1, arg2, Min(len1, len2));
1532  if ((result == 0) && (len1 != len2))
1533  result = (len1 < len2) ? -1 : 1;
1534  }
1535  else
1536  {
1537  char a1buf[TEXTBUFLEN];
1538  char a2buf[TEXTBUFLEN];
1539  char *a1p,
1540  *a2p;
1541  pg_locale_t mylocale;
1542 
1543  mylocale = pg_newlocale_from_collation(collid);
1544 
1545  /*
1546  * memcmp() can't tell us which of two unequal strings sorts first,
1547  * but it's a cheap way to tell if they're equal. Testing shows that
1548  * memcmp() followed by strcoll() is only trivially slower than
1549  * strcoll() by itself, so we don't lose much if this doesn't work out
1550  * very often, and if it does - for example, because there are many
1551  * equal strings in the input - then we win big by avoiding expensive
1552  * collation-aware comparisons.
1553  */
1554  if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1555  return 0;
1556 
1557 #ifdef WIN32
1558  /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1559  if (GetDatabaseEncoding() == PG_UTF8
1560  && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
1561  {
1562  int a1len;
1563  int a2len;
1564  int r;
1565 
1566  if (len1 >= TEXTBUFLEN / 2)
1567  {
1568  a1len = len1 * 2 + 2;
1569  a1p = palloc(a1len);
1570  }
1571  else
1572  {
1573  a1len = TEXTBUFLEN;
1574  a1p = a1buf;
1575  }
1576  if (len2 >= TEXTBUFLEN / 2)
1577  {
1578  a2len = len2 * 2 + 2;
1579  a2p = palloc(a2len);
1580  }
1581  else
1582  {
1583  a2len = TEXTBUFLEN;
1584  a2p = a2buf;
1585  }
1586 
1587  /* stupid Microsloth API does not work for zero-length input */
1588  if (len1 == 0)
1589  r = 0;
1590  else
1591  {
1592  r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1593  (LPWSTR) a1p, a1len / 2);
1594  if (!r)
1595  ereport(ERROR,
1596  (errmsg("could not convert string to UTF-16: error code %lu",
1597  GetLastError())));
1598  }
1599  ((LPWSTR) a1p)[r] = 0;
1600 
1601  if (len2 == 0)
1602  r = 0;
1603  else
1604  {
1605  r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1606  (LPWSTR) a2p, a2len / 2);
1607  if (!r)
1608  ereport(ERROR,
1609  (errmsg("could not convert string to UTF-16: error code %lu",
1610  GetLastError())));
1611  }
1612  ((LPWSTR) a2p)[r] = 0;
1613 
1614  errno = 0;
1615 #ifdef HAVE_LOCALE_T
1616  if (mylocale)
1617  result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
1618  else
1619 #endif
1620  result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1621  if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1622  * headers */
1623  ereport(ERROR,
1624  (errmsg("could not compare Unicode strings: %m")));
1625 
1626  /* Break tie if necessary. */
1627  if (result == 0 &&
1628  (!mylocale || mylocale->deterministic))
1629  {
1630  result = memcmp(arg1, arg2, Min(len1, len2));
1631  if ((result == 0) && (len1 != len2))
1632  result = (len1 < len2) ? -1 : 1;
1633  }
1634 
1635  if (a1p != a1buf)
1636  pfree(a1p);
1637  if (a2p != a2buf)
1638  pfree(a2p);
1639 
1640  return result;
1641  }
1642 #endif /* WIN32 */
1643 
1644  if (len1 >= TEXTBUFLEN)
1645  a1p = (char *) palloc(len1 + 1);
1646  else
1647  a1p = a1buf;
1648  if (len2 >= TEXTBUFLEN)
1649  a2p = (char *) palloc(len2 + 1);
1650  else
1651  a2p = a2buf;
1652 
1653  memcpy(a1p, arg1, len1);
1654  a1p[len1] = '\0';
1655  memcpy(a2p, arg2, len2);
1656  a2p[len2] = '\0';
1657 
1658  if (mylocale)
1659  {
1660  if (mylocale->provider == COLLPROVIDER_ICU)
1661  {
1662 #ifdef USE_ICU
1663 #ifdef HAVE_UCOL_STRCOLLUTF8
1664  if (GetDatabaseEncoding() == PG_UTF8)
1665  {
1666  UErrorCode status;
1667 
1668  status = U_ZERO_ERROR;
1669  result = ucol_strcollUTF8(mylocale->info.icu.ucol,
1670  arg1, len1,
1671  arg2, len2,
1672  &status);
1673  if (U_FAILURE(status))
1674  ereport(ERROR,
1675  (errmsg("collation failed: %s", u_errorName(status))));
1676  }
1677  else
1678 #endif
1679  {
1680  int32_t ulen1,
1681  ulen2;
1682  UChar *uchar1,
1683  *uchar2;
1684 
1685  ulen1 = icu_to_uchar(&uchar1, arg1, len1);
1686  ulen2 = icu_to_uchar(&uchar2, arg2, len2);
1687 
1688  result = ucol_strcoll(mylocale->info.icu.ucol,
1689  uchar1, ulen1,
1690  uchar2, ulen2);
1691 
1692  pfree(uchar1);
1693  pfree(uchar2);
1694  }
1695 #else /* not USE_ICU */
1696  /* shouldn't happen */
1697  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1698 #endif /* not USE_ICU */
1699  }
1700  else
1701  {
1702 #ifdef HAVE_LOCALE_T
1703  result = strcoll_l(a1p, a2p, mylocale->info.lt);
1704 #else
1705  /* shouldn't happen */
1706  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1707 #endif
1708  }
1709  }
1710  else
1711  result = strcoll(a1p, a2p);
1712 
1713  /* Break tie if necessary. */
1714  if (result == 0 &&
1715  (!mylocale || mylocale->deterministic))
1716  result = strcmp(a1p, a2p);
1717 
1718  if (a1p != a1buf)
1719  pfree(a1p);
1720  if (a2p != a2buf)
1721  pfree(a2p);
1722  }
1723 
1724  return result;
1725 }
1726 
1727 /* text_cmp()
1728  * Internal comparison function for text strings.
1729  * Returns -1, 0 or 1
1730  */
1731 static int
1732 text_cmp(text *arg1, text *arg2, Oid collid)
1733 {
1734  char *a1p,
1735  *a2p;
1736  int len1,
1737  len2;
1738 
1739  a1p = VARDATA_ANY(arg1);
1740  a2p = VARDATA_ANY(arg2);
1741 
1742  len1 = VARSIZE_ANY_EXHDR(arg1);
1743  len2 = VARSIZE_ANY_EXHDR(arg2);
1744 
1745  return varstr_cmp(a1p, len1, a2p, len2, collid);
1746 }
1747 
1748 /*
1749  * Comparison functions for text strings.
1750  *
1751  * Note: btree indexes need these routines not to leak memory; therefore,
1752  * be careful to free working copies of toasted datums. Most places don't
1753  * need to be so careful.
1754  */
1755 
1756 Datum
1758 {
1759  Oid collid = PG_GET_COLLATION();
1760  bool locale_is_c = false;
1761  pg_locale_t mylocale = 0;
1762  bool result;
1763 
1764  check_collation_set(collid);
1765 
1766  if (lc_collate_is_c(collid))
1767  locale_is_c = true;
1768  else
1769  mylocale = pg_newlocale_from_collation(collid);
1770 
1771  if (locale_is_c || !mylocale || mylocale->deterministic)
1772  {
1773  Datum arg1 = PG_GETARG_DATUM(0);
1774  Datum arg2 = PG_GETARG_DATUM(1);
1775  Size len1,
1776  len2;
1777 
1778  /*
1779  * Since we only care about equality or not-equality, we can avoid all
1780  * the expense of strcoll() here, and just do bitwise comparison. In
1781  * fact, we don't even have to do a bitwise comparison if we can show
1782  * the lengths of the strings are unequal; which might save us from
1783  * having to detoast one or both values.
1784  */
1785  len1 = toast_raw_datum_size(arg1);
1786  len2 = toast_raw_datum_size(arg2);
1787  if (len1 != len2)
1788  result = false;
1789  else
1790  {
1791  text *targ1 = DatumGetTextPP(arg1);
1792  text *targ2 = DatumGetTextPP(arg2);
1793 
1794  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1795  len1 - VARHDRSZ) == 0);
1796 
1797  PG_FREE_IF_COPY(targ1, 0);
1798  PG_FREE_IF_COPY(targ2, 1);
1799  }
1800  }
1801  else
1802  {
1803  text *arg1 = PG_GETARG_TEXT_PP(0);
1804  text *arg2 = PG_GETARG_TEXT_PP(1);
1805 
1806  result = (text_cmp(arg1, arg2, collid) == 0);
1807 
1808  PG_FREE_IF_COPY(arg1, 0);
1809  PG_FREE_IF_COPY(arg2, 1);
1810  }
1811 
1812  PG_RETURN_BOOL(result);
1813 }
1814 
1815 Datum
1817 {
1818  Oid collid = PG_GET_COLLATION();
1819  bool locale_is_c = false;
1820  pg_locale_t mylocale = 0;
1821  bool result;
1822 
1823  check_collation_set(collid);
1824 
1825  if (lc_collate_is_c(collid))
1826  locale_is_c = true;
1827  else
1828  mylocale = pg_newlocale_from_collation(collid);
1829 
1830  if (locale_is_c || !mylocale || mylocale->deterministic)
1831  {
1832  Datum arg1 = PG_GETARG_DATUM(0);
1833  Datum arg2 = PG_GETARG_DATUM(1);
1834  Size len1,
1835  len2;
1836 
1837  /* See comment in texteq() */
1838  len1 = toast_raw_datum_size(arg1);
1839  len2 = toast_raw_datum_size(arg2);
1840  if (len1 != len2)
1841  result = true;
1842  else
1843  {
1844  text *targ1 = DatumGetTextPP(arg1);
1845  text *targ2 = DatumGetTextPP(arg2);
1846 
1847  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1848  len1 - VARHDRSZ) != 0);
1849 
1850  PG_FREE_IF_COPY(targ1, 0);
1851  PG_FREE_IF_COPY(targ2, 1);
1852  }
1853  }
1854  else
1855  {
1856  text *arg1 = PG_GETARG_TEXT_PP(0);
1857  text *arg2 = PG_GETARG_TEXT_PP(1);
1858 
1859  result = (text_cmp(arg1, arg2, collid) != 0);
1860 
1861  PG_FREE_IF_COPY(arg1, 0);
1862  PG_FREE_IF_COPY(arg2, 1);
1863  }
1864 
1865  PG_RETURN_BOOL(result);
1866 }
1867 
1868 Datum
1870 {
1871  text *arg1 = PG_GETARG_TEXT_PP(0);
1872  text *arg2 = PG_GETARG_TEXT_PP(1);
1873  bool result;
1874 
1875  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1876 
1877  PG_FREE_IF_COPY(arg1, 0);
1878  PG_FREE_IF_COPY(arg2, 1);
1879 
1880  PG_RETURN_BOOL(result);
1881 }
1882 
1883 Datum
1885 {
1886  text *arg1 = PG_GETARG_TEXT_PP(0);
1887  text *arg2 = PG_GETARG_TEXT_PP(1);
1888  bool result;
1889 
1890  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1891 
1892  PG_FREE_IF_COPY(arg1, 0);
1893  PG_FREE_IF_COPY(arg2, 1);
1894 
1895  PG_RETURN_BOOL(result);
1896 }
1897 
1898 Datum
1900 {
1901  text *arg1 = PG_GETARG_TEXT_PP(0);
1902  text *arg2 = PG_GETARG_TEXT_PP(1);
1903  bool result;
1904 
1905  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1906 
1907  PG_FREE_IF_COPY(arg1, 0);
1908  PG_FREE_IF_COPY(arg2, 1);
1909 
1910  PG_RETURN_BOOL(result);
1911 }
1912 
1913 Datum
1915 {
1916  text *arg1 = PG_GETARG_TEXT_PP(0);
1917  text *arg2 = PG_GETARG_TEXT_PP(1);
1918  bool result;
1919 
1920  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1921 
1922  PG_FREE_IF_COPY(arg1, 0);
1923  PG_FREE_IF_COPY(arg2, 1);
1924 
1925  PG_RETURN_BOOL(result);
1926 }
1927 
1928 Datum
1930 {
1931  Datum arg1 = PG_GETARG_DATUM(0);
1932  Datum arg2 = PG_GETARG_DATUM(1);
1933  Oid collid = PG_GET_COLLATION();
1934  pg_locale_t mylocale = 0;
1935  bool result;
1936  Size len1,
1937  len2;
1938 
1939  check_collation_set(collid);
1940 
1941  if (!lc_collate_is_c(collid))
1942  mylocale = pg_newlocale_from_collation(collid);
1943 
1944  if (mylocale && !mylocale->deterministic)
1945  ereport(ERROR,
1946  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1947  errmsg("nondeterministic collations are not supported for substring searches")));
1948 
1949  len1 = toast_raw_datum_size(arg1);
1950  len2 = toast_raw_datum_size(arg2);
1951  if (len2 > len1)
1952  result = false;
1953  else
1954  {
1955  text *targ1 = text_substring(arg1, 1, len2, false);
1956  text *targ2 = DatumGetTextPP(arg2);
1957 
1958  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1959  VARSIZE_ANY_EXHDR(targ2)) == 0);
1960 
1961  PG_FREE_IF_COPY(targ1, 0);
1962  PG_FREE_IF_COPY(targ2, 1);
1963  }
1964 
1965  PG_RETURN_BOOL(result);
1966 }
1967 
1968 Datum
1970 {
1971  text *arg1 = PG_GETARG_TEXT_PP(0);
1972  text *arg2 = PG_GETARG_TEXT_PP(1);
1973  int32 result;
1974 
1975  result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1976 
1977  PG_FREE_IF_COPY(arg1, 0);
1978  PG_FREE_IF_COPY(arg2, 1);
1979 
1980  PG_RETURN_INT32(result);
1981 }
1982 
1983 Datum
1985 {
1987  Oid collid = ssup->ssup_collation;
1988  MemoryContext oldcontext;
1989 
1990  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1991 
1992  /* Use generic string SortSupport */
1993  varstr_sortsupport(ssup, TEXTOID, collid);
1994 
1995  MemoryContextSwitchTo(oldcontext);
1996 
1997  PG_RETURN_VOID();
1998 }
1999 
2000 /*
2001  * Generic sortsupport interface for character type's operator classes.
2002  * Includes locale support, and support for BpChar semantics (i.e. removing
2003  * trailing spaces before comparison).
2004  *
2005  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
2006  * same representation. Callers that always use the C collation (e.g.
2007  * non-collatable type callers like bytea) may have NUL bytes in their strings;
2008  * this will not work with any other collation, though.
2009  */
2010 void
2012 {
2013  bool abbreviate = ssup->abbreviate;
2014  bool collate_c = false;
2015  VarStringSortSupport *sss;
2016  pg_locale_t locale = 0;
2017 
2018  check_collation_set(collid);
2019 
2020  /*
2021  * If possible, set ssup->comparator to a function which can be used to
2022  * directly compare two datums. If we can do this, we'll avoid the
2023  * overhead of a trip through the fmgr layer for every comparison, which
2024  * can be substantial.
2025  *
2026  * Most typically, we'll set the comparator to varlenafastcmp_locale,
2027  * which uses strcoll() to perform comparisons. We use that for the
2028  * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2029  * LC_COLLATE = C, we can make things quite a bit faster with
2030  * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2031  * memcmp() rather than strcoll().
2032  */
2033  if (lc_collate_is_c(collid))
2034  {
2035  if (typid == BPCHAROID)
2036  ssup->comparator = bpcharfastcmp_c;
2037  else if (typid == NAMEOID)
2038  {
2039  ssup->comparator = namefastcmp_c;
2040  /* Not supporting abbreviation with type NAME, for now */
2041  abbreviate = false;
2042  }
2043  else
2044  ssup->comparator = varstrfastcmp_c;
2045 
2046  collate_c = true;
2047  }
2048  else
2049  {
2050  /*
2051  * We need a collation-sensitive comparison. To make things faster,
2052  * we'll figure out the collation based on the locale id and cache the
2053  * result.
2054  */
2056 
2057  /*
2058  * There is a further exception on Windows. When the database
2059  * encoding is UTF-8 and we are not using the C collation, complex
2060  * hacks are required. We don't currently have a comparator that
2061  * handles that case, so we fall back on the slow method of having the
2062  * sort code invoke bttextcmp() (in the case of text) via the fmgr
2063  * trampoline. ICU locales work just the same on Windows, however.
2064  */
2065 #ifdef WIN32
2066  if (GetDatabaseEncoding() == PG_UTF8 &&
2067  !(locale && locale->provider == COLLPROVIDER_ICU))
2068  return;
2069 #endif
2070 
2071  /*
2072  * We use varlenafastcmp_locale except for type NAME.
2073  */
2074  if (typid == NAMEOID)
2075  {
2077  /* Not supporting abbreviation with type NAME, for now */
2078  abbreviate = false;
2079  }
2080  else
2082  }
2083 
2084  /*
2085  * Unfortunately, it seems that abbreviation for non-C collations is
2086  * broken on many common platforms; testing of multiple versions of glibc
2087  * reveals that, for many locales, strcoll() and strxfrm() do not return
2088  * consistent results, which is fatal to this optimization. While no
2089  * other libc other than Cygwin has so far been shown to have a problem,
2090  * we take the conservative course of action for right now and disable
2091  * this categorically. (Users who are certain this isn't a problem on
2092  * their system can define TRUST_STRXFRM.)
2093  *
2094  * Even apart from the risk of broken locales, it's possible that there
2095  * are platforms where the use of abbreviated keys should be disabled at
2096  * compile time. Having only 4 byte datums could make worst-case
2097  * performance drastically more likely, for example. Moreover, macOS's
2098  * strxfrm() implementation is known to not effectively concentrate a
2099  * significant amount of entropy from the original string in earlier
2100  * transformed blobs. It's possible that other supported platforms are
2101  * similarly encumbered. So, if we ever get past disabling this
2102  * categorically, we may still want or need to disable it for particular
2103  * platforms.
2104  */
2105 #ifndef TRUST_STRXFRM
2106  if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2107  abbreviate = false;
2108 #endif
2109 
2110  /*
2111  * If we're using abbreviated keys, or if we're using a locale-aware
2112  * comparison, we need to initialize a VarStringSortSupport object. Both
2113  * cases will make use of the temporary buffers we initialize here for
2114  * scratch space (and to detect requirement for BpChar semantics from
2115  * caller), and the abbreviation case requires additional state.
2116  */
2117  if (abbreviate || !collate_c)
2118  {
2119  sss = palloc(sizeof(VarStringSortSupport));
2120  sss->buf1 = palloc(TEXTBUFLEN);
2121  sss->buflen1 = TEXTBUFLEN;
2122  sss->buf2 = palloc(TEXTBUFLEN);
2123  sss->buflen2 = TEXTBUFLEN;
2124  /* Start with invalid values */
2125  sss->last_len1 = -1;
2126  sss->last_len2 = -1;
2127  /* Initialize */
2128  sss->last_returned = 0;
2129  sss->locale = locale;
2130 
2131  /*
2132  * To avoid somehow confusing a strxfrm() blob and an original string,
2133  * constantly keep track of the variety of data that buf1 and buf2
2134  * currently contain.
2135  *
2136  * Comparisons may be interleaved with conversion calls. Frequently,
2137  * conversions and comparisons are batched into two distinct phases,
2138  * but the correctness of caching cannot hinge upon this. For
2139  * comparison caching, buffer state is only trusted if cache_blob is
2140  * found set to false, whereas strxfrm() caching only trusts the state
2141  * when cache_blob is found set to true.
2142  *
2143  * Arbitrarily initialize cache_blob to true.
2144  */
2145  sss->cache_blob = true;
2146  sss->collate_c = collate_c;
2147  sss->typid = typid;
2148  ssup->ssup_extra = sss;
2149 
2150  /*
2151  * If possible, plan to use the abbreviated keys optimization. The
2152  * core code may switch back to authoritative comparator should
2153  * abbreviation be aborted.
2154  */
2155  if (abbreviate)
2156  {
2157  sss->prop_card = 0.20;
2158  initHyperLogLog(&sss->abbr_card, 10);
2159  initHyperLogLog(&sss->full_card, 10);
2160  ssup->abbrev_full_comparator = ssup->comparator;
2164  }
2165  }
2166 }
2167 
2168 /*
2169  * sortsupport comparison func (for C locale case)
2170  */
2171 static int
2173 {
2174  VarString *arg1 = DatumGetVarStringPP(x);
2175  VarString *arg2 = DatumGetVarStringPP(y);
2176  char *a1p,
2177  *a2p;
2178  int len1,
2179  len2,
2180  result;
2181 
2182  a1p = VARDATA_ANY(arg1);
2183  a2p = VARDATA_ANY(arg2);
2184 
2185  len1 = VARSIZE_ANY_EXHDR(arg1);
2186  len2 = VARSIZE_ANY_EXHDR(arg2);
2187 
2188  result = memcmp(a1p, a2p, Min(len1, len2));
2189  if ((result == 0) && (len1 != len2))
2190  result = (len1 < len2) ? -1 : 1;
2191 
2192  /* We can't afford to leak memory here. */
2193  if (PointerGetDatum(arg1) != x)
2194  pfree(arg1);
2195  if (PointerGetDatum(arg2) != y)
2196  pfree(arg2);
2197 
2198  return result;
2199 }
2200 
2201 /*
2202  * sortsupport comparison func (for BpChar C locale case)
2203  *
2204  * BpChar outsources its sortsupport to this module. Specialization for the
2205  * varstr_sortsupport BpChar case, modeled on
2206  * internal_bpchar_pattern_compare().
2207  */
2208 static int
2210 {
2211  BpChar *arg1 = DatumGetBpCharPP(x);
2212  BpChar *arg2 = DatumGetBpCharPP(y);
2213  char *a1p,
2214  *a2p;
2215  int len1,
2216  len2,
2217  result;
2218 
2219  a1p = VARDATA_ANY(arg1);
2220  a2p = VARDATA_ANY(arg2);
2221 
2222  len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2223  len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2224 
2225  result = memcmp(a1p, a2p, Min(len1, len2));
2226  if ((result == 0) && (len1 != len2))
2227  result = (len1 < len2) ? -1 : 1;
2228 
2229  /* We can't afford to leak memory here. */
2230  if (PointerGetDatum(arg1) != x)
2231  pfree(arg1);
2232  if (PointerGetDatum(arg2) != y)
2233  pfree(arg2);
2234 
2235  return result;
2236 }
2237 
2238 /*
2239  * sortsupport comparison func (for NAME C locale case)
2240  */
2241 static int
2243 {
2244  Name arg1 = DatumGetName(x);
2245  Name arg2 = DatumGetName(y);
2246 
2247  return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2248 }
2249 
2250 /*
2251  * sortsupport comparison func (for locale case with all varlena types)
2252  */
2253 static int
2255 {
2256  VarString *arg1 = DatumGetVarStringPP(x);
2257  VarString *arg2 = DatumGetVarStringPP(y);
2258  char *a1p,
2259  *a2p;
2260  int len1,
2261  len2,
2262  result;
2263 
2264  a1p = VARDATA_ANY(arg1);
2265  a2p = VARDATA_ANY(arg2);
2266 
2267  len1 = VARSIZE_ANY_EXHDR(arg1);
2268  len2 = VARSIZE_ANY_EXHDR(arg2);
2269 
2270  result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2271 
2272  /* We can't afford to leak memory here. */
2273  if (PointerGetDatum(arg1) != x)
2274  pfree(arg1);
2275  if (PointerGetDatum(arg2) != y)
2276  pfree(arg2);
2277 
2278  return result;
2279 }
2280 
2281 /*
2282  * sortsupport comparison func (for locale case with NAME type)
2283  */
2284 static int
2286 {
2287  Name arg1 = DatumGetName(x);
2288  Name arg2 = DatumGetName(y);
2289 
2290  return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2291  NameStr(*arg2), strlen(NameStr(*arg2)),
2292  ssup);
2293 }
2294 
2295 /*
2296  * sortsupport comparison func for locale cases
      *
      * a1p/len1 and a2p/len2 are the two strings and their lengths in bytes.
      * They are copied into sss->buf1/buf2 and NUL-terminated there, so the
      * inputs themselves need no terminator.  ssup supplies the memory context
      * (ssup_cxt) used whenever one of those buffers has to grow.
      *
      * Returns 0 at once for bytewise-identical inputs.  Otherwise returns the
      * collation provider's verdict (ICU, strcoll_l() or strcoll()), falling
      * back to strcmp() on a collation tie unless the locale is
      * nondeterministic.  The previous pair of strings and its result are
      * remembered so that an immediately repeated comparison can skip the
      * collation call.
      *
      * NOTE(review): the declaration of 'sss' (listing line 2301) is absent
      * from this extract; presumably it is the VarStringSortSupport state
      * reached through ssup->ssup_extra -- confirm against upstream.
2297  */
2298 static int
2299 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2300 {
2302  int result;
2303  bool arg1_match;
2304 
2305  /* Fast pre-check for equality, as discussed in varstr_cmp() */
2306  if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2307  {
2308  /*
2309  * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2310  * last_len2. Existing contents of buffers might still be used by
2311  * next call.
2312  *
2313  * It's fine to allow the comparison of BpChar padding bytes here,
2314  * even though that implies that the memcmp() will usually be
2315  * performed for BpChar callers (though multibyte characters could
2316  * still prevent that from occurring). The memcmp() is still very
2317  * cheap, and BpChar's funny semantics have us remove trailing spaces
2318  * (not limited to padding), so we need make no distinction between
2319  * padding space characters and "real" space characters.
2320  */
2321  return 0;
2322  }
2323 
2324  if (sss->typid == BPCHAROID)
2325  {
2326  /* Get true number of bytes, ignoring trailing spaces */
2327  len1 = bpchartruelen(a1p, len1);
2328  len2 = bpchartruelen(a2p, len2);
2329  }
2330 
      /*
       * Each buffer must hold the string plus a terminating NUL, hence the
       * ">=" test and the "+ 1".  Growth at least doubles the buffer (capped at
       * MaxAllocSize) and the replacement is allocated in ssup->ssup_cxt; the
       * old contents are discarded, not copied.
       */
2331  if (len1 >= sss->buflen1)
2332  {
2333  pfree(sss->buf1);
2334  sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2335  sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2336  }
2337  if (len2 >= sss->buflen2)
2338  {
2339  pfree(sss->buf2);
2340  sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2341  sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2342  }
2343 
2344  /*
2345  * We're likely to be asked to compare the same strings repeatedly, and
2346  * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2347  * comparisons, even though in general there is no reason to think that
2348  * that will work out (every string datum may be unique). Caching does
2349  * not slow things down measurably when it doesn't work out, and can speed
2350  * things up by rather a lot when it does. In part, this is because the
2351  * memcmp() compares data from cachelines that are needed in L1 cache even
2352  * when the last comparison's result cannot be reused.
2353  */
2354  arg1_match = true;
2355  if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2356  {
2357  arg1_match = false;
2358  memcpy(sss->buf1, a1p, len1);
2359  sss->buf1[len1] = '\0';
2360  sss->last_len1 = len1;
2361  }
2362 
2363  /*
2364  * If we're comparing the same two strings as last time, we can return the
2365  * same answer without calling strcoll() again. This is more likely than
2366  * it seems (at least with moderate to low cardinality sets), because
2367  * quicksort compares the same pivot against many values.
2368  */
2369  if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2370  {
2371  memcpy(sss->buf2, a2p, len2);
2372  sss->buf2[len2] = '\0';
2373  sss->last_len2 = len2;
2374  }
      /*
       * cache_blob is set by varstr_abbrev_convert() and cleared at the bottom
       * of this function; last_returned is only trusted while it is clear.
       */
2375  else if (arg1_match && !sss->cache_blob)
2376  {
2377  /* Use result cached following last actual strcoll() call */
2378  return sss->last_returned;
2379  }
2380 
2381  if (sss->locale)
2382  {
2383  if (sss->locale->provider == COLLPROVIDER_ICU)
2384  {
2385 #ifdef USE_ICU
2386 #ifdef HAVE_UCOL_STRCOLLUTF8
2387  if (GetDatabaseEncoding() == PG_UTF8)
2388  {
2389  UErrorCode status;
2390 
2391  status = U_ZERO_ERROR;
2392  result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
2393  a1p, len1,
2394  a2p, len2,
2395  &status);
2396  if (U_FAILURE(status))
2397  ereport(ERROR,
2398  (errmsg("collation failed: %s", u_errorName(status))));
2399  }
2400  else
2401 #endif
2402  {
      /* Otherwise convert both strings to UChar and compare those. */
2403  int32_t ulen1,
2404  ulen2;
2405  UChar *uchar1,
2406  *uchar2;
2407 
2408  ulen1 = icu_to_uchar(&uchar1, a1p, len1);
2409  ulen2 = icu_to_uchar(&uchar2, a2p, len2);
2410 
2411  result = ucol_strcoll(sss->locale->info.icu.ucol,
2412  uchar1, ulen1,
2413  uchar2, ulen2);
2414 
2415  pfree(uchar1);
2416  pfree(uchar2);
2417  }
2418 #else /* not USE_ICU */
2419  /* shouldn't happen */
2420  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2421 #endif /* not USE_ICU */
2422  }
2423  else
2424  {
2425 #ifdef HAVE_LOCALE_T
2426  result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
2427 #else
2428  /* shouldn't happen */
2429  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2430 #endif
2431  }
2432  }
2433  else
2434  result = strcoll(sss->buf1, sss->buf2);
2435 
2436  /* Break tie if necessary. */
2437  if (result == 0 &&
2438  (!sss->locale || sss->locale->deterministic))
2439  result = strcmp(sss->buf1, sss->buf2);
2440 
2441  /* Cache result, perhaps saving an expensive strcoll() call next time */
2442  sss->cache_blob = false;
2443  sss->last_returned = result;
2444  return result;
2445 }
2446 
2447 /*
2448  * Conversion routine for sortsupport. Converts original to abbreviated key
2449  * representation. Our encoding strategy is simple -- pack the first 8 bytes
2450  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2451  * stored in reverse order), and treat it as an unsigned integer. When the "C"
2452  * locale is used, or in case of bytea, just memcpy() from original instead.
      *
      * 'original' is the full datum.  The return value is the abbreviated key:
      * up to sizeof(Datum) leading bytes of either the raw string (C collation)
      * or its transformed blob, zero-padded, then passed through
      * DatumBigEndianToNative().  As a side effect, hashes of the full and the
      * abbreviated key are added to sss->full_card and sss->abbr_card; that
      * step is skipped when the blob cached by the previous call is reused.
      *
      * NOTE(review): listing lines 2455, 2457, 2547, 2645, 2648 and 2663 are
      * absent from this extract.  Not visible here, therefore: the declarator,
      * the declaration of 'sss', the tail of the ICU/non-UTF8 condition, the
      * length argument of hash_any(), the statement guarded by the
      * PG_CACHE_LINE_SIZE test, and the hashing branch for Datums that are not
      * 8 bytes wide.  Confirm all of these against upstream.
2453  */
2454 static Datum
2456 {
2458  VarString *authoritative = DatumGetVarStringPP(original);
2459  char *authoritative_data = VARDATA_ANY(authoritative);
2460 
2461  /* working state */
2462  Datum res;
2463  char *pres;
2464  int len;
2465  uint32 hash;
2466 
      /* 'pres' gives byte-wise access to the Datum being built. */
2467  pres = (char *) &res;
2468  /* memset(), so any non-overwritten bytes are NUL */
2469  memset(pres, 0, sizeof(Datum));
2470  len = VARSIZE_ANY_EXHDR(authoritative);
2471 
2472  /* Get number of bytes, ignoring trailing spaces */
2473  if (sss->typid == BPCHAROID)
2474  len = bpchartruelen(authoritative_data, len);
2475 
2476  /*
2477  * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2478  * abbreviate keys. The full comparator for the C locale is always
2479  * memcmp(). It would be incorrect to allow bytea callers (callers that
2480  * always force the C collation -- bytea isn't a collatable type, but this
2481  * approach is convenient) to use strxfrm(). This is because bytea
2482  * strings may contain NUL bytes. Besides, this should be faster, too.
2483  *
2484  * More generally, it's okay that bytea callers can have NUL bytes in
2485  * strings because abbreviated cmp need not make a distinction between
2486  * terminating NUL bytes, and NUL bytes representing actual NULs in the
2487  * authoritative representation. Hopefully a comparison at or past one
2488  * abbreviated key's terminating NUL byte will resolve the comparison
2489  * without consulting the authoritative representation; specifically, some
2490  * later non-NUL byte in the longer string can resolve the comparison
2491  * against a subsequent terminating NUL in the shorter string. There will
2492  * usually be what is effectively a "length-wise" resolution there and
2493  * then.
2494  *
2495  * If that doesn't work out -- if all bytes in the longer string
2496  * positioned at or past the offset of the smaller string's (first)
2497  * terminating NUL are actually representative of NUL bytes in the
2498  * authoritative binary string (perhaps with some *terminating* NUL bytes
2499  * towards the end of the longer string iff it happens to still be small)
2500  * -- then an authoritative tie-breaker will happen, and do the right
2501  * thing: explicitly consider string length.
2502  */
2503  if (sss->collate_c)
2504  memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2505  else
2506  {
2507  Size bsize;
2508 #ifdef USE_ICU
2509  int32_t ulen = -1;
2510  UChar *uchar = NULL;
2511 #endif
2512 
2513  /*
2514  * We're not using the C collation, so fall back on strxfrm or ICU
2515  * analogs.
2516  */
2517 
2518  /* By convention, we use buffer 1 to store and NUL-terminate */
2519  if (len >= sss->buflen1)
2520  {
2521  pfree(sss->buf1);
2522  sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2523  sss->buf1 = palloc(sss->buflen1);
2524  }
2525 
2526  /* Might be able to reuse strxfrm() blob from last call */
2527  if (sss->last_len1 == len && sss->cache_blob &&
2528  memcmp(sss->buf1, authoritative_data, len) == 0)
2529  {
      /* buf2 still holds the blob for this string; last_len2 is its size. */
2530  memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2531  /* No change affecting cardinality, so no hashing required */
2532  goto done;
2533  }
2534 
2535  memcpy(sss->buf1, authoritative_data, len);
2536 
2537  /*
2538  * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2539  * necessary for ICU, but doesn't hurt.
2540  */
2541  sss->buf1[len] = '\0';
2542  sss->last_len1 = len;
2543 
2544 #ifdef USE_ICU
2545  /* When using ICU and not UTF8, convert string to UChar. */
2546  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2548  ulen = icu_to_uchar(&uchar, sss->buf1, len);
2549 #endif
2550 
2551  /*
2552  * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2553  * and try again. Both of these functions have the result buffer
2554  * content undefined if the result did not fit, so we need to retry
2555  * until everything fits, even though we only need the first few bytes
2556  * in the end. When using ucol_nextSortKeyPart(), however, we only
2557  * ask for as many bytes as we actually need.
2558  */
2559  for (;;)
2560  {
2561 #ifdef USE_ICU
2562  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2563  {
2564  /*
2565  * When using UTF8, use the iteration interface so we only
2566  * need to produce as many bytes as we actually need.
2567  */
2568  if (GetDatabaseEncoding() == PG_UTF8)
2569  {
2570  UCharIterator iter;
2571  uint32_t state[2];
2572  UErrorCode status;
2573 
2574  uiter_setUTF8(&iter, sss->buf1, len);
2575  state[0] = state[1] = 0; /* won't need that again */
2576  status = U_ZERO_ERROR;
2577  bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2578  &iter,
2579  state,
2580  (uint8_t *) sss->buf2,
2581  Min(sizeof(Datum), sss->buflen2),
2582  &status);
2583  if (U_FAILURE(status))
2584  ereport(ERROR,
2585  (errmsg("sort key generation failed: %s",
2586  u_errorName(status))));
2587  }
2588  else
2589  bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2590  uchar, ulen,
2591  (uint8_t *) sss->buf2, sss->buflen2);
2592  }
2593  else
2594 #endif
2595 #ifdef HAVE_LOCALE_T
2596  if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2597  bsize = strxfrm_l(sss->buf2, sss->buf1,
2598  sss->buflen2, sss->locale->info.lt);
2599  else
2600 #endif
2601  bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2602 
      /* A size that fits strictly inside buf2 ends the retry loop. */
2603  sss->last_len2 = bsize;
2604  if (bsize < sss->buflen2)
2605  break;
2606 
2607  /*
2608  * Grow buffer and retry.
2609  */
2610  pfree(sss->buf2);
2611  sss->buflen2 = Max(bsize + 1,
2612  Min(sss->buflen2 * 2, MaxAllocSize));
2613  sss->buf2 = palloc(sss->buflen2);
2614  }
2615 
2616  /*
2617  * Every Datum byte is always compared. This is safe because the
2618  * strxfrm() blob is itself NUL terminated, leaving no danger of
2619  * misinterpreting any NUL bytes not intended to be interpreted as
2620  * logically representing termination.
2621  *
2622  * (Actually, even if there were NUL bytes in the blob it would be
2623  * okay. See remarks on bytea case above.)
2624  */
2625  memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2626 
2627 #ifdef USE_ICU
2628  if (uchar)
2629  pfree(uchar);
2630 #endif
2631  }
2632 
2633  /*
2634  * Maintain approximate cardinality of both abbreviated keys and original,
2635  * authoritative keys using HyperLogLog. Used as cheap insurance against
2636  * the worst case, where we do many string transformations for no saving
2637  * in full strcoll()-based comparisons. These statistics are used by
2638  * varstr_abbrev_abort().
2639  *
2640  * First, Hash key proper, or a significant fraction of it. Mix in length
2641  * in order to compensate for cases where differences are past
2642  * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2643  */
2644  hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2646 
2647  if (len > PG_CACHE_LINE_SIZE)
2649 
2650  addHyperLogLog(&sss->full_card, hash);
2651 
2652  /* Hash abbreviated key */
2653 #if SIZEOF_DATUM == 8
2654  {
      /* Fold the 64-bit key to 32 bits by XORing its halves, then hash. */
2655  uint32 lohalf,
2656  hihalf;
2657 
2658  lohalf = (uint32) res;
2659  hihalf = (uint32) (res >> 32);
2660  hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2661  }
2662 #else /* SIZEOF_DATUM != 8 */
2664 #endif
2665 
2666  addHyperLogLog(&sss->abbr_card, hash);
2667 
2668  /* Cache result, perhaps saving an expensive strxfrm() call next time */
2669  sss->cache_blob = true;
2670 done:
2671 
2672  /*
2673  * Byteswap on little-endian machines.
2674  *
2675  * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
2676  * 3-way comparator) works correctly on all platforms. If we didn't do
2677  * this, the comparator would have to call memcmp() with a pair of
2678  * pointers to the first byte of each abbreviated key, which is slower.
2679  */
2680  res = DatumBigEndianToNative(res);
2681 
2682  /* Don't leak memory here */
2683  if (PointerGetDatum(authoritative) != original)
2684  pfree(authoritative);
2685 
2686  return res;
2687 }
2688 
2689 /*
2690  * Callback for estimating effectiveness of abbreviated key optimization, using
2691  * heuristic rules. Returns value indicating if the abbreviation optimization
2692  * should be aborted, based on its projected effectiveness.
      *
      * memtupcount is the number of tuples seen so far.  The function never
      * aborts below 100 tuples.  Returns true to abort abbreviation and false
      * to keep going.  When it keeps going with more than 10,000 tuples seen,
      * sss->prop_card is scaled by 0.65, lowering the bar for the next call.
      *
      * NOTE(review): the declaration of 'sss' (listing line 2697) is absent
      * from this extract; presumably it is the VarStringSortSupport state
      * reached through ssup->ssup_extra -- confirm against upstream.
2693  */
2694 static bool
2695 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2696 {
2698  double abbrev_distinct,
2699  key_distinct;
2700 
2701  Assert(ssup->abbreviate);
2702 
2703  /* Have a little patience */
2704  if (memtupcount < 100)
2705  return false;
2706 
      /* Estimates come from the counters fed by varstr_abbrev_convert(). */
2707  abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2708  key_distinct = estimateHyperLogLog(&sss->full_card);
2709 
2710  /*
2711  * Clamp cardinality estimates to at least one distinct value. While
2712  * NULLs are generally disregarded, if only NULL values were seen so far,
2713  * that might misrepresent costs if we failed to clamp.
2714  */
2715  if (abbrev_distinct <= 1.0)
2716  abbrev_distinct = 1.0;
2717 
2718  if (key_distinct <= 1.0)
2719  key_distinct = 1.0;
2720 
2721  /*
2722  * In the worst case all abbreviated keys are identical, while at the same
2723  * time there are differences within full key strings not captured in
2724  * abbreviations.
2725  */
2726 #ifdef TRACE_SORT
2727  if (trace_sort)
2728  {
2729  double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2730 
2731  elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2732  "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2733  memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2734  sss->prop_card);
2735  }
2736 #endif
2737 
2738  /*
2739  * If the number of distinct abbreviated keys approximately matches the
2740  * number of distinct authoritative original keys, that's reason enough to
2741  * proceed. We can win even with a very low cardinality set if most
2742  * tie-breakers only memcmp(). This is by far the most important
2743  * consideration.
2744  *
2745  * While comparisons that are resolved at the abbreviated key level are
2746  * considerably cheaper than tie-breakers resolved with memcmp(), both of
2747  * those two outcomes are so much cheaper than a full strcoll() once
2748  * sorting is underway that it doesn't seem worth it to weigh abbreviated
2749  * cardinality against the overall size of the set in order to more
2750  * accurately model costs. Assume that an abbreviated comparison, and an
2751  * abbreviated comparison with a cheap memcmp()-based authoritative
2752  * resolution are equivalent.
2753  */
2754  if (abbrev_distinct > key_distinct * sss->prop_card)
2755  {
2756  /*
2757  * When we have exceeded 10,000 tuples, decay required cardinality
2758  * aggressively for next call.
2759  *
2760  * This is useful because the number of comparisons required on
2761  * average increases at a linearithmic rate, and at roughly 10,000
2762  * tuples that factor will start to dominate over the linear costs of
2763  * string transformation (this is a conservative estimate). The decay
2764  * rate is chosen to be a little less aggressive than halving -- which
2765  * (since we're called at points at which memtupcount has doubled)
2766  * would never see the cost model actually abort past the first call
2767  * following a decay. This decay rate is mostly a precaution against
2768  * a sudden, violent swing in how well abbreviated cardinality tracks
2769  * full key cardinality. The decay also serves to prevent a marginal
2770  * case from being aborted too late, when too much has already been
2771  * invested in string transformation.
2772  *
2773  * It's possible for sets of several million distinct strings with
2774  * mere tens of thousands of distinct abbreviated keys to still
2775  * benefit very significantly. This will generally occur provided
2776  * each abbreviated key is a proxy for a roughly uniform number of the
2777  * set's full keys. If it isn't so, we hope to catch that early and
2778  * abort. If it isn't caught early, by the time the problem is
2779  * apparent it's probably not worth aborting.
2780  */
2781  if (memtupcount > 10000)
2782  sss->prop_card *= 0.65;
2783 
2784  return false;
2785  }
2786 
2787  /*
2788  * Abort abbreviation strategy.
2789  *
2790  * The worst case, where all abbreviated keys are identical while all
2791  * original strings differ will typically only see a regression of about
2792  * 10% in execution time for small to medium sized lists of strings.
2793  * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2794  * often expect very large improvements, particularly with sets of strings
2795  * of moderately high to high abbreviated cardinality. There is little to
2796  * lose but much to gain, which our strategy reflects.
2797  */
2798 #ifdef TRACE_SORT
2799  if (trace_sort)
2800  elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2801  "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2802  memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2803 #endif
2804 
2805  return true;
2806 }
2807 
2808 /*
2809  * Generic equalimage support function for character type's operator classes.
2810  * Disables the use of deduplication with nondeterministic collations.
      *
      * Errors out through check_collation_set() when no collation was supplied.
      * Returns true when the collation is C or is the database default
      * collation, and for one further condition that is not visible here;
      * returns false otherwise.
      *
      * NOTE(review): the declarator (listing line 2813) and the last operand
      * of the condition (listing line 2822) are absent from this extract.
      * Given the header above, the missing operand presumably tests that the
      * collation is deterministic -- confirm against upstream.
2811  */
2812 Datum
2814 {
2815  /* Oid opcintype = PG_GETARG_OID(0); */
2816  Oid collid = PG_GET_COLLATION();
2817 
2818  check_collation_set(collid);
2819 
2820  if (lc_collate_is_c(collid) ||
2821  collid == DEFAULT_COLLATION_OID ||
2823  PG_RETURN_BOOL(true);
2824  else
2825  PG_RETURN_BOOL(false);
2826 }
2827 
/*
 * Returns the first text argument only when it compares strictly greater than
 * the second under the call's collation; on a tie, or when it is smaller, the
 * second argument is returned.  The chosen input pointer itself is returned,
 * not a copy.
 *
 * NOTE(review): the declarator line (listing line 2829) is absent from this
 * extract; presumably text_larger(PG_FUNCTION_ARGS) -- confirm against
 * upstream.
 */
2828 Datum
2830 {
2831  text *arg1 = PG_GETARG_TEXT_PP(0);
2832  text *arg2 = PG_GETARG_TEXT_PP(1);
2833  text *result;
2834 
2835  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2836 
2837  PG_RETURN_TEXT_P(result);
2838 }
2839 
/*
 * Returns the first text argument only when it compares strictly less than
 * the second under the call's collation; on a tie, or when it is greater, the
 * second argument is returned.  The chosen input pointer itself is returned,
 * not a copy.
 *
 * NOTE(review): the declarator line (listing line 2841) is absent from this
 * extract; presumably text_smaller(PG_FUNCTION_ARGS) -- confirm against
 * upstream.
 */
2840 Datum
2842 {
2843  text *arg1 = PG_GETARG_TEXT_PP(0);
2844  text *arg2 = PG_GETARG_TEXT_PP(1);
2845  text *result;
2846 
2847  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2848 
2849  PG_RETURN_TEXT_P(result);
2850 }
2851 
2852 
2853 /*
2854  * Cross-type comparison functions for types text and name.
2855  */
2856 
/*
 * Equality test between a name (argument 0) and a text (argument 1).
 *
 * The name's length is taken with strlen(); the text's is its payload size.
 * Under C_COLLATION_OID the answer is a length check followed by memcmp();
 * any other collation goes through varstr_cmp() and tests for a zero result.
 * Errors out through check_collation_set() when no collation was supplied.
 *
 * NOTE(review): the declarator line (listing line 2858) is absent from this
 * extract; presumably nameeqtext(PG_FUNCTION_ARGS) -- confirm against
 * upstream.
 */
2857 Datum
2859 {
2860  Name arg1 = PG_GETARG_NAME(0);
2861  text *arg2 = PG_GETARG_TEXT_PP(1);
2862  size_t len1 = strlen(NameStr(*arg1));
2863  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2864  Oid collid = PG_GET_COLLATION();
2865  bool result;
2866 
2867  check_collation_set(collid);
2868 
2869  if (collid == C_COLLATION_OID)
2870  result = (len1 == len2 &&
2871  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2872  else
2873  result = (varstr_cmp(NameStr(*arg1), len1,
2874  VARDATA_ANY(arg2), len2,
2875  collid) == 0);
2876 
      /* Only the text argument can have been copied when it was fetched. */
2877  PG_FREE_IF_COPY(arg2, 1);
2878 
2879  PG_RETURN_BOOL(result);
2880 }
2881 
/*
 * Equality test between a text (argument 0) and a name (argument 1).
 *
 * The text's length is its payload size; the name's is taken with strlen().
 * Under C_COLLATION_OID the answer is a length check followed by memcmp();
 * any other collation goes through varstr_cmp() and tests for a zero result.
 * Errors out through check_collation_set() when no collation was supplied.
 *
 * NOTE(review): the declarator line (listing line 2883) is absent from this
 * extract; presumably texteqname(PG_FUNCTION_ARGS) -- confirm against
 * upstream.
 */
2882 Datum
2884 {
2885  text *arg1 = PG_GETARG_TEXT_PP(0);
2886  Name arg2 = PG_GETARG_NAME(1);
2887  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2888  size_t len2 = strlen(NameStr(*arg2));
2889  Oid collid = PG_GET_COLLATION();
2890  bool result;
2891 
2892  check_collation_set(collid);
2893 
2894  if (collid == C_COLLATION_OID)
2895  result = (len1 == len2 &&
2896  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2897  else
2898  result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2899  NameStr(*arg2), len2,
2900  collid) == 0);
2901 
      /* Only the text argument can have been copied when it was fetched. */
2902  PG_FREE_IF_COPY(arg1, 0);
2903 
2904  PG_RETURN_BOOL(result);
2905 }
2906 
/*
 * Inequality test between a name (argument 0) and a text (argument 1): the
 * logical negation of the equality test performed on the same two paths
 * (length check plus memcmp() under C_COLLATION_OID, varstr_cmp() otherwise).
 * Errors out through check_collation_set() when no collation was supplied.
 *
 * NOTE(review): the declarator line (listing line 2908) is absent from this
 * extract; presumably namenetext(PG_FUNCTION_ARGS) -- confirm against
 * upstream.
 */
2907 Datum
2909 {
2910  Name arg1 = PG_GETARG_NAME(0);
2911  text *arg2 = PG_GETARG_TEXT_PP(1);
2912  size_t len1 = strlen(NameStr(*arg1));
2913  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2914  Oid collid = PG_GET_COLLATION();
2915  bool result;
2916 
2917  check_collation_set(collid);
2918 
2919  if (collid == C_COLLATION_OID)
2920  result = !(len1 == len2 &&
2921  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2922  else
2923  result = !(varstr_cmp(NameStr(*arg1), len1,
2924  VARDATA_ANY(arg2), len2,
2925  collid) == 0);
2926 
2927  PG_FREE_IF_COPY(arg2, 1);
2928 
2929  PG_RETURN_BOOL(result);
2930 }
2931 
/*
 * Inequality test between a text (argument 0) and a name (argument 1): the
 * logical negation of the equality test performed on the same two paths
 * (length check plus memcmp() under C_COLLATION_OID, varstr_cmp() otherwise).
 * Errors out through check_collation_set() when no collation was supplied.
 *
 * NOTE(review): the declarator line (listing line 2933) is absent from this
 * extract; presumably textnename(PG_FUNCTION_ARGS) -- confirm against
 * upstream.
 */
2932 Datum
2934 {
2935  text *arg1 = PG_GETARG_TEXT_PP(0);
2936  Name arg2 = PG_GETARG_NAME(1);
2937  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2938  size_t len2 = strlen(NameStr(*arg2));
2939  Oid collid = PG_GET_COLLATION();
2940  bool result;
2941 
2942  check_collation_set(collid);
2943 
2944  if (collid == C_COLLATION_OID)
2945  result = !(len1 == len2 &&
2946  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2947  else
2948  result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2949  NameStr(*arg2), len2,
2950  collid) == 0);
2951 
2952  PG_FREE_IF_COPY(arg1, 0);
2953 
2954  PG_RETURN_BOOL(result);
2955 }
2956 
/*
 * Three-way comparison of a name (argument 0) against a text (argument 1).
 * Returns, as an int32, whatever varstr_cmp() yields for the name's
 * strlen()-delimited bytes versus the text's payload under the call's
 * collation.
 *
 * NOTE(review): the declarator line (listing line 2958) is absent from this
 * extract; presumably btnametextcmp(PG_FUNCTION_ARGS) -- confirm against
 * upstream.
 */
2957 Datum
2959 {
2960  Name arg1 = PG_GETARG_NAME(0);
2961  text *arg2 = PG_GETARG_TEXT_PP(1);
2962  int32 result;
2963 
2964  result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2965  VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2966  PG_GET_COLLATION());
2967 
2968  PG_FREE_IF_COPY(arg2, 1);
2969 
2970  PG_RETURN_INT32(result);
2971 }
2972 
/*
 * Three-way comparison of a text (argument 0) against a name (argument 1).
 * Returns, as an int32, whatever varstr_cmp() yields for the text's payload
 * versus the name's strlen()-delimited bytes under the call's collation.
 *
 * NOTE(review): the declarator line (listing line 2974) is absent from this
 * extract; presumably bttextnamecmp(PG_FUNCTION_ARGS) -- confirm against
 * upstream.
 */
2973 Datum
2975 {
2976  text *arg1 = PG_GETARG_TEXT_PP(0);
2977  Name arg2 = PG_GETARG_NAME(1);
2978  int32 result;
2979 
2980  result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2981  NameStr(*arg2), strlen(NameStr(*arg2)),
2982  PG_GET_COLLATION());
2983 
2984  PG_FREE_IF_COPY(arg1, 0);
2985 
2986  PG_RETURN_INT32(result);
2987 }
2988 
/*
 * CmpCall(cmpfunc) invokes the given comparison function through
 * DirectFunctionCall2Coll(), forwarding the current call's collation and its
 * first two argument datums, and yields the result as an int32.  Because it
 * expands PG_GET_COLLATION() and PG_GETARG_DATUM(), it must be used inside an
 * fmgr-callable function.  It is #undef'd once the wrappers below are
 * defined.
 *
 * NOTE(review): each of the eight definitions that follow has lost both its
 * declarator and its single body line in this extract (for the first one,
 * listing lines 2996 and 2998).  Presumably they are the name-vs-text and
 * text-vs-name <, <=, >, >= operators, each comparing a CmpCall() result
 * against zero -- confirm against upstream.
 */
2989 #define CmpCall(cmpfunc) \
2990  DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2991  PG_GET_COLLATION(), \
2992  PG_GETARG_DATUM(0), \
2993  PG_GETARG_DATUM(1)))
2994 
2995 Datum
2997 {
2999 }
3000 
3001 Datum
3003 {
3005 }
3006 
3007 Datum
3009 {
3011 }
3012 
3013 Datum
3015 {
3017 }
3018 
3019 Datum
3021 {
3023 }
3024 
3025 Datum
3027 {
3029 }
3030 
3031 Datum
3033 {
3035 }
3036 
3037 Datum
3039 {
3041 }
3042 
3043 #undef CmpCall
3044 
3045 
3046 /*
3047  * The following operators support character-by-character comparison
3048  * of text datums, to allow building indexes suitable for LIKE clauses.
3049  * Note that the regular texteq/textne comparison operators, and regular
3050  * support functions 1 and 2 with "C" collation are assumed to be
3051  * compatible with these!
3052  */
3053 
/*
 * Byte-wise three-way comparison of two text values, ignoring collation.
 *
 * The common prefix (the shorter of the two payload lengths) is compared with
 * memcmp(); a nonzero memcmp() result is returned unchanged, so callers may
 * only rely on its sign.  If the prefixes match, the shorter string sorts
 * first: -1, 1, or 0 when the lengths are equal too.
 *
 * NOTE(review): the declarator line (listing line 3055) is absent from this
 * extract; the callers below invoke it as
 * internal_text_pattern_compare(arg1, arg2) with two text pointers.
 */
3054 static int
3056 {
3057  int result;
3058  int len1,
3059  len2;
3060 
3061  len1 = VARSIZE_ANY_EXHDR(arg1);
3062  len2 = VARSIZE_ANY_EXHDR(arg2);
3063 
3064  result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3065  if (result != 0)
3066  return result;
3067  else if (len1 < len2)
3068  return -1;
3069  else if (len1 > len2)
3070  return 1;
3071  else
3072  return 0;
3073 }
3074 
3075 
/*
 * Byte-wise "less than" for two text arguments: true when
 * internal_text_pattern_compare() returns a negative value.  Detoasted copies
 * of the arguments are freed before returning.
 *
 * NOTE(review): the declarator line (listing line 3077) is absent from this
 * extract; presumably text_pattern_lt(PG_FUNCTION_ARGS) -- confirm against
 * upstream.
 */
3076 Datum
3078 {
3079  text *arg1 = PG_GETARG_TEXT_PP(0);
3080  text *arg2 = PG_GETARG_TEXT_PP(1);
3081  int result;
3082 
3083  result = internal_text_pattern_compare(arg1, arg2);
3084 
3085  PG_FREE_IF_COPY(arg1, 0);
3086  PG_FREE_IF_COPY(arg2, 1);
3087 
3088  PG_RETURN_BOOL(result < 0);
3089 }
3090 
3091 
/*
 * Byte-wise "less than or equal" for two text arguments: true when
 * internal_text_pattern_compare() returns zero or a negative value.
 * Detoasted copies of the arguments are freed before returning.
 *
 * NOTE(review): the declarator line (listing line 3093) is absent from this
 * extract; presumably text_pattern_le(PG_FUNCTION_ARGS) -- confirm against
 * upstream.
 */
3092 Datum
3094 {
3095  text *arg1 = PG_GETARG_TEXT_PP(0);
3096  text *arg2 = PG_GETARG_TEXT_PP(1);
3097  int result;
3098 
3099  result = internal_text_pattern_compare(arg1, arg2);
3100 
3101  PG_FREE_IF_COPY(arg1, 0);
3102  PG_FREE_IF_COPY(arg2, 1);
3103 
3104  PG_RETURN_BOOL(result <= 0);
3105 }
3106 
3107 
/*
 * Byte-wise "greater than or equal" for two text arguments: true when
 * internal_text_pattern_compare() returns zero or a positive value.
 * Detoasted copies of the arguments are freed before returning.
 *
 * NOTE(review): the declarator line (listing line 3109) is absent from this
 * extract; presumably text_pattern_ge(PG_FUNCTION_ARGS) -- confirm against
 * upstream.
 */
3108 Datum
3110 {
3111  text *arg1 = PG_GETARG_TEXT_PP(0);
3112  text *arg2 = PG_GETARG_TEXT_PP(1);
3113  int result;
3114 
3115  result = internal_text_pattern_compare(arg1, arg2);
3116 
3117  PG_FREE_IF_COPY(arg1, 0);
3118  PG_FREE_IF_COPY(arg2, 1);
3119 
3120  PG_RETURN_BOOL(result >= 0);
3121 }
3122 
3123 
/*
 * Byte-wise "greater than" for two text arguments: true when
 * internal_text_pattern_compare() returns a positive value.  Detoasted copies
 * of the arguments are freed before returning.
 *
 * NOTE(review): the declarator line (listing line 3125) is absent from this
 * extract; presumably text_pattern_gt(PG_FUNCTION_ARGS) -- confirm against
 * upstream.
 */
3124 Datum
3126 {
3127  text *arg1 = PG_GETARG_TEXT_PP(0);
3128  text *arg2 = PG_GETARG_TEXT_PP(1);
3129  int result;
3130 
3131  result = internal_text_pattern_compare(arg1, arg2);
3132 
3133  PG_FREE_IF_COPY(arg1, 0);
3134  PG_FREE_IF_COPY(arg2, 1);
3135 
3136  PG_RETURN_BOOL(result > 0);
3137 }
3138 
3139 
/*
 * Three-way byte-wise comparison of two text arguments: returns the value of
 * internal_text_pattern_compare() as an int32.  Detoasted copies of the
 * arguments are freed before returning.
 *
 * NOTE(review): the declarator line (listing line 3141) is absent from this
 * extract; presumably bttext_pattern_cmp(PG_FUNCTION_ARGS) -- confirm against
 * upstream.
 */
3140 Datum
3142 {
3143  text *arg1 = PG_GETARG_TEXT_PP(0);
3144  text *arg2 = PG_GETARG_TEXT_PP(1);
3145  int result;
3146 
3147  result = internal_text_pattern_compare(arg1, arg2);
3148 
3149  PG_FREE_IF_COPY(arg1, 0);
3150  PG_FREE_IF_COPY(arg2, 1);
3151 
3152  PG_RETURN_INT32(result);
3153 }
3154 
3155 
/*
 * Sort-support setup for the byte-wise text pattern operators.  Runs
 * varstr_sortsupport() for TEXTOID with the C collation forced, while
 * ssup->ssup_cxt is the current memory context, then restores the previous
 * context.  Returns void.
 *
 * NOTE(review): the declarator (listing line 3157) and the declaration of
 * 'ssup' (listing line 3159) are absent from this extract; presumably
 * bttext_pattern_sortsupport(PG_FUNCTION_ARGS) with ssup taken from argument
 * 0 -- confirm against upstream.
 */
3156 Datum
3158 {
3160  MemoryContext oldcontext;
3161 
3162  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3163 
3164  /* Use generic string SortSupport, forcing "C" collation */
3165  varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3166 
3167  MemoryContextSwitchTo(oldcontext);
3168 
3169  PG_RETURN_VOID();
3170 }
3171 
3172 
3173 /*-------------------------------------------------------------
3174  * byteaoctetlen
3175  *
3176  * get the number of bytes contained in an instance of type 'bytea'
      *
      * The argument is fetched as a raw Datum rather than as a detoasted
      * pointer.
      *
      * NOTE(review): the declarator (listing line 3180) and the return
      * statement (listing line 3185) are absent from this extract, so how the
      * length is derived from 'str' without detoasting cannot be seen here.
3177  *-------------------------------------------------------------
3178  */
3179 Datum
3181 {
3182  Datum str = PG_GETARG_DATUM(0);
3183 
3184  /* We need not detoast the input at all */
3186 }
3187 
3188 /*
3189  * byteacat -
3190  * takes two bytea* and returns a bytea* that is the concatenation of
3191  * the two.
3192  *
3193  * Cloned from textcat and modified as required.
      *
      * NOTE(review): the declarator (listing line 3196) and the return
      * statement (listing line 3201) are absent from this extract; per the
      * bytea_catenate() header below, the work is presumably delegated to
      * bytea_catenate(t1, t2) -- confirm against upstream.
3194  */
3195 Datum
3197 {
3198  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3199  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3200 
3202 }
3203 
3204 /*
3205  * bytea_catenate
3206  * Guts of byteacat(), broken out so it can be used by other functions
3207  *
3208  * Arguments can be in short-header form, but not compressed or out-of-line
      *
      * Returns a freshly palloc'd bytea whose payload is t1's bytes followed by
      * t2's bytes.  Neither input is modified or freed.
      *
      * NOTE(review): the declarator line (listing line 3211) is absent from
      * this extract; bytea_overlay() calls it as bytea_catenate(s1, t2) with
      * two bytea pointers.
3209  */
3210 static bytea *
3212 {
3213  bytea *result;
3214  int len1,
3215  len2,
3216  len;
3217  char *ptr;
3218 
3219  len1 = VARSIZE_ANY_EXHDR(t1);
3220  len2 = VARSIZE_ANY_EXHDR(t2);
3221 
3222  /* paranoia ... probably should throw error instead? */
3223  if (len1 < 0)
3224  len1 = 0;
3225  if (len2 < 0)
3226  len2 = 0;
3227 
      /* Total size includes the varlena header as well as both payloads. */
3228  len = len1 + len2 + VARHDRSZ;
3229  result = (bytea *) palloc(len);
3230 
3231  /* Set size of result string... */
3232  SET_VARSIZE(result, len);
3233 
3234  /* Fill data field of result string... */
3235  ptr = VARDATA(result);
3236  if (len1 > 0)
3237  memcpy(ptr, VARDATA_ANY(t1), len1);
3238  if (len2 > 0)
3239  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3240 
3241  return result;
3242 }
3243 
/*
 * Build a bytea from a C string by running it through the bytea input
 * function (byteain); used below to produce an empty bytea from "".
 */
3244 #define PG_STR_GET_BYTEA(str_) \
3245  DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3246 
3247 /*
3248  * bytea_substr()
3249  * Return a substring starting at the specified position.
3250  * Cloned from text_substr and modified as required.
3251  *
3252  * Input:
3253  * - string
3254  * - starting position (is one-based)
3255  * - string length (optional)
3256  *
3257  * If the starting position is zero or less, then return from the start of the string
3258  * adjusting the length to be consistent with the "negative start" per SQL.
3259  * If the length is less than zero, an ERROR is thrown. If no third argument
3260  * (length) is provided, the length to the end of the string is assumed.
      *
      * This three-argument form passes the start and length arguments through
      * with the "length not specified" flag set to false.
      *
      * NOTE(review): the declarator (listing line 3263) and the head of the
      * return expression (listing line 3265) are absent from this extract; the
      * surviving arguments match the bytea_substring() parameter list below,
      * which is presumably the callee -- confirm against upstream.
3261  */
3262 Datum
3264 {
3266  PG_GETARG_INT32(1),
3267  PG_GETARG_INT32(2),
3268  false));
3269 }
3270 
3271 /*
3272  * bytea_substr_no_len -
3273  * Wrapper to avoid opr_sanity failure due to
3274  * one function accepting a different number of args.
      *
      * This two-argument form passes -1 as the length together with the
      * "length not specified" flag set to true.
      *
      * NOTE(review): the declarator (listing line 3277) and the head of the
      * return expression (listing line 3279) are absent from this extract; the
      * surviving arguments match the bytea_substring() parameter list below,
      * which is presumably the callee -- confirm against upstream.
3275  */
3276 Datum
3278 {
3280  PG_GETARG_INT32(1),
3281  -1,
3282  true));
3283 }
3284 
/*
 * Workhorse for the bytea substring functions.
 *
 * S is the one-based start position and L the requested length; L is ignored
 * when length_not_specified is true.  'str' is handed to
 * DatumGetByteaPSlice(), which fetches only the requested slice.
 *
 * Returns the slice as a bytea.  Raises ERRCODE_SUBSTRING_ERROR when a length
 * was given and is negative.  If S + L overflows int32 the substring runs to
 * the end of the string; if the computed end position is below 1 an empty
 * bytea is returned.
 *
 * NOTE(review): the line carrying the function name and first parameter
 * (listing line 3286) is absent from this extract; bytea_overlay() calls it
 * as bytea_substring(PointerGetDatum(t1), ...), so the first parameter is
 * presumably "Datum str" -- confirm against upstream.
 */
3285 static bytea *
3287  int S,
3288  int L,
3289  bool length_not_specified)
3290 {
3291  int32 S1; /* adjusted start position */
3292  int32 L1; /* adjusted substring length */
3293  int32 E; /* end position */
3294 
3295  /*
3296  * The logic here should generally match text_substring().
3297  */
      /* A start before the first byte is clamped to position 1. */
3298  S1 = Max(S, 1);
3299 
3300  if (length_not_specified)
3301  {
3302  /*
3303  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3304  * end of the string if we pass it a negative value for length.
3305  */
3306  L1 = -1;
3307  }
3308  else if (L < 0)
3309  {
3310  /* SQL99 says to throw an error for E < S, i.e., negative length */
3311  ereport(ERROR,
3312  (errcode(ERRCODE_SUBSTRING_ERROR),
3313  errmsg("negative substring length not allowed")));
3314  L1 = -1; /* silence stupider compilers */
3315  }
      /* E is computed from the unclamped S, so a negative start eats into L. */
3316  else if (pg_add_s32_overflow(S, L, &E))
3317  {
3318  /*
3319  * L could be large enough for S + L to overflow, in which case the
3320  * substring must run to end of string.
3321  */
3322  L1 = -1;
3323  }
3324  else
3325  {
3326  /*
3327  * A zero or negative value for the end position can happen if the
3328  * start was negative or one. SQL99 says to return a zero-length
3329  * string.
3330  */
3331  if (E < 1)
3332  return PG_STR_GET_BYTEA("");
3333 
3334  L1 = E - S1;
3335  }
3336 
3337  /*
3338  * If the start position is past the end of the string, SQL99 says to
3339  * return a zero-length string -- DatumGetByteaPSlice() will do that for
3340  * us. We need only convert S1 to zero-based starting position.
3341  */
3342  return DatumGetByteaPSlice(str, S1 - 1, L1);
3343 }
3344 
3345 /*
3346  * byteaoverlay
3347  * Replace specified substring of first string with second
3348  *
3349  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3350  * This code is a direct implementation of what the standard says.
      *
      * Four-argument form: source bytea, replacement bytea, start position and
      * length of the span to replace.  All the work is done by bytea_overlay().
      *
      * NOTE(review): the declarator line (listing line 3353) is absent from
      * this extract; presumably byteaoverlay(PG_FUNCTION_ARGS) -- confirm
      * against upstream.
3351  */
3352 Datum
3354 {
3355  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3356  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3357  int sp = PG_GETARG_INT32(2); /* substring start position */
3358  int sl = PG_GETARG_INT32(3); /* substring length */
3359 
3360  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3361 }
3362 
/*
 * Three-argument OVERLAY() for bytea: the length of the span to replace is
 * not supplied and defaults to the payload length of the replacement (t2).
 * All the work is done by bytea_overlay().
 *
 * NOTE(review): the declarator line (listing line 3364) is absent from this
 * extract; presumably byteaoverlay_no_len(PG_FUNCTION_ARGS) -- confirm
 * against upstream.
 */
3363 Datum
3365 {
3366  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3367  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3368  int sp = PG_GETARG_INT32(2); /* substring start position */
3369  int sl;
3370 
3371  sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3372  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3373 }
3374 
3375 static bytea *
3376 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3377 {
3378  bytea *result;
3379  bytea *s1;
3380  bytea *s2;
3381  int sp_pl_sl;
3382 
3383  /*
3384  * Check for possible integer-overflow cases. For negative sp, throw a
3385  * "substring length" error because that's what should be expected
3386  * according to the spec's definition of OVERLAY().
3387  */
3388  if (sp <= 0)
3389  ereport(ERROR,
3390  (errcode(ERRCODE_SUBSTRING_ERROR),
3391  errmsg("negative substring length not allowed")));
3392  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3393  ereport(ERROR,
3394  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3395  errmsg("integer out of range")));
3396 
3397  s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3398  s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3399  result = bytea_catenate(s1, t2);
3400  result = bytea_catenate(result, s2);
3401 
3402  return result;
3403 }
3404 
3405 /*
3406  * bit_count
      *
      * Takes a single bytea argument.
      *
      * NOTE(review): the declarator (listing line 3409) and the return
      * statement (listing line 3413) are absent from this extract, so the
      * computation itself is not visible here; from the header it presumably
      * counts the set bits in the argument's payload -- confirm against
      * upstream.
3407  */
3408 Datum
3410 {
3411  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3412 
3414 }
3415 
3416 /*
3417  * byteapos -
3418  * Return the position of the specified substring.
3419  * Implements the SQL POSITION() function.
3420  * Cloned from textpos and modified as required.
      *
      * Argument 0 is the string searched, argument 1 the pattern.  Returns the
      * one-based byte offset of the first match, 1 for an empty pattern, and 0
      * when the pattern does not occur.
      *
      * NOTE(review): the declarator line (listing line 3423) is absent from
      * this extract; presumably byteapos(PG_FUNCTION_ARGS) -- confirm against
      * upstream.
3421  */
3422 Datum
3424 {
3425  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3426  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3427  int pos;
3428  int px,
3429  p;
3430  int len1,
3431  len2;
3432  char *p1,
3433  *p2;
3434 
3435  len1 = VARSIZE_ANY_EXHDR(t1);
3436  len2 = VARSIZE_ANY_EXHDR(t2);
3437 
3438  if (len2 <= 0)
3439  PG_RETURN_INT32(1); /* result for empty pattern */
3440 
3441  p1 = VARDATA_ANY(t1);
3442  p2 = VARDATA_ANY(t2);
3443 
3444  pos = 0;
      /*
       * px is the last zero-based offset at which the pattern can still fit.
       * It is negative when the pattern is longer than the string, in which
       * case the loop body never runs and 0 is returned.
       */
3445  px = (len1 - len2);
3446  for (p = 0; p <= px; p++)
3447  {
      /* Cheap first-byte test before the full memcmp(). */
3448  if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3449  {
3450  pos = p + 1;
3451  break;
3452  };
3453  p1++;
3454  };
3455 
3456  PG_RETURN_INT32(pos);
3457 }
3458 
3459 /*-------------------------------------------------------------
3460  * byteaGetByte
3461  *
3462  * this routine treats "bytea" as an array of bytes.
3463  * It returns the Nth byte (a number between 0 and 255).
3464  *-------------------------------------------------------------
3465  */
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 3467) is missing from this listing; restore it from the
 * upstream file.
 *
 * Argument 0 is the bytea value, argument 1 the zero-based byte index.
 * Raises ERRCODE_ARRAY_SUBSCRIPT_ERROR when the index is outside
 * 0 .. length-1; otherwise returns that byte as an integer.
 */
3466 Datum
3468 {
3469  bytea *v = PG_GETARG_BYTEA_PP(0);
3470  int32 n = PG_GETARG_INT32(1);
3471  int len;
3472  int byte;
3473 
3474  len = VARSIZE_ANY_EXHDR(v);
3475 
 /* for an empty value every index is rejected and the message shows 0..-1 */
3476  if (n < 0 || n >= len)
3477  ereport(ERROR,
3478  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3479  errmsg("index %d out of valid range, 0..%d",
3480  n, len - 1)));
3481 
 /* read through unsigned char so the result is never negative */
3482  byte = ((unsigned char *) VARDATA_ANY(v))[n];
3483 
3484  PG_RETURN_INT32(byte);
3485 }
3486 
3487 /*-------------------------------------------------------------
3488  * byteaGetBit
3489  *
3490  * This routine treats a "bytea" type like an array of bits.
3491  * It returns the value of the Nth bit (0 or 1).
3492  *
3493  *-------------------------------------------------------------
3494  */
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 3496) is missing from this listing; restore it from the
 * upstream file.
 *
 * Argument 0 is the bytea value, argument 1 the zero-based bit index as a
 * 64-bit integer.  Bit n lives in byte n / 8, at bit position n % 8 counted
 * from the least significant bit.  Raises ERRCODE_ARRAY_SUBSCRIPT_ERROR when
 * the index is outside 0 .. 8*length-1.
 */
3495 Datum
3497 {
3498  bytea *v = PG_GETARG_BYTEA_PP(0);
3499  int64 n = PG_GETARG_INT64(1);
3500  int byteNo,
3501  bitNo;
3502  int len;
3503  int byte;
3504 
3505  len = VARSIZE_ANY_EXHDR(v);
3506 
 /* len is widened to int64 before multiplying so that len * 8 cannot overflow int */
3507  if (n < 0 || n >= (int64) len * 8)
3508  ereport(ERROR,
3509  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3510  errmsg("index %lld out of valid range, 0..%lld",
3511  (long long) n, (long long) len * 8 - 1)));
3512 
3513  /* n/8 is now known < len, so safe to cast to int */
3514  byteNo = (int) (n / 8);
3515  bitNo = (int) (n % 8);
3516 
3517  byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3518 
 /* normalize the masked bit to exactly 0 or 1 */
3519  if (byte & (1 << bitNo))
3520  PG_RETURN_INT32(1);
3521  else
3522  PG_RETURN_INT32(0);
3523 }
3524 
3525 /*-------------------------------------------------------------
3526  * byteaSetByte
3527  *
3528  * Given an instance of type 'bytea' creates a new one with
3529  * the Nth byte set to the given value.
3530  *
3531  *-------------------------------------------------------------
3532  */
/*
 * NOTE(review): three lines are missing from this listing: the function name
 * and parameter list (listing line 3534), the declaration of "res" (listing
 * line 3536) and the final statement (listing line 3554).  "res" is used
 * below without a visible declaration; per the header comment it is
 * presumably a fresh copy of argument 0 that is modified and returned -
 * confirm against the upstream file.
 *
 * Argument 1 is the zero-based byte index, argument 2 the new byte value.
 */
3533 Datum
3535 {
3537  int32 n = PG_GETARG_INT32(1);
3538  int32 newByte = PG_GETARG_INT32(2);
3539  int len;
3540 
 /* data length of res: total size minus the varlena header */
3541  len = VARSIZE(res) - VARHDRSZ;
3542 
3543  if (n < 0 || n >= len)
3544  ereport(ERROR,
3545  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3546  errmsg("index %d out of valid range, 0..%d",
3547  n, len - 1)));
3548 
3549  /*
3550  * Now set the byte.
3551  */
 /*
  * newByte is not range-checked; the store converts it to unsigned char, so
  * a value outside the unsigned char range is reduced to fit.
  */
3552  ((unsigned char *) VARDATA(res))[n] = newByte;
3553 
3555 }
3556 
3557 /*-------------------------------------------------------------
3558  * byteaSetBit
3559  *
3560  * Given an instance of type 'bytea' creates a new one with
3561  * the Nth bit set to the given value.
3562  *
3563  *-------------------------------------------------------------
3564  */
/*
 * NOTE(review): three lines are missing from this listing: the function name
 * and parameter list (listing line 3566), the declaration of "res" (listing
 * line 3568) and the final statement (listing line 3609).  "res" is used
 * below without a visible declaration; per the header comment it is
 * presumably a fresh copy of argument 0 that is modified and returned -
 * confirm against the upstream file.
 *
 * Argument 1 is the zero-based bit index (64-bit), argument 2 the new bit
 * value, which must be 0 or 1.  Bit n lives in byte n / 8 at bit position
 * n % 8 counted from the least significant bit.
 */
3565 Datum
3567 {
3569  int64 n = PG_GETARG_INT64(1);
3570  int32 newBit = PG_GETARG_INT32(2);
3571  int len;
3572  int oldByte,
3573  newByte;
3574  int byteNo,
3575  bitNo;
3576 
 /* data length of res: total size minus the varlena header */
3577  len = VARSIZE(res) - VARHDRSZ;
3578 
 /* len is widened to int64 before multiplying so that len * 8 cannot overflow int */
3579  if (n < 0 || n >= (int64) len * 8)
3580  ereport(ERROR,
3581  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3582  errmsg("index %lld out of valid range, 0..%lld",
3583  (long long) n, (long long) len * 8 - 1)));
3584 
3585  /* n/8 is now known < len, so safe to cast to int */
3586  byteNo = (int) (n / 8);
3587  bitNo = (int) (n % 8);
3588 
3589  /*
3590  * sanity check!
3591  */
 /* the index is validated before the bit value, so a bad index is reported first */
3592  if (newBit != 0 && newBit != 1)
3593  ereport(ERROR,
3594  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3595  errmsg("new bit must be 0 or 1")));
3596 
3597  /*
3598  * Update the byte.
3599  */
3600  oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3601 
 /* clear the target bit with an inverted mask, or set it with a plain mask */
3602  if (newBit == 0)
3603  newByte = oldByte & (~(1 << bitNo));
3604  else
3605  newByte = oldByte | (1 << bitNo);
3606 
3607  ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3608 
3610 }
3611 
3612 
3613 /* text_name()
3614  * Converts a text type to a Name type.
3615  */
/*
 * NOTE(review): two lines are missing from this listing: the function name
 * and parameter list (listing line 3617) and the statement controlled by the
 * "if (len >= NAMEDATALEN)" test (listing line 3627).  As shown, that "if"
 * would wrongly govern the palloc0() assignment; restore the missing
 * truncation statement from the upstream file.
 *
 * Copies the bytes of the text argument into a newly allocated,
 * zero-initialized NAMEDATALEN-byte Name.
 */
3616 Datum
3618 {
3619  text *s = PG_GETARG_TEXT_PP(0);
3620  Name result;
3621  int len;
3622 
3623  len = VARSIZE_ANY_EXHDR(s);
3624 
3625  /* Truncate oversize input */
3626  if (len >= NAMEDATALEN)
3628 
3629  /* We use palloc0 here to ensure result is zero-padded */
3630  result = (Name) palloc0(NAMEDATALEN);
 /* len must be below NAMEDATALEN here so that a terminating zero byte remains */
3631  memcpy(NameStr(*result), VARDATA_ANY(s), len);
3632 
3633  PG_RETURN_NAME(result);
3634 }
3635 
3636 /* name_text()
3637  * Converts a Name type to a text type.
3638  */
/*
 * NOTE(review): two lines are missing from this listing: the function name
 * and parameter list (listing line 3640) and the only statement after the
 * declaration (listing line 3644), which presumably builds and returns the
 * text result.  Restore both from the upstream file.
 */
3639 Datum
3641 {
 /* argument 0 is the Name value to convert */
3642  Name s = PG_GETARG_NAME(0);
3643 
3645 }
3646 
3647 
3648 /*
3649  * textToQualifiedNameList - convert a text object to list of names
3650  *
3651  * This implements the input parsing needed by nextval() and other
3652  * functions that take a text parameter representing a qualified name.
3653  * We split the name at dots, downcase if not double-quoted, and
3654  * truncate names if they're too long.
3655  */
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 3657) is missing from this listing; the body refers to a
 * parameter named "textval".  Restore the line from the upstream file.
 *
 * Returns a list of String nodes, one per dot-separated name in textval.
 * Raises ERRCODE_INVALID_NAME if the text cannot be split or contains no
 * names at all.
 */
3656 List *
3658 {
3659  char *rawname;
3660  List *result = NIL;
3661  List *namelist;
3662  ListCell *l;
3663 
3664  /* Convert to C string (handles possible detoasting). */
3665  /* Note we rely on being able to modify rawname below. */
3666  rawname = text_to_cstring(textval);
3667 
3668  if (!SplitIdentifierString(rawname, '.', &namelist))
3669  ereport(ERROR,
3670  (errcode(ERRCODE_INVALID_NAME),
3671  errmsg("invalid name syntax")));
3672 
 /* SplitIdentifierString accepts an empty string, but this function does not */
3673  if (namelist == NIL)
3674  ereport(ERROR,
3675  (errcode(ERRCODE_INVALID_NAME),
3676  errmsg("invalid name syntax")));
3677 
3678  foreach(l, namelist)
3679  {
3680  char *curname = (char *) lfirst(l);
3681 
 /* copy each name: the namelist entries point into rawname, which is freed below */
3682  result = lappend(result, makeString(pstrdup(curname)));
3683  }
3684 
3685  pfree(rawname);
3686  list_free(namelist);
3687 
3688  return result;
3689 }
3690 
3691 /*
3692  * SplitIdentifierString --- parse a string containing identifiers
3693  *
3694  * This is the guts of textToQualifiedNameList, and is exported for use in
3695  * other situations such as parsing GUC variables. In the GUC case, it's
3696  * important to avoid memory leaks, so the API is designed to minimize the
3697  * amount of stuff that needs to be allocated and freed.
3698  *
3699  * Inputs:
3700  * rawstring: the input string; must be overwritable! On return, it's
3701  * been modified to contain the separated identifiers.
3702  * separator: the separator punctuation expected between identifiers
3703  * (typically '.' or ','). Whitespace may also appear around
3704  * identifiers.
3705  * Outputs:
3706  * namelist: filled with a palloc'd list of pointers to identifiers within
3707  * rawstring. Caller should list_free() this even on error return.
3708  *
3709  * Returns true if okay, false if there is a syntax error in the string.
3710  *
3711  * Note that an empty string is considered okay here, though not in
3712  * textToQualifiedNameList.
3713  */
/*
 * Splits rawstring, in place, into identifiers delimited by 'separator' and
 * appends a pointer to each one (pointing into rawstring) to *namelist.
 * Unquoted names are downcased; double-quoted names keep their case and have
 * doubled quotes collapsed.  Every name is passed through
 * truncate_identifier().  Returns false on a syntax error; returns true
 * otherwise, including for an empty or all-whitespace string, which yields
 * an empty list.
 */
3714 bool
3715 SplitIdentifierString(char *rawstring, char separator,
3716  List **namelist)
3717 {
3718  char *nextp = rawstring;
3719  bool done = false;
3720 
 /* set the output before any return so it is defined even on failure */
3721  *namelist = NIL;
3722 
3723  while (scanner_isspace(*nextp))
3724  nextp++; /* skip leading whitespace */
3725 
3726  if (*nextp == '\0')
3727  return true; /* allow empty string */
3728 
3729  /* At the top of the loop, we are at start of a new identifier. */
3730  do
3731  {
3732  char *curname;
3733  char *endp;
3734 
3735  if (*nextp == '"')
3736  {
3737  /* Quoted name --- collapse quote-quote pairs, no downcasing */
3738  curname = nextp + 1;
3739  for (;;)
3740  {
3741  endp = strchr(nextp + 1, '"');
3742  if (endp == NULL)
3743  return false; /* mismatched quotes */
3744  if (endp[1] != '"')
3745  break; /* found end of quoted name */
3746  /* Collapse adjacent quotes into one quote, and look again */
 /*
  * strlen(endp) equals the number of bytes from endp + 1 through the
  * terminating NUL, so this shifts the rest of the string, NUL included,
  * left by one byte.
  */
3747  memmove(endp, endp + 1, strlen(endp));
 /* endp now holds the surviving quote; the next strchr starts just past it */
3748  nextp = endp;
3749  }
3750  /* endp now points at the terminating quote */
3751  nextp = endp + 1;
3752  }
3753  else
3754  {
3755  /* Unquoted name --- extends to separator or whitespace */
3756  char *downname;
3757  int len;
3758 
3759  curname = nextp;
3760  while (*nextp && *nextp != separator &&
3761  !scanner_isspace(*nextp))
3762  nextp++;
3763  endp = nextp;
3764  if (curname == nextp)
3765  return false; /* empty unquoted name not allowed */
3766 
3767  /*
3768  * Downcase the identifier, using same code as main lexer does.
3769  *
3770  * XXX because we want to overwrite the input in-place, we cannot
3771  * support a downcasing transformation that increases the string
3772  * length. This is not a problem given the current implementation
3773  * of downcase_truncate_identifier, but we'll probably have to do
3774  * something about this someday.
3775  */
3776  len = endp - curname;
3777  downname = downcase_truncate_identifier(curname, len, false);
3778  Assert(strlen(downname) <= len);
 /*
  * strncpy writes exactly len bytes: it adds no terminator when downname is
  * len bytes long (so the character at endp is left intact) and pads with
  * NULs when downname is shorter.
  */
3779  strncpy(curname, downname, len); /* strncpy is required here */
3780  pfree(downname);
3781  }
3782 
3783  while (scanner_isspace(*nextp))
3784  nextp++; /* skip trailing whitespace */
3785 
3786  if (*nextp == separator)
3787  {
3788  nextp++;
3789  while (scanner_isspace(*nextp))
3790  nextp++; /* skip leading whitespace for next */
3791  /* we expect another name, so done remains false */
3792  }
3793  else if (*nextp == '\0')
3794  done = true;
3795  else
3796  return false; /* invalid syntax */
3797 
 /*
  * endp may be the separator position itself, so the terminator is written
  * only after the character there has been examined above.
  */
3798  /* Now safe to overwrite separator with a null */
3799  *endp = '\0';
3800 
3801  /* Truncate name if it's overlength */
3802  truncate_identifier(curname, strlen(curname), false);
3803 
3804  /*
3805  * Finished isolating current name --- add it to list
3806  */
3807  *namelist = lappend(*namelist, curname);
3808 
3809  /* Loop back if we didn't reach end of string */
3810  } while (!done);
3811 
3812  return true;
3813 }
3814 
3815 
3816 /*
3817  * SplitDirectoriesString --- parse a string containing file/directory names
3818  *
3819  * This works fine on file names too; the function name is historical.
3820  *
3821  * This is similar to SplitIdentifierString, except that the parsing
3822  * rules are meant to handle pathnames instead of identifiers: there is
3823  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3824  * and we apply canonicalize_path() to each extracted string. Because of the
3825  * last, the returned strings are separately palloc'd rather than being
3826  * pointers into rawstring --- but we still scribble on rawstring.
3827  *
3828  * Inputs:
3829  * rawstring: the input string; must be modifiable!
3830  * separator: the separator punctuation expected between directories
3831  * (typically ',' or ';'). Whitespace may also appear around
3832  * directories.
3833  * Outputs:
3834  * namelist: filled with a palloc'd list of directory names.
3835  * Caller should list_free_deep() this even on error return.
3836  *
3837  * Returns true if okay, false if there is a syntax error in the string.
3838  *
3839  * Note that an empty string is considered okay here.
3840  */
/*
 * Splits rawstring into path names delimited by 'separator'.  Each name is
 * cut to at most MAXPGPATH - 1 bytes, duplicated with pstrdup(), passed to
 * canonicalize_path(), and appended to *namelist.  rawstring is modified
 * along the way.  Returns false on a syntax error; returns true otherwise,
 * including for an empty or all-whitespace string, which yields an empty
 * list.
 */
3841 bool
3842 SplitDirectoriesString(char *rawstring, char separator,
3843  List **namelist)
3844 {
3845  char *nextp = rawstring;
3846  bool done = false;
3847 
 /* set the output before any return so it is defined even on failure */
3848  *namelist = NIL;
3849 
3850  while (scanner_isspace(*nextp))
3851  nextp++; /* skip leading whitespace */
3852 
3853  if (*nextp == '\0')
3854  return true; /* allow empty string */
3855 
3856  /* At the top of the loop, we are at start of a new directory. */
3857  do
3858  {
3859  char *curname;
3860  char *endp;
3861 
3862  if (*nextp == '"')
3863  {
3864  /* Quoted name --- collapse quote-quote pairs */
3865  curname = nextp + 1;
3866  for (;;)
3867  {
3868  endp = strchr(nextp + 1, '"');
3869  if (endp == NULL)
3870  return false; /* mismatched quotes */
3871  if (endp[1] != '"')
3872  break; /* found end of quoted name */
3873  /* Collapse adjacent quotes into one quote, and look again */
 /*
  * strlen(endp) equals the number of bytes from endp + 1 through the
  * terminating NUL, so this shifts the rest of the string, NUL included,
  * left by one byte.
  */
3874  memmove(endp, endp + 1, strlen(endp));
 /* endp now holds the surviving quote; the next strchr starts just past it */
3875  nextp = endp;
3876  }
3877  /* endp now points at the terminating quote */
3878  nextp = endp + 1;
3879  }
3880  else
3881  {
3882  /* Unquoted name --- extends to separator or end of string */
 /*
  * nextp scans ahead to the separator or NUL, while endp trails behind and
  * is only moved to just past the most recent non-whitespace character.
  * Embedded spaces are thus kept and trailing spaces dropped.
  */
3883  curname = endp = nextp;
3884  while (*nextp && *nextp != separator)
3885  {
3886  /* trailing whitespace should not be included in name */
3887  if (!scanner_isspace(*nextp))
3888  endp = nextp + 1;
3889  nextp++;
3890  }
3891  if (curname == endp)
3892  return false; /* empty unquoted name not allowed */
3893  }
3894 
3895  while (scanner_isspace(*nextp))
3896  nextp++; /* skip trailing whitespace */
3897 
3898  if (*nextp == separator)
3899  {
3900  nextp++;
3901  while (scanner_isspace(*nextp))
3902  nextp++; /* skip leading whitespace for next */
3903  /* we expect another name, so done remains false */
3904  }
3905  else if (*nextp == '\0')
3906  done = true;
3907  else
3908  return false; /* invalid syntax */
3909 
 /*
  * endp may be the separator position itself, so the terminator is written
  * only after the character there has been examined above.
  */
3910  /* Now safe to overwrite separator with a null */
3911  *endp = '\0';
3912 
3913  /* Truncate path if it's overlength */
3914  if (strlen(curname) >= MAXPGPATH)
3915  curname[MAXPGPATH - 1] = '\0';
3916 
3917  /*
3918  * Finished isolating current name --- add it to list
3919  */
 /* the list receives the pstrdup'd copy, not a pointer into rawstring */
3920  curname = pstrdup(curname);
3921  canonicalize_path(curname);
3922  *namelist = lappend(*namelist, curname);
3923 
3924  /* Loop back if we didn't reach end of string */
3925  } while (!done);
3926 
3927  return true;
3928 }
3929 
3930 
3931 /*
3932  * SplitGUCList --- parse a string containing identifiers or file names
3933  *
3934  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3935  * presuming whether the elements will be taken as identifiers or file names.
3936  * We assume the input has already been through flatten_set_variable_args(),
3937  * so that we need never downcase (if appropriate, that was done already).
3938  * Nor do we ever truncate, since we don't know the correct max length.
3939  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3940  * because any embedded whitespace should have led to double-quoting).
3941  * Otherwise the API is identical to SplitIdentifierString.
3942  *
3943  * XXX it's annoying to have so many copies of this string-splitting logic.
3944  * However, it's not clear that having one function with a bunch of option
3945  * flags would be much better.
3946  *
3947  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3948  * Be sure to update that if you have to change this.
3949  *
3950  * Inputs:
3951  * rawstring: the input string; must be overwritable! On return, it's
3952  * been modified to contain the separated identifiers.
3953  * separator: the separator punctuation expected between identifiers
3954  * (typically '.' or ','). Whitespace may also appear around
3955  * identifiers.
3956  * Outputs:
3957  * namelist: filled with a palloc'd list of pointers to identifiers within
3958  * rawstring. Caller should list_free() this even on error return.
3959  *
3960  * Returns true if okay, false if there is a syntax error in the string.
3961  */
/*
 * Splits rawstring, in place, into elements delimited by 'separator' and
 * appends a pointer to each one (pointing into rawstring) to *namelist.
 * Double-quoted elements have doubled quotes collapsed; no downcasing or
 * truncation is applied.  Returns false on a syntax error; returns true
 * otherwise, including for an empty or all-whitespace string, which yields
 * an empty list.
 */
3962 bool
3963 SplitGUCList(char *rawstring, char separator,
3964  List **namelist)
3965 {
3966  char *nextp = rawstring;
3967  bool done = false;
3968 
 /* set the output before any return so it is defined even on failure */
3969  *namelist = NIL;
3970 
3971  while (scanner_isspace(*nextp))
3972  nextp++; /* skip leading whitespace */
3973 
3974  if (*nextp == '\0')
3975  return true; /* allow empty string */
3976 
3977  /* At the top of the loop, we are at start of a new identifier. */
3978  do
3979  {
3980  char *curname;
3981  char *endp;
3982 
3983  if (*nextp == '"')
3984  {
3985  /* Quoted name --- collapse quote-quote pairs */
3986  curname = nextp + 1;
3987  for (;;)
3988  {
3989  endp = strchr(nextp + 1, '"');
3990  if (endp == NULL)
3991  return false; /* mismatched quotes */
3992  if (endp[1] != '"')
3993  break; /* found end of quoted name */
3994  /* Collapse adjacent quotes into one quote, and look again */
 /*
  * strlen(endp) equals the number of bytes from endp + 1 through the
  * terminating NUL, so this shifts the rest of the string, NUL included,
  * left by one byte.
  */
3995  memmove(endp, endp + 1, strlen(endp));
 /* endp now holds the surviving quote; the next strchr starts just past it */
3996  nextp = endp;
3997  }
3998  /* endp now points at the terminating quote */
3999  nextp = endp + 1;
4000  }
4001  else
4002  {
4003  /* Unquoted name --- extends to separator or whitespace */
4004  curname = nextp;
4005  while (*nextp && *nextp != separator &&
4006  !scanner_isspace(*nextp))
4007  nextp++;
4008  endp = nextp;
4009  if (curname == nextp)
4010  return false; /* empty unquoted name not allowed */
4011  }
4012 
4013  while (scanner_isspace(*nextp))
4014  nextp++; /* skip trailing whitespace */
4015 
4016  if (*nextp == separator)
4017  {
4018  nextp++;
4019  while (scanner_isspace(*nextp))
4020  nextp++; /* skip leading whitespace for next */
4021  /* we expect another name, so done remains false */
4022  }
4023  else if (*nextp == '\0')
4024  done = true;
4025  else
4026  return false; /* invalid syntax */
4027 
 /*
  * endp may be the separator position itself, so the terminator is written
  * only after the character there has been examined above.
  */
4028  /* Now safe to overwrite separator with a null */
4029  *endp = '\0';
4030 
4031  /*
4032  * Finished isolating current name --- add it to list
4033  */
4034  *namelist = lappend(*namelist, curname);
4035 
4036  /* Loop back if we didn't reach end of string */
4037  } while (!done);
4038 
4039  return true;
4040 }
4041 
4042 
4043 /*****************************************************************************
4044  * Comparison Functions used for bytea
4045  *
4046  * Note: btree indexes need these routines not to leak memory; therefore,
4047  * be careful to free working copies of toasted datums. Most places don't
4048  * need to be so careful.
4049  *****************************************************************************/
4050 
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 4052) is missing from this listing; restore it from the
 * upstream file.  From the logic below this is the bytea equality test.
 *
 * Returns true when both arguments have the same size and identical bytes.
 */
4051 Datum
4053 {
 /* take raw Datums so that nothing is detoasted before the size comparison */
4054  Datum arg1 = PG_GETARG_DATUM(0);
4055  Datum arg2 = PG_GETARG_DATUM(1);
4056  bool result;
4057  Size len1,
4058  len2;
4059 
4060  /*
4061  * We can use a fast path for unequal lengths, which might save us from
4062  * having to detoast one or both values.
4063  */
4064  len1 = toast_raw_datum_size(arg1);
4065  len2 = toast_raw_datum_size(arg2);
4066  if (len1 != len2)
4067  result = false;
4068  else
4069  {
4070  bytea *barg1 = DatumGetByteaPP(arg1);
4071  bytea *barg2 = DatumGetByteaPP(arg2);
4072 
 /*
  * VARHDRSZ is subtracted from the raw size to get the number of data bytes
  * to compare - presumably the raw size includes the varlena header; confirm
  * against toast_raw_datum_size().
  */
4073  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4074  len1 - VARHDRSZ) == 0);
4075 
 /* release working copies, as the section comment above requires */
4076  PG_FREE_IF_COPY(barg1, 0);
4077  PG_FREE_IF_COPY(barg2, 1);
4078  }
4079 
4080  PG_RETURN_BOOL(result);
4081 }
4082 
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 4084) is missing from this listing; restore it from the
 * upstream file.  From the logic below this is the bytea inequality test.
 *
 * Returns true when the arguments differ in size or in any byte.
 */
4083 Datum
4085 {
 /* take raw Datums so that nothing is detoasted before the size comparison */
4086  Datum arg1 = PG_GETARG_DATUM(0);
4087  Datum arg2 = PG_GETARG_DATUM(1);
4088  bool result;
4089  Size len1,
4090  len2;
4091 
4092  /*
4093  * We can use a fast path for unequal lengths, which might save us from
4094  * having to detoast one or both values.
4095  */
4096  len1 = toast_raw_datum_size(arg1);
4097  len2 = toast_raw_datum_size(arg2);
4098  if (len1 != len2)
4099  result = true;
4100  else
4101  {
4102  bytea *barg1 = DatumGetByteaPP(arg1);
4103  bytea *barg2 = DatumGetByteaPP(arg2);
4104 
 /*
  * VARHDRSZ is subtracted from the raw size to get the number of data bytes
  * to compare - presumably the raw size includes the varlena header; confirm
  * against toast_raw_datum_size().
  */
4105  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4106  len1 - VARHDRSZ) != 0);
4107 
 /* release working copies, as the section comment above requires */
4108  PG_FREE_IF_COPY(barg1, 0);
4109  PG_FREE_IF_COPY(barg2, 1);
4110  }
4111 
4112  PG_RETURN_BOOL(result);
4113 }
4114 
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 4116) is missing from this listing; restore it from the
 * upstream file.  From the logic below this is the bytea "less than" test.
 *
 * Returns true when arg1 sorts before arg2: bytes are compared over the
 * shorter length, and on a tie the shorter value sorts first.
 */
4115 Datum
4117 {
4118  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4119  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4120  int len1,
4121  len2;
4122  int cmp;
4123 
4124  len1 = VARSIZE_ANY_EXHDR(arg1);
4125  len2 = VARSIZE_ANY_EXHDR(arg2);
4126 
 /* compare only the bytes both values have */
4127  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4128 
 /* release working copies; len1, len2 and cmp are plain ints and stay valid */
4129  PG_FREE_IF_COPY(arg1, 0);
4130  PG_FREE_IF_COPY(arg2, 1);
4131 
4132  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4133 }
4134 
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 4136) is missing from this listing; restore it from the
 * upstream file.  From the logic below this is the bytea "less than or
 * equal" test.
 *
 * Returns true when arg1 sorts before or equal to arg2: bytes are compared
 * over the shorter length, and on a tie the shorter value sorts first.
 */
4135 Datum
4137 {
4138  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4139  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4140  int len1,
4141  len2;
4142  int cmp;
4143 
4144  len1 = VARSIZE_ANY_EXHDR(arg1);
4145  len2 = VARSIZE_ANY_EXHDR(arg2);
4146 
 /* compare only the bytes both values have */
4147  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4148 
 /* release working copies; len1, len2 and cmp are plain ints and stay valid */
4149  PG_FREE_IF_COPY(arg1, 0);
4150  PG_FREE_IF_COPY(arg2, 1);
4151 
4152  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4153 }
4154 
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 4156) is missing from this listing; restore it from the
 * upstream file.  From the logic below this is the bytea "greater than"
 * test.
 *
 * Returns true when arg1 sorts after arg2: bytes are compared over the
 * shorter length, and on a tie the longer value sorts last.
 */
4155 Datum
4157 {
4158  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4159  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4160  int len1,
4161  len2;
4162  int cmp;
4163 
4164  len1 = VARSIZE_ANY_EXHDR(arg1);
4165  len2 = VARSIZE_ANY_EXHDR(arg2);
4166 
 /* compare only the bytes both values have */
4167  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4168 
 /* release working copies; len1, len2 and cmp are plain ints and stay valid */
4169  PG_FREE_IF_COPY(arg1, 0);
4170  PG_FREE_IF_COPY(arg2, 1);
4171 
4172  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4173 }
4174 
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 4176) is missing from this listing; restore it from the
 * upstream file.  From the logic below this is the bytea "greater than or
 * equal" test.
 *
 * Returns true when arg1 sorts after or equal to arg2: bytes are compared
 * over the shorter length, and on a tie the longer value sorts last.
 */
4175 Datum
4177 {
4178  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4179  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4180  int len1,
4181  len2;
4182  int cmp;
4183 
4184  len1 = VARSIZE_ANY_EXHDR(arg1);
4185  len2 = VARSIZE_ANY_EXHDR(arg2);
4186 
 /* compare only the bytes both values have */
4187  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4188 
 /* release working copies; len1, len2 and cmp are plain ints and stay valid */
4189  PG_FREE_IF_COPY(arg1, 0);
4190  PG_FREE_IF_COPY(arg2, 1);
4191 
4192  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4193 }
4194 
/*
 * NOTE(review): two lines are missing from this listing: the function name
 * and parameter list (listing line 4196) and the final statement (listing
 * line 4214), which presumably returns cmp.  Restore both from the upstream
 * file.
 *
 * Computes a three-way comparison of the two bytea arguments in cmp: the
 * memcmp() result over the shorter length, or -1 / 1 by length when those
 * bytes are all equal but the lengths differ.
 */
4195 Datum
4197 {
4198  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4199  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4200  int len1,
4201  len2;
4202  int cmp;
4203 
4204  len1 = VARSIZE_ANY_EXHDR(arg1);
4205  len2 = VARSIZE_ANY_EXHDR(arg2);
4206 
 /* compare only the bytes both values have */
4207  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
 /* equal common prefix: the shorter value sorts first */
4208  if ((cmp == 0) && (len1 != len2))
4209  cmp = (len1 < len2) ? -1 : 1;
4210 
 /* release working copies; len1, len2 and cmp are plain ints and stay valid */
4211  PG_FREE_IF_COPY(arg1, 0);
4212  PG_FREE_IF_COPY(arg2, 1);
4213 
4215 }
4216 
/*
 * NOTE(review): two lines are missing from this listing: the function name
 * and parameter list (listing line 4218) and the declaration of "ssup"
 * (listing line 4220), which is used below without a visible declaration.
 * Restore both from the upstream file.
 *
 * Sets up sort support for bytea by delegating to varstr_sortsupport() with
 * the "C" collation.
 */
4217 Datum
4219 {
4221  MemoryContext oldcontext;
4222 
 /* run the setup inside ssup->ssup_cxt, then switch back to the caller's context */
4223  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4224 
4225  /* Use generic string SortSupport, forcing "C" collation */
4226  varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4227 
4228  MemoryContextSwitchTo(oldcontext);
4229 
4230  PG_RETURN_VOID();
4231 }
4232 
4233 /*
4234  * appendStringInfoText
4235  *
4236  * Append a text to str.
4237  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4238  */
/*
 * NOTE(review): this listing has lost both the line carrying the function
 * name and parameter list (listing line 4240) and the only statement of the
 * body (listing line 4242).  Restore both from the upstream file; as shown,
 * the function has no name and does nothing.
 */
4239 static void
4241 {
4243 }
4244 
4245 /*
4246  * replace_text
4247  * replace all occurrences of 'old_sub_str' in 'orig_str'
4248  * with 'new_sub_str' to form 'new_str'
4249  *
4250  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4251  * otherwise returns 'new_str'
4252  */
/*
 * NOTE(review): several lines are missing from this listing: the function
 * name and parameter list (4254), two declarations (4261 and 4266 -
 * presumably those of "state" and "str", which are used below without a
 * visible declaration), and one statement each at 4285, 4295 and 4315.
 * Restore them from the upstream file.
 *
 * Argument 0 is the source text, argument 1 the substring to look for and
 * argument 2 its replacement.  The source is returned unchanged when it or
 * the search substring is empty, or when the substring does not occur.
 */
4253 Datum
4255 {
4256  text *src_text = PG_GETARG_TEXT_PP(0);
4257  text *from_sub_text = PG_GETARG_TEXT_PP(1);
4258  text *to_sub_text = PG_GETARG_TEXT_PP(2);
4259  int src_text_len;
4260  int from_sub_text_len;
4262  text *ret_text;
4263  int chunk_len;
4264  char *curr_ptr;
4265  char *start_ptr;
4267  bool found;
4268 
4269  src_text_len = VARSIZE_ANY_EXHDR(src_text);
4270  from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4271 
4272  /* Return unmodified source string if empty source or pattern */
4273  if (src_text_len < 1 || from_sub_text_len < 1)
4274  {
4275  PG_RETURN_TEXT_P(src_text);
4276  }
4277 
4278  text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4279 
4280  found = text_position_next(&state);
4281 
4282  /* When the from_sub_text is not found, there is nothing to do. */
4283  if (!found)
4284  {
4286  PG_RETURN_TEXT_P(src_text);
4287  }
 /* curr_ptr: start of the current match; start_ptr: first source byte not yet copied */
4288  curr_ptr = text_position_get_match_ptr(&state);
4289  start_ptr = VARDATA_ANY(src_text);
4290 
4291  initStringInfo(&str);
4292 
 /* a do-while is used because the first match has already been found above */
4293  do
4294  {
4296 
4297  /* copy the data skipped over by last text_position_next() */
4298  chunk_len = curr_ptr - start_ptr;
4299  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4300 
 /* emit the replacement in place of the matched text */
4301  appendStringInfoText(&str, to_sub_text);
4302 
 /* skip over the matched substring in the source */
4303  start_ptr = curr_ptr + from_sub_text_len;
4304 
4305  found = text_position_next(&state);
4306  if (found)
4307  curr_ptr = text_position_get_match_ptr(&state);
4308  }
4309  while (found);
4310 
4311  /* copy trailing data */
 /* the end of the source is its start address plus its total size */
4312  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4313  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4314 
4316 
4317  ret_text = cstring_to_text_with_len(str.data, str.len);
4318  pfree(str.data);
4319 
4320  PG_RETURN_TEXT_P(ret_text);
4321 }
4322 
4323 /*
4324  * check_replace_text_has_escape
4325  *
4326  * Returns 0 if text contains no backslashes that need processing.
4327  * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4328  * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4329  */
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 4331) is missing from this listing; the body refers to a
 * parameter named "replace_text".  Restore the line from the upstream file.
 *
 * Scans replace_text for backslashes.  Returns 2 as soon as a backslash
 * followed by a digit 1-9 is seen, 1 if some backslash is followed by any
 * other character, and 0 otherwise.  A lone backslash at the very end does
 * not by itself change the result.
 */
4330 static int
4332 {
4333  int result = 0;
4334  const char *p = VARDATA_ANY(replace_text);
4335  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4336 
4337  while (p < p_end)
4338  {
4339  /* Find next escape char, if any. */
4340  p = memchr(p, '\\', p_end - p);
4341  if (p == NULL)
4342  break;
 /* step past the backslash to look at the character it escapes */
4343  p++;
4344  /* Note: a backslash at the end doesn't require extra processing. */
4345  if (p < p_end)
4346  {
4347  if (*p >= '1' && *p <= '9')
4348  return 2; /* Found a submatch specifier, so done */
4349  result = 1; /* Found some other sequence, keep looking */
 /* skip the escaped character too, so the second backslash of a pair is not re-examined */
4350  p++;
4351  }
4352  }
4353  return result;
4354 }
4355 
4356 /*
4357  * appendStringInfoRegexpSubstr
4358  *
4359  * Append replace_text to str, substituting regexp back references for
4360  * \n escapes. start_ptr is the start of the match in the source string,
4361  * at logical character position data_pos.
4362  */
/*
 * NOTE(review): the line carrying the function name and the first parameters
 * (listing line 4364) is missing from this listing; the body refers to
 * parameters named "str" and "replace_text" that are declared there.
 * Restore the line from the upstream file.
 *
 * Escapes handled below: \1 .. \9 insert the corresponding submatch, \&
 * inserts the whole match, \\ inserts one backslash.  A backslash followed
 * by anything else, or at the very end, is copied literally.
 */
4363 static void
4365  regmatch_t *pmatch,
4366  char *start_ptr, int data_pos)
4367 {
4368  const char *p = VARDATA_ANY(replace_text);
4369  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4370 
4371  while (p < p_end)
4372  {
4373  const char *chunk_start = p;
4374  int so;
4375  int eo;
4376 
4377  /* Find next escape char, if any. */
4378  p = memchr(p, '\\', p_end - p);
4379  if (p == NULL)
4380  p = p_end;
4381 
4382  /* Copy the text we just scanned over, if any. */
4383  if (p > chunk_start)
4384  appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4385 
4386  /* Done if at end of string, else advance over escape char. */
4387  if (p >= p_end)
4388  break;
4389  p++;
4390 
4391  if (p >= p_end)
4392  {
4393  /* Escape at very end of input. Treat same as unexpected char */
4394  appendStringInfoChar(str, '\\');
4395  break;
4396  }
4397 
4398  if (*p >= '1' && *p <= '9')
4399  {
4400  /* Use the back reference of regexp. */
4401  int idx = *p - '0';
4402 
4403  so = pmatch[idx].rm_so;
4404  eo = pmatch[idx].rm_eo;
4405  p++;
4406  }
4407  else if (*p == '&')
4408  {
4409  /* Use the entire matched string. */
4410  so = pmatch[0].rm_so;
4411  eo = pmatch[0].rm_eo;
4412  p++;
4413  }
4414  else if (*p == '\\')
4415  {
4416  /* \\ means transfer one \ to output. */
4417  appendStringInfoChar(str, '\\');
4418  p++;
4419  continue;
4420  }
4421  else
4422  {
4423  /*
4424  * If escape char is not followed by any expected char, just treat
4425  * it as ordinary data to copy. (XXX would it be better to throw
4426  * an error?)
4427  */
 /* p is not advanced here, so the following character is copied by the next iteration */
4428  appendStringInfoChar(str, '\\');
4429  continue;
4430  }
4431 
 /* nothing is appended when either offset is negative */
4432  if (so >= 0 && eo >= 0)
4433  {
4434  /*
4435  * Copy the text that is back reference of regexp. Note so and eo
4436  * are counted in characters not bytes.
4437  */
4438  char *chunk_start;
4439  int chunk_len;
4440 
4441  Assert(so >= data_pos);
 /* start_ptr corresponds to character data_pos, so advance by (so - data_pos) characters */
4442  chunk_start = start_ptr;
4443  chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4444  chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4445  appendBinaryStringInfo(str, chunk_start, chunk_len);
4446  }
4447  }
4448 }
4449 
4450 /*
4451  * replace_text_regexp
4452  *
4453  * replace substring(s) in src_text that match pattern with replace_text.
4454  * The replace_text can contain backslash markers to substitute
4455  * (parts of) the matched text.
4456  *
4457  * cflags: regexp compile flags.
4458  * collation: collation to use.
4459  * search_start: the character (not byte) offset in src_text at which to
4460  * begin searching.
4461  * n: if 0, replace all matches; if > 0, replace only the N'th match.
4462  */
/*
 * NOTE(review): six lines are missing from this listing, at listing lines
 * 4473, 4489, 4510, 4528, 4576 and 4579.  Visible consequences: "buf" is
 * used without a declaration, "escape_status" is read but never assigned,
 * and both branches of the "if (escape_status > 0)" test near the end of the
 * loop have lost their call (only the continuation "start_ptr, data_pos);"
 * of the first one survives).  Restore the lines from the upstream file.
 *
 * Returns a new text value built in buf: the unmatched parts of src_text are
 * copied through, and each replaced match is substituted per replace_text.
 */
4463 text *
4464 replace_text_regexp(text *src_text, text *pattern_text,
4465  text *replace_text,
4466  int cflags, Oid collation,
4467  int search_start, int n)
4468 {
4469  text *ret_text;
4470  regex_t *re;
4471  int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4472  int nmatches = 0;
4474  regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
4475  int nmatch = lengthof(pmatch);
4476  pg_wchar *data;
4477  size_t data_len;
4478  int data_pos;
4479  char *start_ptr;
4480  int escape_status;
4481 
4482  initStringInfo(&buf);
4483 
4484  /* Convert data string to wide characters. */
 /* sized for one wide character per source byte, plus one extra element */
4485  data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4486  data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4487 
4488  /* Check whether replace_text has escapes, especially regexp submatches. */
4490 
4491  /* If no regexp submatches, we can use REG_NOSUB. */
4492  if (escape_status < 2)
4493  {
4494  cflags |= REG_NOSUB;
4495  /* Also tell pg_regexec we only want the whole-match location. */
4496  nmatch = 1;
4497  }
4498 
4499  /* Prepare the regexp. */
4500  re = RE_compile_and_cache(pattern_text, cflags, collation);
4501 
4502  /* start_ptr points to the data_pos'th character of src_text */
4503  start_ptr = (char *) VARDATA_ANY(src_text);
4504  data_pos = 0;
4505 
 /*
  * "<=" lets a search start at the very end of the data.  search_start is
  * an int compared against a size_t; presumably callers never pass a
  * negative value - confirm.
  */
4506  while (search_start <= data_len)
4507  {
4508  int regexec_result;
4509 
4511 
4512  regexec_result = pg_regexec(re,
4513  data,
4514  data_len,
4515  search_start,
4516  NULL, /* no details */
4517  nmatch,
4518  pmatch,
4519  0);
4520 
4521  if (regexec_result == REG_NOMATCH)
4522  break;
4523 
4524  if (regexec_result != REG_OKAY)
4525  {
4526  char errMsg[100];
4527 
4529  pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4530  ereport(ERROR,
4531  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4532  errmsg("regular expression failed: %s", errMsg)));
4533  }
4534 
4535  /*
4536  * Count matches, and decide whether to replace this match.
4537  */
4538  nmatches++;
4539  if (n > 0 && nmatches != n)
4540  {
4541  /*
4542  * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4543  * we treat the matched text as if it weren't matched, and copy it
4544  * to the output later.)
4545  */
4546  search_start = pmatch[0].rm_eo;
 /* step past a zero-length match so the same match is not found again */
4547  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4548  search_start++;
4549  continue;
4550  }
4551 
4552  /*
4553  * Copy the text to the left of the match position. Note we are given
4554  * character not byte indexes.
4555  */
4556  if (pmatch[0].rm_so - data_pos > 0)
4557  {
4558  int chunk_len;
4559 
4560  chunk_len = charlen_to_bytelen(start_ptr,
4561  pmatch[0].rm_so - data_pos);
4562  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4563 
4564  /*
4565  * Advance start_ptr over that text, to avoid multiple rescans of
4566  * it if the replace_text contains multiple back-references.
4567  */
4568  start_ptr += chunk_len;
4569  data_pos = pmatch[0].rm_so;
4570  }
4571 
4572  /*
4573  * Copy the replace_text, processing escapes if any are present.
4574  */
4575  if (escape_status > 0)
4577  start_ptr, data_pos);
4578  else
4580 
4581  /* Advance start_ptr and data_pos over the matched text. */
4582  start_ptr += charlen_to_bytelen(start_ptr,
4583  pmatch[0].rm_eo - data_pos);
4584  data_pos = pmatch[0].rm_eo;
4585 
4586  /*
4587  * If we only want to replace one occurrence, we're done.
4588  */
4589  if (n > 0)
4590  break;
4591 
4592  /*
4593  * Advance search position. Normally we start the next search at the
4594  * end of the previous match; but if the match was of zero length, we
4595  * have to advance by one character, or we'd just find the same match
4596  * again.
4597  */
4598  search_start = data_pos;
4599  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4600  search_start++;
4601  }
4602 
4603  /*
4604  * Copy the text to the right of the last match.
4605  */
4606  if (data_pos < data_len)
4607  {
4608  int chunk_len;
4609 
 /* everything from start_ptr to the end of the source value */
4610  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4611  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4612  }
4613 
4614  ret_text = cstring_to_text_with_len(buf.data, buf.len);
4615  pfree(buf.data);
4616  pfree(data);
4617 
4618  return ret_text;
4619 }
4620 
4621 /*
4622  * split_part
4623  * parse input string based on provided field separator
4624  * return N'th item (1 based, negative counts from end)
 *
 * Arguments are read as: 0 = input string (text), 1 = field separator
 * (text), 2 = field number (int32).
 *
 * Behaviour visible below:
 * - a field number of zero raises ERRCODE_INVALID_PARAMETER_VALUE;
 * - with an empty separator, or a separator that never occurs, the whole
 *   input is returned for field 1 or -1;
 * - a negative field number is handled by counting all separators and
 *   converting to the equivalent positive position.
 *
 * NOTE(review): this listing carries embedded source line numbers and
 * skips some original lines (4627, 4634, 4651, 4660, 4671, 4676,
 * 4696-4697, 4707-4708, 4712, 4730), so the function name line, the
 * declaration of "state" and several statements are not visible here.
4625  */
4626 Datum
4628 {
4629  text *inputstring = PG_GETARG_TEXT_PP(0);
4630  text *fldsep = PG_GETARG_TEXT_PP(1);
4631  int fldnum = PG_GETARG_INT32(2);
4632  int inputstring_len;
4633  int fldsep_len;
4635  char *start_ptr;
4636  char *end_ptr;
4637  text *result_text;
4638  bool found;
4639 
4640  /* field number is 1 based */
4641  if (fldnum == 0)
4642  ereport(ERROR,
4643  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4644  errmsg("field position must not be zero")));
4645 
 /* lengths are payload bytes only (varlena header excluded) */
4646  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4647  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4648 
4649  /* return empty string for empty input string */
4650  if (inputstring_len < 1)
4652 
4653  /* handle empty field separator */
4654  if (fldsep_len < 1)
4655  {
4656  /* if first or last field, return input string, else empty string */
4657  if (fldnum == 1 || fldnum == -1)
4658  PG_RETURN_TEXT_P(inputstring);
4659  else
4661  }
4662 
4663  /* find the first field separator */
4664  text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4665 
4666  found = text_position_next(&state);
4667 
4668  /* special case if fldsep not found at all */
4669  if (!found)
4670  {
4672  /* if first or last field, return input string, else empty string */
4673  if (fldnum == 1 || fldnum == -1)
4674  PG_RETURN_TEXT_P(inputstring);
4675  else
4677  }
4678 
4679  /*
4680  * take care of a negative field number (i.e. count from the right) by
4681  * converting to a positive field number; we need total number of fields
4682  */
4683  if (fldnum < 0)
4684  {
4685  /* we found a fldsep, so there are at least two fields */
4686  int numfields = 2;
4687 
 /* each further separator found adds one more field */
4688  while (text_position_next(&state))
4689  numfields++;
4690 
4691  /* special case of last field does not require an extra pass */
4692  if (fldnum == -1)
4693  {
 /* last field runs from just after the final match to end of input */
4694  start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4695  end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4698  end_ptr - start_ptr));
4699  }
4700 
4701  /* else, convert fldnum to positive notation */
 /* e.g. -1 maps to numfields and -numfields maps to 1 */
4702  fldnum += numfields + 1;
4703 
4704  /* if nonexistent field, return empty string */
4705  if (fldnum <= 0)
4706  {
4709  }
4710 
4711  /* reset to pointing at first match, but now with positive fldnum */
4713  found = text_position_next(&state);
4714  Assert(found);
4715  }
4716 
4717  /* identify bounds of first field */
4718  start_ptr = VARDATA_ANY(inputstring);
4719  end_ptr = text_position_get_match_ptr(&state);
4720 
 /*
  * Step forward one field per iteration.  The loop ends either because
  * fldnum reached zero (requested field is bounded by end_ptr) or because
  * no further separator exists (fldnum stays positive).
  */
4721  while (found && --fldnum > 0)
4722  {
4723  /* identify bounds of next field */
4724  start_ptr = end_ptr + fldsep_len;
4725  found = text_position_next(&state);
4726  if (found)
4727  end_ptr = text_position_get_match_ptr(&state);
4728  }
4729 
4731 
4732  if (fldnum > 0)
4733  {
4734  /* N'th field separator not found */
4735  /* if last field requested, return it, else empty string */
4736  if (fldnum == 1)
4737  {
 /* last_len = bytes that precede the final field */
4738  int last_len = start_ptr - VARDATA_ANY(inputstring);
4739 
4740  result_text = cstring_to_text_with_len(start_ptr,
4741  inputstring_len - last_len);
4742  }
4743  else
4744  result_text = cstring_to_text("");
4745  }
4746  else
4747  {
4748  /* non-last field requested */
4749  result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4750  }
4751 
4752  PG_RETURN_TEXT_P(result_text);
4753 }
4754 
4755 /*
4756  * Convenience function to return true when two text params are equal.
 *
 * The comparison receives "collid" together with both values wrapped as
 * pointer Datums.
 *
 * NOTE(review): original line 4761, which opens the return expression and
 * names the comparison function being invoked, is missing from this
 * listing.
4757  */
4758 static bool
4759 text_isequal(text *txt1, text *txt2, Oid collid)
4760 {
4762  collid,
4763  PointerGetDatum(txt1),
4764  PointerGetDatum(txt2)));
4765 }
4766 
4767 /*
4768  * text_to_array
4769  * parse input string and return text array of elements,
4770  * based on provided field separator
 *
 * Delegates the actual splitting to split_text(); a false result from it
 * is turned into a SQL NULL.
 *
 * NOTE(review): original lines 4773, 4784 and 4786-4787 (function name
 * line, the empty-result return and the final return) are missing from
 * this listing.
4771  */
4772 Datum
4774 {
4775  SplitTextOutputData tstate;
4776 
4777  /* For array output, tstate should start as all zeroes */
4778  memset(&tstate, 0, sizeof(tstate));
4779 
4780  if (!split_text(fcinfo, &tstate))
4781  PG_RETURN_NULL();
4782 
 /* astate still NULL means split_text accumulated no elements */
4783  if (tstate.astate == NULL)
4785 
4788 }
4789 
4790 /*
4791  * text_to_array_null
4792  * parse input string and return text array of elements,
4793  * based on provided field separator and null string
4794  *
4795  * This is a separate entry point only to prevent the regression tests from
4796  * complaining about different argument sets for the same internal function.
 *
 * The body simply forwards fcinfo to text_to_array().
 *
 * NOTE(review): original line 4799 (function name line) is missing from
 * this listing.
4797  */
4798 Datum
4800 {
4801  return text_to_array(fcinfo);
4802 }
4803 
4804 /*
4805  * text_to_table
4806  * parse input string and return table of elements,
4807  * based on provided field separator
 *
 * Output rows are not returned through the Datum result (which is always
 * zero here); instead the tuplestore and tuple descriptor found in the
 * caller-supplied ReturnSetInfo are handed to split_text() via tstate.
 *
 * NOTE(review): original lines 4810 (function name line) and 4816 are
 * missing from this listing; line 4816 presumably prepares rsi->setResult
 * and rsi->setDesc before they are read -- confirm against the full file.
4808  */
4809 Datum
4811 {
4812  ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4813  SplitTextOutputData tstate;
4814 
4815  tstate.astate = NULL;
4817  tstate.tupstore = rsi->setResult;
4818  tstate.tupdesc = rsi->setDesc;
4819 
 /* split_text's boolean result is deliberately ignored here */
4820  (void) split_text(fcinfo, &tstate);
4821 
4822  return (Datum) 0;
4823 }
4824 
4825 /*
4826  * text_to_table_null
4827  * parse input string and return table of elements,
4828  * based on provided field separator and null string
4829  *
4830  * This is a separate entry point only to prevent the regression tests from
4831  * complaining about different argument sets for the same internal function.
 *
 * The body simply forwards fcinfo to text_to_table().
 *
 * NOTE(review): original line 4834 (function name line) is missing from
 * this listing.
4832  */
4833 Datum
4835 {
4836  return text_to_table(fcinfo);
4837 }
4838 
4839 /*
4840  * Common code for text_to_array, text_to_array_null, text_to_table
4841  * and text_to_table_null functions.
4842  *
4843  * These are not strict so we have to test for null inputs explicitly.
4844  * Returns false if result is to be null, else returns true.
4845  *
4846  * Note that if the result is valid but empty (zero elements), we return
4847  * without changing *tstate --- caller must handle that case, too.
 *
 * Argument layout read from fcinfo: 0 = input string, 1 = field separator
 * (may be NULL), 2 = optional null_string (may be NULL or absent).
 *
 * Two modes are implemented:
 * - non-NULL separator: fields are the byte ranges between successive
 *   separator matches; an empty separator yields the whole input as one
 *   element;
 * - NULL separator: every character (as measured by pg_mblen) becomes its
 *   own element.
 * Each field is passed to split_text_accum_result(), which also applies
 * the null_string comparison.
 *
 * NOTE(review): original lines 4850 (function signature), 4885 (the
 * declaration of "state"), 4912, 4943 and 4960 are missing from this
 * listing.
4848  */
4849 static bool
4851 {
4852  text *inputstring;
4853  text *fldsep;
4854  text *null_string;
4855  Oid collation = PG_GET_COLLATION();
4856  int inputstring_len;
4857  int fldsep_len;
4858  char *start_ptr;
4859  text *result_text;
4860 
4861  /* when input string is NULL, then result is NULL too */
4862  if (PG_ARGISNULL(0))
4863  return false;
4864 
4865  inputstring = PG_GETARG_TEXT_PP(0);
4866 
4867  /* fldsep can be NULL */
4868  if (!PG_ARGISNULL(1))
4869  fldsep = PG_GETARG_TEXT_PP(1);
4870  else
4871  fldsep = NULL;
4872 
4873  /* null_string can be NULL or omitted */
4874  if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4875  null_string = PG_GETARG_TEXT_PP(2);
4876  else
4877  null_string = NULL;
4878 
4879  if (fldsep != NULL)
4880  {
4881  /*
4882  * Normal case with non-null fldsep. Use the text_position machinery
4883  * to search for occurrences of fldsep.
4884  */
4886 
4887  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4888  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4889 
4890  /* return empty set for empty input string */
4891  if (inputstring_len < 1)
4892  return true;
4893 
4894  /* empty field separator: return input string as a one-element set */
4895  if (fldsep_len < 1)
4896  {
4897  split_text_accum_result(tstate, inputstring,
4898  null_string, collation);
4899  return true;
4900  }
4901 
4902  text_position_setup(inputstring, fldsep, collation, &state);
4903 
 /* start_ptr always marks the first byte of the field being collected */
4904  start_ptr = VARDATA_ANY(inputstring);
4905 
4906  for (;;)
4907  {
4908  bool found;
4909  char *end_ptr;
4910  int chunk_len;
4911 
4913 
4914  found = text_position_next(&state);
4915  if (!found)
4916  {
4917  /* fetch last field */
 /* everything from start_ptr up to the end of the varlena */
4918  chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4919  end_ptr = NULL; /* not used, but some compilers complain */
4920  }
4921  else
4922  {
4923  /* fetch non-last field */
4924  end_ptr = text_position_get_match_ptr(&state);
4925  chunk_len = end_ptr - start_ptr;
4926  }
4927 
4928  /* build a temp text datum to pass to split_text_accum_result */
4929  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4930 
4931  /* stash away this field */
4932  split_text_accum_result(tstate, result_text,
4933  null_string, collation);
4934 
4935  pfree(result_text);
4936 
 /* the field just emitted was the last one */
4937  if (!found)
4938  break;
4939 
 /* next field begins immediately after the matched separator */
4940  start_ptr = end_ptr + fldsep_len;
4941  }
4942 
4944  }
4945  else
4946  {
4947  /*
4948  * When fldsep is NULL, each character in the input string becomes a
4949  * separate element in the result set. The separator is effectively
4950  * the space between characters.
4951  */
4952  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4953 
4954  start_ptr = VARDATA_ANY(inputstring);
4955 
 /* inputstring_len counts down the bytes still to be consumed */
4956  while (inputstring_len > 0)
4957  {
4958  int chunk_len = pg_mblen(start_ptr);
4959 
4961 
4962  /* build a temp text datum to pass to split_text_accum_result */
4963  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4964 
4965  /* stash away this field */
4966  split_text_accum_result(tstate, result_text,
4967  null_string, collation);
4968 
4969  pfree(result_text);
4970 
4971  start_ptr += chunk_len;
4972  inputstring_len -= chunk_len;
4973  }
4974  }
4975 
4976  return true;
4977 }
4978 
4979 /*
4980  * Add text item to result set (table or array).
4981  *
4982  * This is also responsible for checking to see if the item matches
4983  * the null_string, in which case we should emit NULL instead.
 *
 * The destination is chosen by tstate->tupstore: when it is set, the item
 * goes out as a one-column row described by tstate->tupdesc; otherwise it
 * is appended to the array build state in tstate->astate via
 * accumArrayResult() with element type TEXTOID.
 *
 * NOTE(review): original lines 4986 (start of the signature, including the
 * tstate parameter), 5004 (start of the call that stores the row) and
 * 5015 (final argument of accumArrayResult) are missing from this listing.
4984  */
4985 static void
4987  text *field_value,
4988  text *null_string,
4989  Oid collation)
4990 {
4991  bool is_null = false;
4992 
 /* a NULL null_string disables the comparison entirely */
4993  if (null_string && text_isequal(field_value, null_string, collation))
4994  is_null = true;
4995 
4996  if (tstate->tupstore)
4997  {
 /* single-column row: one value slot and one null flag */
4998  Datum values[1];
4999  bool nulls[1];
5000 
5001  values[0] = PointerGetDatum(field_value);
5002  nulls[0] = is_null;
5003 
5005  tstate->tupdesc,
5006  values,
5007  nulls);
5008  }
5009  else
5010  {
5011  tstate->astate = accumArrayResult(tstate->astate,
5012  PointerGetDatum(field_value),
5013  is_null,
5014  TEXTOID,
5016  }
5017 }
5018 
5019 /*
5020  * array_to_text
5021  * concatenate Cstring representation of input array elements
5022  * using provided field separator
 *
 * Argument 1 (the separator) is converted to a C string and passed to
 * array_to_text_internal() with a NULL null_string.
 *
 * NOTE(review): original lines 5025 (function name line) and 5027 (the
 * declaration of "v") are missing from this listing.
5023  */
5024 Datum
5026 {
5028  char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5029 
5030  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
5031 }
5032 
5033 /*
5034  * array_to_text_null
5035  * concatenate Cstring representation of input array elements
5036  * using provided field separator and null string
5037  *
5038  * This version is not strict so we have to test for null inputs explicitly.
 *
 * Argument layout: 0 = array, 1 = separator, 2 = null string.  A NULL in
 * argument 0 or 1 gives a NULL result; a NULL in argument 2 is forwarded
 * to array_to_text_internal() as a null pointer.
 *
 * NOTE(review): original line 5041 (function name line) is missing from
 * this listing.
5039  */
5040 Datum
5042 {
5043  ArrayType *v;
5044  char *fldsep;
5045  char *null_string;
5046 
5047  /* returns NULL when first or second parameter is NULL */
5048  if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
5049  PG_RETURN_NULL();
5050 
5051  v = PG_GETARG_ARRAYTYPE_P(0);
5052  fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5053 
5054  /* NULL null string is passed through as a null pointer */
5055  if (!PG_ARGISNULL(2))
5056  null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5057  else
5058  null_string = NULL;
5059 
5060  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5061 }
5062 
5063 /*
5064  * common code for array_to_text and array_to_text_null functions
 *
 * Walks all elements of "v" in storage order, converts each non-null
 * element with the element type's output function, and joins the pieces
 * with "fldsep".  Null elements are skipped when null_string is NULL and
 * are rendered as null_string otherwise.  An array with zero elements
 * yields an empty text value.
 *
 * Element-type lookup results are cached in fcinfo->flinfo->fn_extra.
 *
 * NOTE(review): original lines 5067 (start of the signature), 5078 (the
 * declaration of "buf") and 5159 (the statement under the "else" that
 * handles the first printed value) are missing from this listing.
5065  */
5066 static text *
5068  const char *fldsep, const char *null_string)
5069 {
5070  text *result;
5071  int nitems,
5072  *dims,
5073  ndims;
5074  Oid element_type;
5075  int typlen;
5076  bool typbyval;
5077  char typalign;
5079  bool printed = false;
5080  char *p;
5081  bits8 *bitmap;
5082  int bitmask;
5083  int i;
5084  ArrayMetaState *my_extra;
5085 
5086  ndims = ARR_NDIM(v);
5087  dims = ARR_DIMS(v);
5088  nitems = ArrayGetNItems(ndims, dims);
5089 
5090  /* if there are no elements, return an empty string */
5091  if (nitems == 0)
5092  return cstring_to_text_with_len("", 0);
5093 
5094  element_type = ARR_ELEMTYPE(v);
5095  initStringInfo(&buf);
5096 
5097  /*
5098  * We arrange to look up info about element type, including its output
5099  * conversion proc, only once per series of calls, assuming the element
5100  * type doesn't change underneath us.
5101  */
5102  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5103  if (my_extra == NULL)
5104  {
5105  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5106  sizeof(ArrayMetaState));
5107  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
 /* store a value guaranteed to differ so the lookup below runs */
5108  my_extra->element_type = ~element_type;
5109  }
5110 
5111  if (my_extra->element_type != element_type)
5112  {
5113  /*
5114  * Get info about element type, including its output conversion proc
5115  */
5116  get_type_io_data(element_type, IOFunc_output,
5117  &my_extra->typlen, &my_extra->typbyval,
5118  &my_extra->typalign, &my_extra->typdelim,
5119  &my_extra->typioparam, &my_extra->typiofunc);
5120  fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5121  fcinfo->flinfo->fn_mcxt);
5122  my_extra->element_type = element_type;
5123  }
5124  typlen = my_extra->typlen;
5125  typbyval = my_extra->typbyval;
5126  typalign = my_extra->typalign;
5127 
 /* p walks the element data; bitmap/bitmask walk the null bitmap */
5128  p = ARR_DATA_PTR(v);
5129  bitmap = ARR_NULLBITMAP(v);
5130  bitmask = 1;
5131 
5132  for (i = 0; i < nitems; i++)
5133  {
5134  Datum itemvalue;
5135  char *value;
5136 
5137  /* Get source element, checking for NULL */
 /* a cleared bit in the bitmap marks a null element */
5138  if (bitmap && (*bitmap & bitmask) == 0)
5139  {
5140  /* if null_string is NULL, we just ignore null elements */
5141  if (null_string != NULL)
5142  {
 /* separator goes before every item except the first one printed */
5143  if (printed)
5144  appendStringInfo(&buf, "%s%s", fldsep, null_string);
5145  else
5146  appendStringInfoString(&buf, null_string);
5147  printed = true;
5148  }
5149  }
5150  else
5151  {
5152  itemvalue = fetch_att(p, typbyval, typlen);
5153 
5154  value = OutputFunctionCall(&my_extra->proc, itemvalue);
5155 
5156  if (printed)
5157  appendStringInfo(&buf, "%s%s", fldsep, value);
5158  else
5160  printed = true;
5161 
 /* only non-null elements advance the data pointer */
5162  p = att_addlength_pointer(p, typlen, p);
5163  p = (char *) att_align_nominal(p, typalign);
5164  }
5165 
5166  /* advance bitmap pointer if any */
5167  if (bitmap)
5168  {
5169  bitmask <<= 1;
 /* after eight bits, move on to the next bitmap byte */
5170  if (bitmask == 0x100)
5171  {
5172  bitmap++;
5173  bitmask = 1;
5174  }
5175  }
5176  }
5177 
5178  result = cstring_to_text_with_len(buf.data, buf.len);
5179  pfree(buf.data);
5180 
5181  return result;
5182 }
5183 
5184 #define HEXBASE 16
5185 /*
5186  * Convert an int32 to a string containing a base 16 (hex) representation of
5187  * the number.
 *
 * Digits are generated least-significant first and written backwards from
 * the end of a local buffer, so "ptr" ends up at the first digit.  Because
 * the loop is do/while, at least one digit is always produced.
 *
 * NOTE(review): original lines 5190 (function name line), 5192 (the
 * declaration of "value") and 5206 (the return statement) are missing
 * from this listing.
5188  */
5189 Datum
5191 {
5193  char *ptr;
5194  const char *digits = "0123456789abcdef";
5195  char buf[32]; /* bigger than needed, but reasonable */
5196 
 /* terminate the string at the last byte, then fill towards the front */
5197  ptr = buf + sizeof(buf) - 1;
5198  *ptr = '\0';
5199 
5200  do
5201  {
5202  *--ptr = digits[value % HEXBASE];
5203  value /= HEXBASE;
5204  } while (ptr > buf && value);
5205 
5207 }
5208 
5209 /*
5210  * Convert an int64 to a string containing a base 16 (hex) representation of
5211  * the number.
 *
 * The argument is reinterpreted as uint64 before conversion.  Digits are
 * generated least-significant first and written backwards from the end of
 * a local buffer; the do/while loop guarantees at least one digit.
 *
 * NOTE(review): original lines 5214 (function name line) and 5230 (the
 * return statement) are missing from this listing.
5212  */
5213 Datum
5215 {
5216  uint64 value = (uint64) PG_GETARG_INT64(0);
5217  char *ptr;
5218  const char *digits = "0123456789abcdef";
5219  char buf[32]; /* bigger than needed, but reasonable */
5220 
 /* terminate the string at the last byte, then fill towards the front */
5221  ptr = buf + sizeof(buf) - 1;
5222  *ptr = '\0';
5223 
5224  do
5225  {
5226  *--ptr = digits[value % HEXBASE];
5227  value /= HEXBASE;
5228  } while (ptr > buf && value);
5229 
5231 }
5232 
5233 /*
5234  * Return the size of a datum, possibly compressed
5235  *
5236  * Works on any data type
 *
 * The argument type's typlen is looked up once and cached in fn_extra.
 * The result then depends on typlen:
 *   -1    -> toast_datum_size() of the value
 *   -2    -> strlen() of the C string plus one for the terminator
 *   other -> typlen itself
 *
 * NOTE(review): original lines 5239 (function name line) and 5241 (the
 * declaration of "value") are missing from this listing.
5237  */
5238 Datum
5240 {
5242  int32 result;
5243  int typlen;
5244 
5245  /* On first call, get the input type's typlen, and save at *fn_extra */
5246  if (fcinfo->flinfo->fn_extra == NULL)
5247  {
5248  /* Lookup the datatype of the supplied argument */
5249  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5250 
5251  typlen = get_typlen(argtypeid);
5252  if (typlen == 0) /* should not happen */
5253  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5254 
 /* the cache lives in fn_mcxt and holds just the one int */
5255  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5256  sizeof(int));
5257  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5258  }
5259  else
5260  typlen = *((int *) fcinfo->flinfo->fn_extra);
5261 
5262  if (typlen == -1)
5263  {
5264  /* varlena type, possibly toasted */
5265  result = toast_datum_size(value);
5266  }
5267  else if (typlen == -2)
5268  {
5269  /* cstring */
5270  result = strlen(DatumGetCString(value)) + 1;
5271  }
5272  else
5273  {
5274  /* ordinary fixed-width type */
5275  result = typlen;
5276  }
5277 
5278  PG_RETURN_INT32(result);
5279 }
5280 
5281 /*
5282  * Return the compression method stored in the compressed attribute. Return
5283  * NULL for non varlena type or uncompressed data.
 *
 * The argument type's typlen is looked up once and cached in fn_extra;
 * any typlen other than -1 gives a NULL result.  Otherwise the compression
 * id is fetched with toast_get_compression_id() and mapped to the name
 * "pglz" or "lz4"; an unexpected id raises an error.
 *
 * NOTE(review): original lines 5286 (function name line), 5314 (the
 * argument passed to toast_get_compression_id), 5321 and 5324 (the two
 * "case" labels) and 5331 (the return statement) are missing from this
 * listing.
5284  */
5285 Datum
5287 {
5288  int typlen;
5289  char *result;
5290  ToastCompressionId cmid;
5291 
5292  /* On first call, get the input type's typlen, and save at *fn_extra */
5293  if (fcinfo->flinfo->fn_extra == NULL)
5294  {
5295  /* Lookup the datatype of the supplied argument */
5296  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5297 
5298  typlen = get_typlen(argtypeid);
5299  if (typlen == 0) /* should not happen */
5300  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5301 
 /* the cache lives in fn_mcxt and holds just the one int */
5302  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5303  sizeof(int));
5304  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5305  }
5306  else
5307  typlen = *((int *) fcinfo->flinfo->fn_extra);
5308 
 /* only typlen -1 (varlena) values are examined further */
5309  if (typlen != -1)
5310  PG_RETURN_NULL();
5311 
5312  /* get the compression method id stored in the compressed varlena */
5313  cmid = toast_get_compression_id((struct varlena *)
5315  if (cmid == TOAST_INVALID_COMPRESSION_ID)
5316  PG_RETURN_NULL();
5317 
5318  /* convert compression method id to compression method name */
5319  switch (cmid)
5320  {
5322  result = "pglz";
5323  break;
5325  result = "lz4";
5326  break;
5327  default:
5328  elog(ERROR, "invalid compression method id %d", cmid);
5329  }
5330 
5332 }
5333 
5334 /*
5335  * string_agg - Concatenates values and returns string.
5336  *
5337  * Syntax: string_agg(value text, delimiter text) RETURNS text
5338  *
5339  * Note: Any NULL values are ignored. The first-call delimiter isn't
5340  * actually used at all, and on subsequent calls the delimiter precedes
5341  * the associated value.
5342  */
5343 
5344 /* subroutine to initialize state */
/*
 * Allocates a fresh StringInfo inside the aggregate memory context reported
 * by AggCheckCallContext() and returns it; raises an error when not called
 * in an aggregate context.
 *
 * NOTE(review): original line 5346 (function name and parameter list) is
 * missing from this listing.
 */
5345 static StringInfo
5347 {
5348  StringInfo state;
5349  MemoryContext aggcontext;
5350  MemoryContext oldcontext;
5351 
5352  if (!AggCheckCallContext(fcinfo, &aggcontext))
5353  {
5354  /* cannot be called directly because of internal-type argument */
5355  elog(ERROR, "string_agg_transfn called in non-aggregate context");
5356  }
5357 
5358  /*
5359  * Create state in aggregate context. It'll stay there across subsequent
5360  * calls.
5361  */
5362  oldcontext = MemoryContextSwitchTo(aggcontext);
5363  state = makeStringInfo();
 /* restore the caller's context before returning */
5364  MemoryContextSwitchTo(oldcontext);
5365 
5366  return state;
5367 }
5368 
/*
 * Transition step for string_agg.
 *
 * Argument layout: 0 = current state (NULL before the first value),
 * 1 = value, 2 = delimiter.  A NULL value leaves the state untouched.  For
 * a non-NULL value the state is created on first use; on later calls a
 * non-NULL delimiter is appended before the value.
 *
 * NOTE(review): original lines 5370 (function name line) and 5392 (the
 * return statement) are missing from this listing.
 */
5369 Datum
5371 {
5372  StringInfo state;
5373 
5374  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5375 
5376  /* Append the value unless null. */
5377  if (!PG_ARGISNULL(1))
5378  {
5379  /* On the first time through, we ignore the delimiter. */
5380  if (state == NULL)
5381  state = makeStringAggState(fcinfo);
5382  else if (!PG_ARGISNULL(2))
5383  appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
5384 
5385  appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
5386  }
5387 
5388  /*
5389  * The transition type for string_agg() is declared to be "internal",
5390  * which is a pass-by-value type the same size as a pointer.
5391  */
5393 }
5394 
/*
 * Final step for string_agg.
 *
 * Reads the accumulated state from argument 0; when no state exists the
 * result is SQL NULL.
 *
 * NOTE(review): original lines 5396 (function name line) and 5406 (the
 * statement that returns the accumulated text when state is non-NULL) are
 * missing from this listing.
 */
5395 Datum
5397 {
5398  StringInfo state;
5399 
5400  /* cannot be called directly because of internal-type argument */
5401  Assert(AggCheckCallContext(fcinfo, NULL));
5402 
5403  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5404 
5405  if (state != NULL)
5407  else
5408  PG_RETURN_NULL();
5409 }
5410 
5411 /*
5412  * Prepare cache with fmgr info for the output functions of the datatypes of
5413  * the arguments of a concat-like function, beginning with argument "argidx".
5414  * (Arguments before that will have corresponding slots in the resulting
5415  * FmgrInfo array, but we don't fill those slots.)
 *
 * The array has PG_NARGS() entries, is allocated in fn_mcxt, and is stored
 * in fcinfo->flinfo->fn_extra as well as being returned.  An argument
 * whose type cannot be determined raises an error.
 *
 * NOTE(review): original line 5418 (function name and parameter list) is
 * missing from this listing.
5416  */
5417 static FmgrInfo *
5419 {
5420  FmgrInfo *foutcache;
5421  int i;
5422 
5423  /* We keep the info in fn_mcxt so it survives across calls */
5424  foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5425  PG_NARGS() * sizeof(FmgrInfo));
5426 
5427  for (i = argidx; i < PG_NARGS(); i++)
5428  {
5429  Oid valtype;
5430  Oid typOutput;
5431  bool typIsVarlena;
5432 
5433  valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5434  if (!OidIsValid(valtype))
5435  elog(ERROR, "could not determine data type of concat() input");
5436 
 /* typIsVarlena is fetched but not used afterwards */
5437  getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5438  fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5439  }
5440 
5441  fcinfo->flinfo->fn_extra = foutcache;
5442 
5443  return foutcache;
5444 }
5445 
5446 /*
5447  * Implementation of both concat() and concat_ws().
5448  *
5449  * sepstr is the separator string to place between values.
5450  * argidx identifies the first argument to concatenate (counting from zero);
5451  * note that this must be constant across any one series of calls.
5452  *
5453  * Returns NULL if result should be NULL, else text value.
 *
 * Two paths exist.  When the call is marked VARIADIC, the single array
 * argument is handed to array_to_text_internal() (a NULL array gives a
 * NULL result).  Otherwise each non-NULL argument from argidx onwards is
 * converted with its cached output function and appended, with sepstr
 * between consecutive appended values.
 *
 * NOTE(review): original lines 5460 (the declaration of "str"), 5488 (the
 * Assert mentioned in the comment), 5512 (the declaration of "value") and
 * 5521 (start of the append call) are missing from this listing.
5454  */
5455 static text *
5456 concat_internal(const char *sepstr, int argidx,
5457  FunctionCallInfo fcinfo)
5458 {
5459  text *result;
5461  FmgrInfo *foutcache;
5462  bool first_arg = true;
5463  int i;
5464 
5465  /*
5466  * concat(VARIADIC some-array) is essentially equivalent to
5467  * array_to_text(), ie concat the array elements with the given separator.
5468  * So we just pass the case off to that code.
5469  */
5470  if (get_fn_expr_variadic(fcinfo->flinfo))
5471  {
5472  ArrayType *arr;
5473 
5474  /* Should have just the one argument */
5475  Assert(argidx == PG_NARGS() - 1);
5476 
5477  /* concat(VARIADIC NULL) is defined as NULL */
5478  if (PG_ARGISNULL(argidx))
5479  return NULL;
5480 
5481  /*
5482  * Non-null argument had better be an array. We assume that any call
5483  * context that could let get_fn_expr_variadic return true will have
5484  * checked that a VARIADIC-labeled parameter actually is an array. So
5485  * it should be okay to just Assert that it's an array rather than
5486  * doing a full-fledged error check.
5487  */
5489 
5490  /* OK, safe to fetch the array value */
5491  arr = PG_GETARG_ARRAYTYPE_P(argidx);
5492 
5493  /*
5494  * And serialize the array. We tell array_to_text to ignore null
5495  * elements, which matches the behavior of the loop below.
5496  */
5497  return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5498  }
5499 
5500  /* Normal case without explicit VARIADIC marker */
5501  initStringInfo(&str);
5502 
5503  /* Get output function info, building it if first time through */
5504  foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5505  if (foutcache == NULL)
5506  foutcache = build_concat_foutcache(fcinfo, argidx);
5507 
5508  for (i = argidx; i < PG_NARGS(); i++)
5509  {
 /* NULL arguments contribute neither a value nor a separator */
5510  if (!PG_ARGISNULL(i))
5511  {
5513 
5514  /* add separator if appropriate */
5515  if (first_arg)
5516  first_arg = false;
5517  else
5518  appendStringInfoString(&str, sepstr);
5519 
5520  /* call the appropriate type output function, append the result */
5522  OutputFunctionCall(&foutcache[i], value));
5523  }
5524  }
5525 
5526  result = cstring_to_text_with_len(str.data, str.len);
5527  pfree(str.data);
5528 
5529  return result;
5530 }
5531 
5532 /*
5533  * Concatenate all arguments. NULL arguments are ignored.
 *
 * Calls concat_internal() with an empty separator starting at argument 0,
 * and maps a NULL pointer result to SQL NULL.
 *
 * NOTE(review): original line 5536 (function name line) is missing from
 * this listing.
5534  */
5535 Datum
5537 {
5538  text *result;
5539 
5540  result = concat_internal("", 0, fcinfo);
5541  if (result == NULL)
5542  PG_RETURN_NULL();
5543  PG_RETURN_TEXT_P(result);
5544 }
5545 
5546 /*
5547  * Concatenate all but first argument value with separators. The first
5548  * parameter is used as the separator. NULL arguments are ignored.
 *
 * A NULL separator gives a NULL result.  Otherwise concat_internal() is
 * called starting at argument 1, and a NULL pointer result is mapped to
 * SQL NULL.
 *
 * NOTE(review): original lines 5551 (function name line) and 5559 (the
 * assignment of "sep") are missing from this listing.
5549  */
5550 Datum
5552 {
5553  char *sep;
5554  text *result;
5555 
5556  /* return NULL when separator is NULL */
5557  if (PG_ARGISNULL(0))
5558  PG_RETURN_NULL();
5560 
5561  result = concat_internal(sep, 1, fcinfo);
5562  if (result == NULL)
5563  PG_RETURN_NULL();
5564  PG_RETURN_TEXT_P(result);
5565 }
5566 
5567 /*
5568  * Return first n characters in the string. When n is negative,
5569  * return all but last |n| characters.
 *
 * For negative n the character count of the string is added to n to get
 * the number of leading characters to keep, and pg_mbcharcliplen() turns
 * that into a byte length.
 *
 * NOTE(review): original lines 5572 (function name line), 5585 (the
 * return for the negative case) and 5588 (the statement for the
 * non-negative case) are missing from this listing.
5570  */
5571 Datum
5573 {
5574  int n = PG_GETARG_INT32(1);
5575 
5576  if (n < 0)
5577  {
5578  text *str = PG_GETARG_TEXT_PP(0);
5579  const char *p = VARDATA_ANY(str);
5580  int len = VARSIZE_ANY_EXHDR(str);
5581  int rlen;
5582 
 /* n becomes the number of characters to keep (may still be negative) */
5583  n = pg_mbstrlen_with_len(p, len) + n;
5584  rlen = pg_mbcharcliplen(p, len, n);
5586  }
5587  else
5589 }
5590 
5591 /*
5592  * Return last n characters in the string. When n is negative,
5593  * return all but first |n| characters.
 *
 * In both cases n is converted into the number of leading characters to
 * skip, and pg_mbcharcliplen() turns that into the byte offset "off".
 *
 * NOTE(review): "n = -n" is a signed overflow when n is INT_MIN, so that
 * input may not behave as "skip all characters" -- confirm and guard if
 * needed.
 *
 * NOTE(review): original lines 5596 (function name line) and 5610 (the
 * return statement that uses "off") are missing from this listing.
5594  */
5595 Datum
5597 {
5598  text *str = PG_GETARG_TEXT_PP(0);
5599  const char *p = VARDATA_ANY(str);
5600  int len = VARSIZE_ANY_EXHDR(str);
5601  int n = PG_GETARG_INT32(1);
5602  int off;
5603 
5604  if (n < 0)
5605  n = -n;
5606  else
5607  n = pg_mbstrlen_with_len(p, len) - n;
5608  off = pg_mbcharcliplen(p, len, n);
5609 
5611 }
5612 
5613 /*
5614  * Return reversed string
 *
 * The result has the same byte length as the input.  "dst" starts one past
 * the end of the result payload and moves backwards while the source is
 * read forwards.  In the multibyte branch whole characters (pg_mblen bytes
 * each) are copied intact, so only the order of characters is reversed,
 * not the bytes within a character.
 *
 * NOTE(review): original lines 5617 (function name line) and 5630 (the
 * condition that selects the multibyte branch) are missing from this
 * listing.
5615  */
5616 Datum
5618 {
5619  text *str = PG_GETARG_TEXT_PP(0);
5620  const char *p = VARDATA_ANY(str);
5621  int len = VARSIZE_ANY_EXHDR(str);
5622  const char *endp = p + len;
5623  text *result;
5624  char *dst;
5625 
5626  result = palloc(len + VARHDRSZ);
5627  dst = (char *) VARDATA(result) + len;
5628  SET_VARSIZE(result, len + VARHDRSZ);
5629 
5631  {
5632  /* multibyte version */
5633  while (p < endp)
5634  {
5635  int sz;
5636 
5637  sz = pg_mblen(p);
 /* reserve room first, then copy the character's bytes in order */
5638  dst -= sz;
5639  memcpy(dst, p, sz);
5640  p += sz;
5641  }
5642  }
5643  else
5644  {
5645  /* single byte version */
5646  while (p < endp)
5647  *(--dst) = *p++;
5648  }
5649 
5650  PG_RETURN_TEXT_P(result);
5651 }
5652 
5653 
5654 /*
5655  * Support macros for text_format()
 *
 * TEXT_FORMAT_FLAG_MINUS is a bit in the "flags" value of a format
 * specifier.
 *
 * ADVANCE_PARSE_POINTER pre-increments "ptr" and raises
 * ERRCODE_INVALID_PARAMETER_VALUE ("unterminated format() type specifier")
 * if that moves it to or past "end_ptr".  Note that "ptr" is modified in
 * place, so it must be an lvalue.
5656  */
5657 #define TEXT_FORMAT_FLAG_MINUS 0x0001 /* is minus flag present? */
5658 
5659 #define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
5660  do { \
5661  if (++(ptr) >= (end_ptr)) \
5662  ereport(ERROR, \
5663  (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
5664  errmsg("unterminated format() type specifier"), \
5665  errhint("For a single \"%%\" use \"%%%%\"."))); \
5666  } while (0)
5667 
5668 /*
5669  * Returns a formatted string
 *
 * Argument 0 is the format string; a NULL format gives a NULL result.  The
 * remaining values come either from the ordinary argument list or, when
 * the call is marked VARIADIC, from the elements of the array passed as
 * argument 1 (a NULL array is treated as having no elements).
 *
 * The format string is scanned byte by byte.  Text outside specifiers is
 * copied through, "%%" is handled as a literal percent, and every other
 * "%" introduces a specifier whose optional parts are parsed by
 * text_format_parse_format() and whose final character must be one of
 * 's', 'I' or 'L'.  A width may be taken from an argument; the value
 * itself is rendered by text_format_string_conversion().
 *
 * "arg" is the 1-based index of the next argument to consume; explicit
 * positions in a specifier overwrite it.  Running past the available
 * arguments raises "too few arguments for format()".
 *
 * NOTE(review): original lines 5672 (function name line), 5675 (the
 * declaration of "str"), 5719 (the Assert mentioned in the comment), 5768
 * and 5777 (the statements that emit a literal character) and 5815 and
 * 5872 (the assignments of "value" in the non-variadic case) are missing
 * from this listing.
5670  */
5671 Datum
5673 {
5674  text *fmt;
5676  const char *cp;
5677  const char *start_ptr;
5678  const char *end_ptr;
5679  text *result;
5680  int arg;
5681  bool funcvariadic;
5682  int nargs;
5683  Datum *elements = NULL;
5684  bool *nulls = NULL;
5685  Oid element_type = InvalidOid;
5686  Oid prev_type = InvalidOid;
5687  Oid prev_width_type = InvalidOid;
5688  FmgrInfo typoutputfinfo;
5689  FmgrInfo typoutputinfo_width;
5690 
5691  /* When format string is null, immediately return null */
5692  if (PG_ARGISNULL(0))
5693  PG_RETURN_NULL();
5694 
5695  /* If argument is marked VARIADIC, expand array into elements */
5696  if (get_fn_expr_variadic(fcinfo->flinfo))
5697  {
5698  ArrayType *arr;
5699  int16 elmlen;
5700  bool elmbyval;
5701  char elmalign;
5702  int nitems;
5703 
5704  /* Should have just the one argument */
5705  Assert(PG_NARGS() == 2);
5706 
5707  /* If argument is NULL, we treat it as zero-length array */
5708  if (PG_ARGISNULL(1))
5709  nitems = 0;
5710  else
5711  {
5712  /*
5713  * Non-null argument had better be an array. We assume that any
5714  * call context that could let get_fn_expr_variadic return true
5715  * will have checked that a VARIADIC-labeled parameter actually is
5716  * an array. So it should be okay to just Assert that it's an
5717  * array rather than doing a full-fledged error check.
5718  */
5720 
5721  /* OK, safe to fetch the array value */
5722  arr = PG_GETARG_ARRAYTYPE_P(1);
5723 
5724  /* Get info about array element type */
5725  element_type = ARR_ELEMTYPE(arr);
5726  get_typlenbyvalalign(element_type,
5727  &elmlen, &elmbyval, &elmalign);
5728 
5729  /* Extract all array elements */
5730  deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5731  &elements, &nulls, &nitems);
5732  }
5733 
 /* +1 so that nargs counts the format string like PG_NARGS() does */
5734  nargs = nitems + 1;
5735  funcvariadic = true;
5736  }
5737  else
5738  {
5739  /* Non-variadic case, we'll process the arguments individually */
5740  nargs = PG_NARGS();
5741  funcvariadic = false;
5742  }
5743 
5744  /* Setup for main loop. */
5745  fmt = PG_GETARG_TEXT_PP(0);
5746  start_ptr = VARDATA_ANY(fmt);
5747  end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5748  initStringInfo(&str);
5749  arg = 1; /* next argument position to print */
5750 
5751  /* Scan format string, looking for conversion specifiers. */
5752  for (cp = start_ptr; cp < end_ptr; cp++)
5753  {
5754  int argpos;
5755  int widthpos;
5756  int flags;
5757  int width;
5758  Datum value;
5759  bool isNull;
5760  Oid typid;
5761 
5762  /*
5763  * If it's not the start of a conversion specifier, just copy it to
5764  * the output buffer.
5765  */
5766  if (*cp != '%')
5767  {
5769  continue;
5770  }
5771 
 /* step past the '%'; errors out if the format string ends here */
5772  ADVANCE_PARSE_POINTER(cp, end_ptr);
5773 
5774  /* Easy case: %% outputs a single % */
5775  if (*cp == '%')
5776  {
5778  continue;
5779  }
5780 
5781  /* Parse the optional portions of the format specifier */
5782  cp = text_format_parse_format(cp, end_ptr,
5783  &argpos, &widthpos,
5784  &flags, &width);
5785 
5786  /*
5787  * Next we should see the main conversion specifier. Whether or not
5788  * an argument position was present, it's known that at least one
5789  * character remains in the string at this point. Experience suggests
5790  * that it's worth checking that that character is one of the expected
5791  * ones before we try to fetch arguments, so as to produce the least
5792  * confusing response to a mis-formatted specifier.
5793  */
5794  if (strchr("sIL", *cp) == NULL)
5795  ereport(ERROR,
5796  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5797  errmsg("unrecognized format() type specifier \"%.*s\"",
5798  pg_mblen(cp), cp),
5799  errhint("For a single \"%%\" use \"%%%%\".")));
5800 
5801  /* If indirect width was specified, get its value */
5802  if (widthpos >= 0)
5803  {
5804  /* Collect the specified or next argument position */
 /* widthpos == 0 leaves arg alone, i.e. the next argument in sequence */
5805  if (widthpos > 0)
5806  arg = widthpos;
5807  if (arg >= nargs)
5808  ereport(ERROR,
5809  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5810  errmsg("too few arguments for format()")));
5811 
5812  /* Get the value and type of the selected argument */
5813  if (!funcvariadic)
5814  {
5816  isNull = PG_ARGISNULL(arg);
5817  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5818  }
5819  else
5820  {
 /* array elements are 0-based while arg is 1-based */
5821  value = elements[arg - 1];
5822  isNull = nulls[arg - 1];
5823  typid = element_type;
5824  }
5825  if (!OidIsValid(typid))
5826  elog(ERROR, "could not determine data type of format() input");
5827 
5828  arg++;
5829 
5830  /* We can treat NULL width the same as zero */
5831  if (isNull)
5832  width = 0;
5833  else if (typid == INT4OID)
5834  width = DatumGetInt32(value);
5835  else if (typid == INT2OID)
5836  width = DatumGetInt16(value);
5837  else
5838  {
5839  /* For less-usual datatypes, convert to text then to int */
 /* note: this local "str" shadows the function-level output buffer */
5840  char *str;
5841 
 /* reuse the previous lookup when the width type has not changed */
5842  if (typid != prev_width_type)
5843  {
5844  Oid typoutputfunc;
5845  bool typIsVarlena;
5846 
5847  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5848  fmgr_info(typoutputfunc, &typoutputinfo_width);
5849  prev_width_type = typid;
5850  }
5851 
5852  str = OutputFunctionCall(&typoutputinfo_width, value);
5853 
5854  /* pg_strtoint32 will complain about bad data or overflow */
5855  width = pg_strtoint32(str);
5856 
5857  pfree(str);
5858  }
5859  }
5860 
5861  /* Collect the specified or next argument position */
5862  if (argpos > 0)
5863  arg = argpos;
5864  if (arg >= nargs)
5865  ereport(ERROR,
5866  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5867  errmsg("too few arguments for format()")));
5868 
5869  /* Get the value and type of the selected argument */
5870  if (!funcvariadic)
5871  {
5873  isNull = PG_ARGISNULL(arg);
5874  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5875  }
5876  else
5877  {
 /* array elements are 0-based while arg is 1-based */
5878  value = elements[arg - 1];
5879  isNull = nulls[arg - 1];
5880  typid = element_type;
5881  }
5882  if (!OidIsValid(typid))
5883  elog(ERROR, "could not determine data type of format() input");
5884 
5885  arg++;
5886 
5887  /*
5888  * Get the appropriate typOutput function, reusing previous one if
5889  * same type as previous argument. That's particularly useful in the
5890  * variadic-array case, but often saves work even for ordinary calls.
5891  */
5892  if (typid != prev_type)
5893  {
5894  Oid typoutputfunc;
5895  bool typIsVarlena;
5896 
5897  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5898  fmgr_info(typoutputfunc, &typoutputfinfo);
5899  prev_type = typid;
5900  }
5901 
5902  /*
5903  * And now we can format the value.
5904  */
5905  switch (*cp)
5906  {
5907  case 's':
5908  case 'I':
5909  case 'L':
5910  text_format_string_conversion(&str, *cp, &typoutputfinfo,
5911  value, isNull,
5912  flags, width);
5913  break;
5914  default:
5915  /* should not get here, because of previous check */
5916  ereport(ERROR,
5917  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5918  errmsg("unrecognized format() type specifier \"%.*s\"",
5919  pg_mblen(cp), cp),
5920  errhint("For a single \"%%\" use \"%%%%\".")));
5921  break;
5922  }
5923  }
5924 
5925  /* Don't need deconstruct_array results anymore. */
5926  if (elements != NULL)
5927  pfree(elements);
5928  if (nulls != NULL)
5929  pfree(nulls);
5930 
5931  /* Generate results. */
5932  result = cstring_to_text_with_len(str.data, str.len);
5933  pfree(str.data);
5934 
5935  PG_RETURN_TEXT_P(result);
5936 }
5937 
5938 /*
5939  * Parse contiguous digits as a decimal number.
5940  *
5941  * Returns true if some digits could be parsed.
5942  * The value is returned into *value, and *ptr is advanced to the next
5943  * character to be parsed.
5944  *
5945  * Note parsing invariant: at least one character is known available before
5946  * string end (end_ptr) at entry, and this is still true at exit.
5947  */
5948 static bool
5949 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5950 {
5951  bool found = false;
5952  const char *cp = *ptr;
5953  int val = 0;
5954 
5955  while (*cp >= '0' && *cp <= '9')
5956  {
5957  int8 digit = (*cp - '0');
5958 
5959  if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5960  unlikely(pg_add_s32_overflow(val, digit, &val)))
5961  ereport(ERROR,
5962  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5963  errmsg("number is out of range")));
5964  ADVANCE_PARSE_POINTER(cp, end_ptr);
5965  found = true;
5966  }
5967 
5968  *ptr = cp;
5969  *value = val;
5970 
5971  return found;
5972 }
5973 
5974 /*
5975  * Parse a format specifier (generally following the SUS printf spec).
5976  *
5977  * We have already advanced over the initial '%', and we are looking for
5978  * [argpos][flags][width]type (but the type character is not consumed here).
5979  *
5980  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5981  * Output parameters:
5982  * argpos: argument position for value to be printed. -1 means unspecified.
5983  * widthpos: argument position for width. Zero means the argument position
5984  * was unspecified (ie, take the next arg) and -1 means no width
5985  * argument (width was omitted or specified as a constant).
5986  * flags: bitmask of flags.
5987  * width: directly-specified width value. Zero means the width was omitted
5988  * (note it's not necessary to distinguish this case from an explicit
5989  * zero width value).
5990  *
5991  * The function result is the next character position to be parsed, ie, the
5992  * location where the type character is/should be.
5993  *
5994  * Note parsing invariant: at least one character is known available before
5995  * string end (end_ptr) at entry, and this is still true at exit.
5996  */
5997 static const char *
5998 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5999  int *argpos, int *widthpos,
6000  int *flags, int *width)
6001 {
6002  const char *cp = start_ptr;
6003  int n;
6004 
6005  /* set defaults for output parameters */
6006  *argpos = -1;
6007  *widthpos = -1;
6008  *flags = 0;
6009  *width = 0;
6010 
6011  /* try to identify first number */
6012  if (text_format_parse_digits(&cp, end_ptr, &n))
6013  {
6014  if (*cp != '$')
6015  {
6016  /* Must be just a width and a type, so we're done */
6017  *width = n;
6018  return cp;
6019  }
6020  /* The number was argument position */
6021  *argpos = n;
6022  /* Explicit 0 for argument index is immediately refused */
6023  if (n == 0)
6024  ereport(ERROR,
6025  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6026  errmsg("format specifies argument 0, but arguments are numbered from 1")));
6027  ADVANCE_PARSE_POINTER(cp, end_ptr);
6028  }
6029 
6030  /* Handle flags (only minus is supported now) */
6031  while (*cp == '