PostgreSQL Source Code  git master
varlena.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  * Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/detoast.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "common/hashfn.h"
25 #include "common/int.h"
27 #include "common/unicode_norm.h"
28 #include "common/unicode_version.h"
29 #include "funcapi.h"
30 #include "lib/hyperloglog.h"
31 #include "libpq/pqformat.h"
32 #include "miscadmin.h"
33 #include "nodes/execnodes.h"
34 #include "parser/scansup.h"
35 #include "port/pg_bswap.h"
36 #include "regex/regex.h"
37 #include "utils/builtins.h"
38 #include "utils/bytea.h"
39 #include "utils/guc.h"
40 #include "utils/lsyscache.h"
41 #include "utils/memutils.h"
42 #include "utils/pg_locale.h"
43 #include "utils/sortsupport.h"
44 #include "utils/varlena.h"
45 
46 
47 /* GUC variable */
49 
50 typedef struct varlena VarString;
51 
52 /*
53  * State for text_position_* functions.
54  */
55 typedef struct
56 {
57  bool is_multibyte_char_in_char; /* need to check char boundaries? */
58 
59  char *str1; /* haystack string */
60  char *str2; /* needle string */
61  int len1; /* string lengths in bytes */
62  int len2;
63 
64  /* Skip table for Boyer-Moore-Horspool search algorithm: */
65  int skiptablemask; /* mask for ANDing with skiptable subscripts */
66  int skiptable[256]; /* skip distance for given mismatched char */
67 
68  char *last_match; /* pointer to last match in 'str1' */
69 
70  /*
71  * Sometimes we need to convert the byte position of a match to a
72  * character position. These store the last position that was converted,
73  * so that on the next call, we can continue from that point, rather than
74  * count characters from the very beginning.
75  */
76  char *refpoint; /* pointer within original haystack string */
77  int refpos; /* 0-based character offset of the same point */
79 
80 typedef struct
81 {
82  char *buf1; /* 1st string, or abbreviation original string
83  * buf */
84  char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
85  int buflen1; /* Allocated length of buf1 */
86  int buflen2; /* Allocated length of buf2 */
87  int last_len1; /* Length of last buf1 string/strxfrm() input */
88  int last_len2; /* Length of last buf2 string/strxfrm() blob */
89  int last_returned; /* Last comparison result (cache) */
90  bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
91  bool collate_c;
92  Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
93  hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
94  hyperLogLogState full_card; /* Full key cardinality state */
95  double prop_card; /* Required cardinality proportion */
98 
99 /*
100  * Output data for split_text(): we output either to an array or a table.
101  * tupstore and tupdesc must be set up in advance to output to a table.
102  */
103 typedef struct
104 {
109 
110 /*
111  * This should be large enough that most strings will fit, but small enough
112  * that we feel comfortable putting it on the stack
113  */
114 #define TEXTBUFLEN 1024
115 
116 #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
117 #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
118 
119 static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
120 static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
121 static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
122 static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
123 static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
124 static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
125 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
126 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
127 static int32 text_length(Datum str);
128 static text *text_catenate(text *t1, text *t2);
129 static text *text_substring(Datum str,
130  int32 start,
131  int32 length,
132  bool length_not_specified);
133 static text *text_overlay(text *t1, text *t2, int sp, int sl);
134 static int text_position(text *t1, text *t2, Oid collid);
137 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
141 static void check_collation_set(Oid collid);
142 static int text_cmp(text *arg1, text *arg2, Oid collid);
143 static bytea *bytea_catenate(bytea *t1, bytea *t2);
145  int S,
146  int L,
147  bool length_not_specified);
148 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
149 static void appendStringInfoText(StringInfo str, const text *t);
150 static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
151 static void split_text_accum_result(SplitTextOutputData *tstate,
152  text *field_value,
153  text *null_string,
154  Oid collation);
156  const char *fldsep, const char *null_string);
158 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
159  int *value);
160 static const char *text_format_parse_format(const char *start_ptr,
161  const char *end_ptr,
162  int *argpos, int *widthpos,
163  int *flags, int *width);
164 static void text_format_string_conversion(StringInfo buf, char conversion,
165  FmgrInfo *typOutputInfo,
166  Datum value, bool isNull,
167  int flags, int width);
168 static void text_format_append_string(StringInfo buf, const char *str,
169  int flags, int width);
170 
171 
172 /*****************************************************************************
173  * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
174  *****************************************************************************/
175 
176 /*
177  * cstring_to_text
178  *
179  * Create a text value from a null-terminated C string.
180  *
181  * The new text value is freshly palloc'd with a full-size VARHDR.
182  */
183 text *
184 cstring_to_text(const char *s)
185 {
186  return cstring_to_text_with_len(s, strlen(s));
187 }
188 
189 /*
190  * cstring_to_text_with_len
191  *
192  * Same as cstring_to_text except the caller specifies the string length;
193  * the string need not be null_terminated.
194  */
195 text *
196 cstring_to_text_with_len(const char *s, int len)
197 {
198  text *result = (text *) palloc(len + VARHDRSZ);
199 
200  SET_VARSIZE(result, len + VARHDRSZ);
201  memcpy(VARDATA(result), s, len);
202 
203  return result;
204 }
205 
206 /*
207  * text_to_cstring
208  *
209  * Create a palloc'd, null-terminated C string from a text value.
210  *
211  * We support being passed a compressed or toasted text value.
212  * This is a bit bogus since such values shouldn't really be referred to as
213  * "text *", but it seems useful for robustness. If we didn't handle that
214  * case here, we'd need another routine that did, anyway.
215  */
216 char *
218 {
219  /* must cast away the const, unfortunately */
220  text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
221  int len = VARSIZE_ANY_EXHDR(tunpacked);
222  char *result;
223 
224  result = (char *) palloc(len + 1);
225  memcpy(result, VARDATA_ANY(tunpacked), len);
226  result[len] = '\0';
227 
228  if (tunpacked != t)
229  pfree(tunpacked);
230 
231  return result;
232 }
233 
234 /*
235  * text_to_cstring_buffer
236  *
237  * Copy a text value into a caller-supplied buffer of size dst_len.
238  *
239  * The text string is truncated if necessary to fit. The result is
240  * guaranteed null-terminated (unless dst_len == 0).
241  *
242  * We support being passed a compressed or toasted text value.
243  * This is a bit bogus since such values shouldn't really be referred to as
244  * "text *", but it seems useful for robustness. If we didn't handle that
245  * case here, we'd need another routine that did, anyway.
246  */
247 void
248 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
249 {
250  /* must cast away the const, unfortunately */
251  text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
252  size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
253 
254  if (dst_len > 0)
255  {
256  dst_len--;
257  if (dst_len >= src_len)
258  dst_len = src_len;
259  else /* ensure truncation is encoding-safe */
260  dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
261  memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
262  dst[dst_len] = '\0';
263  }
264 
265  if (srcunpacked != src)
266  pfree(srcunpacked);
267 }
268 
269 
270 /*****************************************************************************
271  * USER I/O ROUTINES *
272  *****************************************************************************/
273 
274 
/*
 * VAL maps an ASCII digit character to its numeric value; DIG maps a small
 * numeric value back to the corresponding ASCII digit character.
 */
#define VAL(CH)			((CH) - '0')
#define DIG(N)			((N) + '0')
277 
278 /*
279  * byteain - converts from printable representation of byte array
280  *
281  * Non-printable characters must be passed as '\nnn' (octal) and are
282  * converted to internal form. '\' must be passed as '\\'.
283  * ereport(ERROR, ...) if bad form.
284  *
285  * BUGS:
286  * The input is scanned twice.
287  * The error checking of input is minimal.
288  */
289 Datum
291 {
292  char *inputText = PG_GETARG_CSTRING(0);
293  Node *escontext = fcinfo->context;
294  char *tp;
295  char *rp;
296  int bc;
297  bytea *result;
298 
299  /* Recognize hex input */
300  if (inputText[0] == '\\' && inputText[1] == 'x')
301  {
302  size_t len = strlen(inputText);
303 
304  bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
305  result = palloc(bc);
306  bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
307  escontext);
308  SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
309 
310  PG_RETURN_BYTEA_P(result);
311  }
312 
313  /* Else, it's the traditional escaped style */
314  for (bc = 0, tp = inputText; *tp != '\0'; bc++)
315  {
316  if (tp[0] != '\\')
317  tp++;
318  else if ((tp[0] == '\\') &&
319  (tp[1] >= '0' && tp[1] <= '3') &&
320  (tp[2] >= '0' && tp[2] <= '7') &&
321  (tp[3] >= '0' && tp[3] <= '7'))
322  tp += 4;
323  else if ((tp[0] == '\\') &&
324  (tp[1] == '\\'))
325  tp += 2;
326  else
327  {
328  /*
329  * one backslash, not followed by another or ### valid octal
330  */
331  ereturn(escontext, (Datum) 0,
332  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
333  errmsg("invalid input syntax for type %s", "bytea")));
334  }
335  }
336 
337  bc += VARHDRSZ;
338 
339  result = (bytea *) palloc(bc);
340  SET_VARSIZE(result, bc);
341 
342  tp = inputText;
343  rp = VARDATA(result);
344  while (*tp != '\0')
345  {
346  if (tp[0] != '\\')
347  *rp++ = *tp++;
348  else if ((tp[0] == '\\') &&
349  (tp[1] >= '0' && tp[1] <= '3') &&
350  (tp[2] >= '0' && tp[2] <= '7') &&
351  (tp[3] >= '0' && tp[3] <= '7'))
352  {
353  bc = VAL(tp[1]);
354  bc <<= 3;
355  bc += VAL(tp[2]);
356  bc <<= 3;
357  *rp++ = bc + VAL(tp[3]);
358 
359  tp += 4;
360  }
361  else if ((tp[0] == '\\') &&
362  (tp[1] == '\\'))
363  {
364  *rp++ = '\\';
365  tp += 2;
366  }
367  else
368  {
369  /*
370  * We should never get here. The first pass should not allow it.
371  */
372  ereturn(escontext, (Datum) 0,
373  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
374  errmsg("invalid input syntax for type %s", "bytea")));
375  }
376  }
377 
378  PG_RETURN_BYTEA_P(result);
379 }
380 
381 /*
382  * byteaout - converts to printable representation of byte array
383  *
384  * In the traditional escaped format, non-printable characters are
385  * printed as '\nnn' (octal) and '\' as '\\'.
386  */
387 Datum
389 {
390  bytea *vlena = PG_GETARG_BYTEA_PP(0);
391  char *result;
392  char *rp;
393 
395  {
396  /* Print hex format */
397  rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
398  *rp++ = '\\';
399  *rp++ = 'x';
400  rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
401  }
402  else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
403  {
404  /* Print traditional escaped format */
405  char *vp;
406  uint64 len;
407  int i;
408 
409  len = 1; /* empty string has 1 char */
410  vp = VARDATA_ANY(vlena);
411  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
412  {
413  if (*vp == '\\')
414  len += 2;
415  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
416  len += 4;
417  else
418  len++;
419  }
420 
421  /*
422  * In principle len can't overflow uint32 if the input fit in 1GB, but
423  * for safety let's check rather than relying on palloc's internal
424  * check.
425  */
426  if (len > MaxAllocSize)
427  ereport(ERROR,
428  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
429  errmsg_internal("result of bytea output conversion is too large")));
430  rp = result = (char *) palloc(len);
431 
432  vp = VARDATA_ANY(vlena);
433  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
434  {
435  if (*vp == '\\')
436  {
437  *rp++ = '\\';
438  *rp++ = '\\';
439  }
440  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
441  {
442  int val; /* holds unprintable chars */
443 
444  val = *vp;
445  rp[0] = '\\';
446  rp[3] = DIG(val & 07);
447  val >>= 3;
448  rp[2] = DIG(val & 07);
449  val >>= 3;
450  rp[1] = DIG(val & 03);
451  rp += 4;
452  }
453  else
454  *rp++ = *vp;
455  }
456  }
457  else
458  {
459  elog(ERROR, "unrecognized \"bytea_output\" setting: %d",
460  bytea_output);
461  rp = result = NULL; /* keep compiler quiet */
462  }
463  *rp = '\0';
464  PG_RETURN_CSTRING(result);
465 }
466 
467 /*
468  * bytearecv - converts external binary format to bytea
469  */
470 Datum
472 {
474  bytea *result;
475  int nbytes;
476 
477  nbytes = buf->len - buf->cursor;
478  result = (bytea *) palloc(nbytes + VARHDRSZ);
479  SET_VARSIZE(result, nbytes + VARHDRSZ);
480  pq_copymsgbytes(buf, VARDATA(result), nbytes);
481  PG_RETURN_BYTEA_P(result);
482 }
483 
484 /*
485  * byteasend - converts bytea to binary format
486  *
487  * This is a special case: just copy the input...
488  */
489 Datum
491 {
492  bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
493 
494  PG_RETURN_BYTEA_P(vlena);
495 }
496 
497 Datum
499 {
501 
502  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
503 
504  /* Append the value unless null, preceding it with the delimiter. */
505  if (!PG_ARGISNULL(1))
506  {
508  bool isfirst = false;
509 
510  /*
511  * You might think we can just throw away the first delimiter, however
512  * we must keep it as we may be a parallel worker doing partial
513  * aggregation building a state to send to the main process. We need
514  * to keep the delimiter of every aggregation so that the combine
515  * function can properly join up the strings of two separately
516  * partially aggregated results. The first delimiter is only stripped
517  * off in the final function. To know how much to strip off the front
518  * of the string, we store the length of the first delimiter in the
519  * StringInfo's cursor field, which we don't otherwise need here.
520  */
521  if (state == NULL)
522  {
523  state = makeStringAggState(fcinfo);
524  isfirst = true;
525  }
526 
527  if (!PG_ARGISNULL(2))
528  {
529  bytea *delim = PG_GETARG_BYTEA_PP(2);
530 
532  VARSIZE_ANY_EXHDR(delim));
533  if (isfirst)
534  state->cursor = VARSIZE_ANY_EXHDR(delim);
535  }
536 
539  }
540 
541  /*
542  * The transition type for string_agg() is declared to be "internal",
543  * which is a pass-by-value type the same size as a pointer.
544  */
545  if (state)
547  PG_RETURN_NULL();
548 }
549 
550 Datum
552 {
554 
555  /* cannot be called directly because of internal-type argument */
556  Assert(AggCheckCallContext(fcinfo, NULL));
557 
558  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
559 
560  if (state != NULL)
561  {
562  /* As per comment in transfn, strip data before the cursor position */
563  bytea *result;
564  int strippedlen = state->len - state->cursor;
565 
566  result = (bytea *) palloc(strippedlen + VARHDRSZ);
567  SET_VARSIZE(result, strippedlen + VARHDRSZ);
568  memcpy(VARDATA(result), &state->data[state->cursor], strippedlen);
569  PG_RETURN_BYTEA_P(result);
570  }
571  else
572  PG_RETURN_NULL();
573 }
574 
575 /*
576  * textin - converts cstring to internal representation
577  */
578 Datum
580 {
581  char *inputText = PG_GETARG_CSTRING(0);
582 
583  PG_RETURN_TEXT_P(cstring_to_text(inputText));
584 }
585 
586 /*
587  * textout - converts internal representation to cstring
588  */
589 Datum
591 {
592  Datum txt = PG_GETARG_DATUM(0);
593 
595 }
596 
597 /*
598  * textrecv - converts external binary format to text
599  */
600 Datum
602 {
604  text *result;
605  char *str;
606  int nbytes;
607 
608  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
609 
610  result = cstring_to_text_with_len(str, nbytes);
611  pfree(str);
612  PG_RETURN_TEXT_P(result);
613 }
614 
615 /*
616  * textsend - converts text to binary format
617  */
618 Datum
620 {
621  text *t = PG_GETARG_TEXT_PP(0);
623 
627 }
628 
629 
630 /*
631  * unknownin - converts cstring to internal representation
632  */
633 Datum
635 {
636  char *str = PG_GETARG_CSTRING(0);
637 
638  /* representation is same as cstring */
640 }
641 
642 /*
643  * unknownout - converts internal representation to cstring
644  */
645 Datum
647 {
648  /* representation is same as cstring */
649  char *str = PG_GETARG_CSTRING(0);
650 
652 }
653 
654 /*
655  * unknownrecv - converts external binary format to unknown
656  */
657 Datum
659 {
661  char *str;
662  int nbytes;
663 
664  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
665  /* representation is same as cstring */
667 }
668 
669 /*
670  * unknownsend - converts unknown to binary format
671  */
672 Datum
674 {
675  /* representation is same as cstring */
676  char *str = PG_GETARG_CSTRING(0);
678 
680  pq_sendtext(&buf, str, strlen(str));
682 }
683 
684 
685 /* ========== PUBLIC ROUTINES ========== */
686 
687 /*
688  * textlen -
689  * returns the logical length of a text*
690  * (which is less than the VARSIZE of the text*)
691  */
692 Datum
694 {
696 
697  /* try to avoid decompressing argument */
699 }
700 
701 /*
702  * text_length -
703  * Does the real work for textlen()
704  *
705  * This is broken out so it can be called directly by other string processing
706  * functions. Note that the argument is passed as a Datum, to indicate that
707  * it may still be in compressed form. We can avoid decompressing it at all
708  * in some cases.
709  */
710 static int32
712 {
713  /* fastpath when max encoding length is one */
716  else
717  {
718  text *t = DatumGetTextPP(str);
719 
721  VARSIZE_ANY_EXHDR(t)));
722  }
723 }
724 
725 /*
726  * textoctetlen -
727  * returns the physical length of a text*
728  * (which is less than the VARSIZE of the text*)
729  */
730 Datum
732 {
734 
735  /* We need not detoast the input at all */
737 }
738 
739 /*
740  * textcat -
741  * takes two text* and returns a text* that is the concatenation of
742  * the two.
743  *
744  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
745  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
746  * Allocate space for output in all cases.
747  * XXX - thomas 1997-07-10
748  */
749 Datum
751 {
752  text *t1 = PG_GETARG_TEXT_PP(0);
753  text *t2 = PG_GETARG_TEXT_PP(1);
754 
756 }
757 
758 /*
759  * text_catenate
760  * Guts of textcat(), broken out so it can be used by other functions
761  *
762  * Arguments can be in short-header form, but not compressed or out-of-line
763  */
764 static text *
766 {
767  text *result;
768  int len1,
769  len2,
770  len;
771  char *ptr;
772 
773  len1 = VARSIZE_ANY_EXHDR(t1);
774  len2 = VARSIZE_ANY_EXHDR(t2);
775 
776  /* paranoia ... probably should throw error instead? */
777  if (len1 < 0)
778  len1 = 0;
779  if (len2 < 0)
780  len2 = 0;
781 
782  len = len1 + len2 + VARHDRSZ;
783  result = (text *) palloc(len);
784 
785  /* Set size of result string... */
786  SET_VARSIZE(result, len);
787 
788  /* Fill data field of result string... */
789  ptr = VARDATA(result);
790  if (len1 > 0)
791  memcpy(ptr, VARDATA_ANY(t1), len1);
792  if (len2 > 0)
793  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
794 
795  return result;
796 }
797 
798 /*
799  * charlen_to_bytelen()
800  * Compute the number of bytes occupied by n characters starting at *p
801  *
802  * It is caller's responsibility that there actually are n characters;
803  * the string need not be null-terminated.
804  */
805 static int
806 charlen_to_bytelen(const char *p, int n)
807 {
809  {
810  /* Optimization for single-byte encodings */
811  return n;
812  }
813  else
814  {
815  const char *s;
816 
817  for (s = p; n > 0; n--)
818  s += pg_mblen(s);
819 
820  return s - p;
821  }
822 }
823 
824 /*
825  * text_substr()
826  * Return a substring starting at the specified position.
827  * - thomas 1997-12-31
828  *
829  * Input:
830  * - string
831  * - starting position (is one-based)
832  * - string length
833  *
834  * If the starting position is zero or less, then return from the start of the string
835  * adjusting the length to be consistent with the "negative start" per SQL.
836  * If the length is less than zero, return the remaining string.
837  *
838  * Added multibyte support.
839  * - Tatsuo Ishii 1998-4-21
840  * Changed behavior if starting position is less than one to conform to SQL behavior.
841  * Formerly returned the entire string; now returns a portion.
842  * - Thomas Lockhart 1998-12-10
843  * Now uses faster TOAST-slicing interface
844  * - John Gray 2002-02-22
845  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
846  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
847  * error; if E < 1, return '', not entire string). Fixed MB related bug when
848  * S > LC and < LC + 4 sometimes garbage characters are returned.
849  * - Joe Conway 2002-08-10
850  */
851 Datum
853 {
855  PG_GETARG_INT32(1),
856  PG_GETARG_INT32(2),
857  false));
858 }
859 
860 /*
861  * text_substr_no_len -
862  * Wrapper to avoid opr_sanity failure due to
863  * one function accepting a different number of args.
864  */
865 Datum
867 {
869  PG_GETARG_INT32(1),
870  -1, true));
871 }
872 
873 /*
874  * text_substring -
875  * Does the real work for text_substr() and text_substr_no_len()
876  *
877  * This is broken out so it can be called directly by other string processing
878  * functions. Note that the argument is passed as a Datum, to indicate that
879  * it may still be in compressed/toasted form. We can avoid detoasting all
880  * of it in some cases.
881  *
882  * The result is always a freshly palloc'd datum.
883  */
884 static text *
885 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
886 {
888  int32 S = start; /* start position */
889  int32 S1; /* adjusted start position */
890  int32 L1; /* adjusted substring length */
891  int32 E; /* end position */
892 
893  /*
894  * SQL99 says S can be zero or negative (which we don't document), but we
895  * still must fetch from the start of the string.
896  * https://www.postgresql.org/message-id/170905442373.643.11536838320909376197%40wrigleys.postgresql.org
897  */
898  S1 = Max(S, 1);
899 
900  /* life is easy if the encoding max length is 1 */
901  if (eml == 1)
902  {
903  if (length_not_specified) /* special case - get length to end of
904  * string */
905  L1 = -1;
906  else if (length < 0)
907  {
908  /* SQL99 says to throw an error for E < S, i.e., negative length */
909  ereport(ERROR,
910  (errcode(ERRCODE_SUBSTRING_ERROR),
911  errmsg("negative substring length not allowed")));
912  L1 = -1; /* silence stupider compilers */
913  }
914  else if (pg_add_s32_overflow(S, length, &E))
915  {
916  /*
917  * L could be large enough for S + L to overflow, in which case
918  * the substring must run to end of string.
919  */
920  L1 = -1;
921  }
922  else
923  {
924  /*
925  * A zero or negative value for the end position can happen if the
926  * start was negative or one. SQL99 says to return a zero-length
927  * string.
928  */
929  if (E < 1)
930  return cstring_to_text("");
931 
932  L1 = E - S1;
933  }
934 
935  /*
936  * If the start position is past the end of the string, SQL99 says to
937  * return a zero-length string -- DatumGetTextPSlice() will do that
938  * for us. We need only convert S1 to zero-based starting position.
939  */
940  return DatumGetTextPSlice(str, S1 - 1, L1);
941  }
942  else if (eml > 1)
943  {
944  /*
945  * When encoding max length is > 1, we can't get LC without
946  * detoasting, so we'll grab a conservatively large slice now and go
947  * back later to do the right thing
948  */
949  int32 slice_start;
950  int32 slice_size;
951  int32 slice_strlen;
952  text *slice;
953  int32 E1;
954  int32 i;
955  char *p;
956  char *s;
957  text *ret;
958 
959  /*
960  * We need to start at position zero because there is no way to know
961  * in advance which byte offset corresponds to the supplied start
962  * position.
963  */
964  slice_start = 0;
965 
966  if (length_not_specified) /* special case - get length to end of
967  * string */
968  slice_size = L1 = -1;
969  else if (length < 0)
970  {
971  /* SQL99 says to throw an error for E < S, i.e., negative length */
972  ereport(ERROR,
973  (errcode(ERRCODE_SUBSTRING_ERROR),
974  errmsg("negative substring length not allowed")));
975  slice_size = L1 = -1; /* silence stupider compilers */
976  }
977  else if (pg_add_s32_overflow(S, length, &E))
978  {
979  /*
980  * L could be large enough for S + L to overflow, in which case
981  * the substring must run to end of string.
982  */
983  slice_size = L1 = -1;
984  }
985  else
986  {
987  /*
988  * A zero or negative value for the end position can happen if the
989  * start was negative or one. SQL99 says to return a zero-length
990  * string.
991  */
992  if (E < 1)
993  return cstring_to_text("");
994 
995  /*
996  * if E is past the end of the string, the tuple toaster will
997  * truncate the length for us
998  */
999  L1 = E - S1;
1000 
1001  /*
1002  * Total slice size in bytes can't be any longer than the start
1003  * position plus substring length times the encoding max length.
1004  * If that overflows, we can just use -1.
1005  */
1006  if (pg_mul_s32_overflow(E, eml, &slice_size))
1007  slice_size = -1;
1008  }
1009 
1010  /*
1011  * If we're working with an untoasted source, no need to do an extra
1012  * copying step.
1013  */
1016  slice = DatumGetTextPSlice(str, slice_start, slice_size);
1017  else
1018  slice = (text *) DatumGetPointer(str);
1019 
1020  /* see if we got back an empty string */
1021  if (VARSIZE_ANY_EXHDR(slice) == 0)
1022  {
1023  if (slice != (text *) DatumGetPointer(str))
1024  pfree(slice);
1025  return cstring_to_text("");
1026  }
1027 
1028  /* Now we can get the actual length of the slice in MB characters */
1029  slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1030  VARSIZE_ANY_EXHDR(slice));
1031 
1032  /*
1033  * Check that the start position wasn't > slice_strlen. If so, SQL99
1034  * says to return a zero-length string.
1035  */
1036  if (S1 > slice_strlen)
1037  {
1038  if (slice != (text *) DatumGetPointer(str))
1039  pfree(slice);
1040  return cstring_to_text("");
1041  }
1042 
1043  /*
1044  * Adjust L1 and E1 now that we know the slice string length. Again
1045  * remember that S1 is one based, and slice_start is zero based.
1046  */
1047  if (L1 > -1)
1048  E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1049  else
1050  E1 = slice_start + 1 + slice_strlen;
1051 
1052  /*
1053  * Find the start position in the slice; remember S1 is not zero based
1054  */
1055  p = VARDATA_ANY(slice);
1056  for (i = 0; i < S1 - 1; i++)
1057  p += pg_mblen(p);
1058 
1059  /* hang onto a pointer to our start position */
1060  s = p;
1061 
1062  /*
1063  * Count the actual bytes used by the substring of the requested
1064  * length.
1065  */
1066  for (i = S1; i < E1; i++)
1067  p += pg_mblen(p);
1068 
1069  ret = (text *) palloc(VARHDRSZ + (p - s));
1070  SET_VARSIZE(ret, VARHDRSZ + (p - s));
1071  memcpy(VARDATA(ret), s, (p - s));
1072 
1073  if (slice != (text *) DatumGetPointer(str))
1074  pfree(slice);
1075 
1076  return ret;
1077  }
1078  else
1079  elog(ERROR, "invalid backend encoding: encoding max length < 1");
1080 
1081  /* not reached: suppress compiler warning */
1082  return NULL;
1083 }
1084 
1085 /*
1086  * textoverlay
1087  * Replace specified substring of first string with second
1088  *
1089  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1090  * This code is a direct implementation of what the standard says.
1091  */
1092 Datum
1094 {
1095  text *t1 = PG_GETARG_TEXT_PP(0);
1096  text *t2 = PG_GETARG_TEXT_PP(1);
1097  int sp = PG_GETARG_INT32(2); /* substring start position */
1098  int sl = PG_GETARG_INT32(3); /* substring length */
1099 
1100  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1101 }
1102 
1103 Datum
1105 {
1106  text *t1 = PG_GETARG_TEXT_PP(0);
1107  text *t2 = PG_GETARG_TEXT_PP(1);
1108  int sp = PG_GETARG_INT32(2); /* substring start position */
1109  int sl;
1110 
1111  sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1112  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1113 }
1114 
1115 static text *
1116 text_overlay(text *t1, text *t2, int sp, int sl)
1117 {
1118  text *result;
1119  text *s1;
1120  text *s2;
1121  int sp_pl_sl;
1122 
1123  /*
1124  * Check for possible integer-overflow cases. For negative sp, throw a
1125  * "substring length" error because that's what should be expected
1126  * according to the spec's definition of OVERLAY().
1127  */
1128  if (sp <= 0)
1129  ereport(ERROR,
1130  (errcode(ERRCODE_SUBSTRING_ERROR),
1131  errmsg("negative substring length not allowed")));
1132  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1133  ereport(ERROR,
1134  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1135  errmsg("integer out of range")));
1136 
1137  s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1138  s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1139  result = text_catenate(s1, t2);
1140  result = text_catenate(result, s2);
1141 
1142  return result;
1143 }
1144 
1145 /*
1146  * textpos -
1147  * Return the position of the specified substring.
1148  * Implements the SQL POSITION() function.
1149  * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1150  * - thomas 1997-07-27
1151  */
1152 Datum
1154 {
1155  text *str = PG_GETARG_TEXT_PP(0);
1156  text *search_str = PG_GETARG_TEXT_PP(1);
1157 
1159 }
1160 
1161 /*
1162  * text_position -
1163  * Does the real work for textpos()
1164  *
1165  * Inputs:
1166  * t1 - string to be searched
1167  * t2 - pattern to match within t1
1168  * Result:
1169  * Character index of the first matched char, starting from 1,
1170  * or 0 if no match.
1171  *
1172  * This is broken out so it can be called directly by other string processing
1173  * functions.
1174  */
1175 static int
1177 {
1179  int result;
1180 
1181  /* Empty needle always matches at position 1 */
1182  if (VARSIZE_ANY_EXHDR(t2) < 1)
1183  return 1;
1184 
1185  /* Otherwise, can't match if haystack is shorter than needle */
1186  if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1187  return 0;
1188 
1189  text_position_setup(t1, t2, collid, &state);
1190  if (!text_position_next(&state))
1191  result = 0;
1192  else
1195  return result;
1196 }
1197 
1198 
1199 /*
1200  * text_position_setup, text_position_next, text_position_cleanup -
1201  * Component steps of text_position()
1202  *
1203  * These are broken out so that a string can be efficiently searched for
1204  * multiple occurrences of the same pattern. text_position_next may be
1205  * called multiple times, and it advances to the next match on each call.
1206  * text_position_get_match_ptr() and text_position_get_match_pos() return
1207  * a pointer or 1-based character position of the last match, respectively.
1208  *
1209  * The "state" variable is normally just a local variable in the caller.
1210  *
1211  * NOTE: text_position_next skips over the matched portion. For example,
1212  * searching for "xx" in "xxx" returns only one match, not two.
1213  */
1214 
1215 static void
1217 {
1218  int len1 = VARSIZE_ANY_EXHDR(t1);
1219  int len2 = VARSIZE_ANY_EXHDR(t2);
1220  pg_locale_t mylocale = 0;
1221 
1223 
1224  if (!lc_collate_is_c(collid))
1225  mylocale = pg_newlocale_from_collation(collid);
1226 
1227  if (!pg_locale_deterministic(mylocale))
1228  ereport(ERROR,
1229  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1230  errmsg("nondeterministic collations are not supported for substring searches")));
1231 
1232  Assert(len1 > 0);
1233  Assert(len2 > 0);
1234 
1235  /*
1236  * Even with a multi-byte encoding, we perform the search using the raw
1237  * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1238  * because in UTF-8 the byte sequence of one character cannot contain
1239  * another character. For other multi-byte encodings, we do the search
1240  * initially as a simple byte search, ignoring multibyte issues, but
1241  * verify afterwards that the match we found is at a character boundary,
1242  * and continue the search if it was a false match.
1243  */
1245  state->is_multibyte_char_in_char = false;
1246  else if (GetDatabaseEncoding() == PG_UTF8)
1247  state->is_multibyte_char_in_char = false;
1248  else
1249  state->is_multibyte_char_in_char = true;
1250 
1251  state->str1 = VARDATA_ANY(t1);
1252  state->str2 = VARDATA_ANY(t2);
1253  state->len1 = len1;
1254  state->len2 = len2;
1255  state->last_match = NULL;
1256  state->refpoint = state->str1;
1257  state->refpos = 0;
1258 
1259  /*
1260  * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1261  * notes we use the terminology that the "haystack" is the string to be
1262  * searched (t1) and the "needle" is the pattern being sought (t2).
1263  *
1264  * If the needle is empty or bigger than the haystack then there is no
1265  * point in wasting cycles initializing the table. We also choose not to
1266  * use B-M-H for needles of length 1, since the skip table can't possibly
1267  * save anything in that case.
1268  */
1269  if (len1 >= len2 && len2 > 1)
1270  {
1271  int searchlength = len1 - len2;
1272  int skiptablemask;
1273  int last;
1274  int i;
1275  const char *str2 = state->str2;
1276 
1277  /*
1278  * First we must determine how much of the skip table to use. The
1279  * declaration of TextPositionState allows up to 256 elements, but for
1280  * short search problems we don't really want to have to initialize so
1281  * many elements --- it would take too long in comparison to the
1282  * actual search time. So we choose a useful skip table size based on
1283  * the haystack length minus the needle length. The closer the needle
1284  * length is to the haystack length the less useful skipping becomes.
1285  *
1286  * Note: since we use bit-masking to select table elements, the skip
1287  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1288  */
1289  if (searchlength < 16)
1290  skiptablemask = 3;
1291  else if (searchlength < 64)
1292  skiptablemask = 7;
1293  else if (searchlength < 128)
1294  skiptablemask = 15;
1295  else if (searchlength < 512)
1296  skiptablemask = 31;
1297  else if (searchlength < 2048)
1298  skiptablemask = 63;
1299  else if (searchlength < 4096)
1300  skiptablemask = 127;
1301  else
1302  skiptablemask = 255;
1303  state->skiptablemask = skiptablemask;
1304 
1305  /*
1306  * Initialize the skip table. We set all elements to the needle
1307  * length, since this is the correct skip distance for any character
1308  * not found in the needle.
1309  */
1310  for (i = 0; i <= skiptablemask; i++)
1311  state->skiptable[i] = len2;
1312 
1313  /*
1314  * Now examine the needle. For each character except the last one,
1315  * set the corresponding table element to the appropriate skip
1316  * distance. Note that when two characters share the same skip table
1317  * entry, the one later in the needle must determine the skip
1318  * distance.
1319  */
1320  last = len2 - 1;
1321 
1322  for (i = 0; i < last; i++)
1323  state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1324  }
1325 }
1326 
1327 /*
1328  * Advance to the next match, starting from the end of the previous match
1329  * (or the beginning of the string, on first call). Returns true if a match
1330  * is found.
1331  *
1332  * Note that this refuses to match an empty-string needle. Most callers
1333  * will have handled that case specially and we'll never see it here.
1334  */
1335 static bool
1337 {
1338  int needle_len = state->len2;
1339  char *start_ptr;
1340  char *matchptr;
1341 
1342  if (needle_len <= 0)
1343  return false; /* result for empty pattern */
1344 
1345  /* Start from the point right after the previous match. */
1346  if (state->last_match)
1347  start_ptr = state->last_match + needle_len;
1348  else
1349  start_ptr = state->str1;
1350 
1351 retry:
1352  matchptr = text_position_next_internal(start_ptr, state);
1353 
1354  if (!matchptr)
1355  return false;
1356 
1357  /*
1358  * Found a match for the byte sequence. If this is a multibyte encoding,
1359  * where one character's byte sequence can appear inside a longer
1360  * multi-byte character, we need to verify that the match was at a
1361  * character boundary, not in the middle of a multi-byte character.
1362  */
1363  if (state->is_multibyte_char_in_char)
1364  {
1365  /* Walk one character at a time, until we reach the match. */
1366 
1367  /* the search should never move backwards. */
1368  Assert(state->refpoint <= matchptr);
1369 
1370  while (state->refpoint < matchptr)
1371  {
1372  /* step to next character. */
1373  state->refpoint += pg_mblen(state->refpoint);
1374  state->refpos++;
1375 
1376  /*
1377  * If we stepped over the match's start position, then it was a
1378  * false positive, where the byte sequence appeared in the middle
1379  * of a multi-byte character. Skip it, and continue the search at
1380  * the next character boundary.
1381  */
1382  if (state->refpoint > matchptr)
1383  {
1384  start_ptr = state->refpoint;
1385  goto retry;
1386  }
1387  }
1388  }
1389 
1390  state->last_match = matchptr;
1391  return true;
1392 }
1393 
1394 /*
1395  * Subroutine of text_position_next(). This searches for the raw byte
1396  * sequence, ignoring any multi-byte encoding issues. Returns the first
1397  * match starting at 'start_ptr', or NULL if no match is found.
1398  */
1399 static char *
1401 {
1402  int haystack_len = state->len1;
1403  int needle_len = state->len2;
1404  int skiptablemask = state->skiptablemask;
1405  const char *haystack = state->str1;
1406  const char *needle = state->str2;
1407  const char *haystack_end = &haystack[haystack_len];
1408  const char *hptr;
1409 
1410  Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1411 
1412  if (needle_len == 1)
1413  {
1414  /* No point in using B-M-H for a one-character needle */
1415  char nchar = *needle;
1416 
1417  hptr = start_ptr;
1418  while (hptr < haystack_end)
1419  {
1420  if (*hptr == nchar)
1421  return (char *) hptr;
1422  hptr++;
1423  }
1424  }
1425  else
1426  {
1427  const char *needle_last = &needle[needle_len - 1];
1428 
1429  /* Start at startpos plus the length of the needle */
1430  hptr = start_ptr + needle_len - 1;
1431  while (hptr < haystack_end)
1432  {
1433  /* Match the needle scanning *backward* */
1434  const char *nptr;
1435  const char *p;
1436 
1437  nptr = needle_last;
1438  p = hptr;
1439  while (*nptr == *p)
1440  {
1441  /* Matched it all? If so, return 1-based position */
1442  if (nptr == needle)
1443  return (char *) p;
1444  nptr--, p--;
1445  }
1446 
1447  /*
1448  * No match, so use the haystack char at hptr to decide how far to
1449  * advance. If the needle had any occurrence of that character
1450  * (or more precisely, one sharing the same skiptable entry)
1451  * before its last character, then we advance far enough to align
1452  * the last such needle character with that haystack position.
1453  * Otherwise we can advance by the whole needle length.
1454  */
1455  hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1456  }
1457  }
1458 
1459  return 0; /* not found */
1460 }
1461 
1462 /*
1463  * Return a pointer to the current match.
1464  *
1465  * The returned pointer points into the original haystack string.
1466  */
1467 static char *
1469 {
1470  return state->last_match;
1471 }
1472 
1473 /*
1474  * Return the offset of the current match.
1475  *
1476  * The offset is in characters, 1-based.
1477  */
1478 static int
1480 {
1481  /* Convert the byte position to char position. */
1482  state->refpos += pg_mbstrlen_with_len(state->refpoint,
1483  state->last_match - state->refpoint);
1484  state->refpoint = state->last_match;
1485  return state->refpos + 1;
1486 }
1487 
1488 /*
1489  * Reset search state to the initial state installed by text_position_setup.
1490  *
1491  * The next call to text_position_next will search from the beginning
1492  * of the string.
1493  */
1494 static void
1496 {
1497  state->last_match = NULL;
1498  state->refpoint = state->str1;
1499  state->refpos = 0;
1500 }
1501 
1502 static void
1504 {
1505  /* no cleanup needed */
1506 }
1507 
1508 
1509 static void
1511 {
1512  if (!OidIsValid(collid))
1513  {
1514  /*
1515  * This typically means that the parser could not resolve a conflict
1516  * of implicit collations, so report it that way.
1517  */
1518  ereport(ERROR,
1519  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1520  errmsg("could not determine which collation to use for string comparison"),
1521  errhint("Use the COLLATE clause to set the collation explicitly.")));
1522  }
1523 }
1524 
1525 /* varstr_cmp()
1526  * Comparison function for text strings with given lengths.
1527  * Includes locale support, but must copy strings to temporary memory
1528  * to allow null-termination for inputs to strcoll().
1529  * Returns an integer less than, equal to, or greater than zero, indicating
1530  * whether arg1 is less than, equal to, or greater than arg2.
1531  *
1532  * Note: many functions that depend on this are marked leakproof; therefore,
1533  * avoid reporting the actual contents of the input when throwing errors.
1534  * All errors herein should be things that can't happen except on corrupt
1535  * data, anyway; otherwise we will have trouble with indexing strings that
1536  * would cause them.
1537  */
1538 int
1539 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1540 {
1541  int result;
1542 
1544 
1545  /*
1546  * Unfortunately, there is no strncoll(), so in the non-C locale case we
1547  * have to do some memory copying. This turns out to be significantly
1548  * slower, so we optimize the case where LC_COLLATE is C. We also try to
1549  * optimize relatively-short strings by avoiding palloc/pfree overhead.
1550  */
1551  if (lc_collate_is_c(collid))
1552  {
1553  result = memcmp(arg1, arg2, Min(len1, len2));
1554  if ((result == 0) && (len1 != len2))
1555  result = (len1 < len2) ? -1 : 1;
1556  }
1557  else
1558  {
1559  pg_locale_t mylocale;
1560 
1561  mylocale = pg_newlocale_from_collation(collid);
1562 
1563  /*
1564  * memcmp() can't tell us which of two unequal strings sorts first,
1565  * but it's a cheap way to tell if they're equal. Testing shows that
1566  * memcmp() followed by strcoll() is only trivially slower than
1567  * strcoll() by itself, so we don't lose much if this doesn't work out
1568  * very often, and if it does - for example, because there are many
1569  * equal strings in the input - then we win big by avoiding expensive
1570  * collation-aware comparisons.
1571  */
1572  if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1573  return 0;
1574 
1575  result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
1576 
1577  /* Break tie if necessary. */
1578  if (result == 0 && pg_locale_deterministic(mylocale))
1579  {
1580  result = memcmp(arg1, arg2, Min(len1, len2));
1581  if ((result == 0) && (len1 != len2))
1582  result = (len1 < len2) ? -1 : 1;
1583  }
1584  }
1585 
1586  return result;
1587 }
1588 
1589 /* text_cmp()
1590  * Internal comparison function for text strings.
1591  * Returns -1, 0 or 1
1592  */
1593 static int
1594 text_cmp(text *arg1, text *arg2, Oid collid)
1595 {
1596  char *a1p,
1597  *a2p;
1598  int len1,
1599  len2;
1600 
1601  a1p = VARDATA_ANY(arg1);
1602  a2p = VARDATA_ANY(arg2);
1603 
1604  len1 = VARSIZE_ANY_EXHDR(arg1);
1605  len2 = VARSIZE_ANY_EXHDR(arg2);
1606 
1607  return varstr_cmp(a1p, len1, a2p, len2, collid);
1608 }
1609 
1610 /*
1611  * Comparison functions for text strings.
1612  *
1613  * Note: btree indexes need these routines not to leak memory; therefore,
1614  * be careful to free working copies of toasted datums. Most places don't
1615  * need to be so careful.
1616  */
1617 
1618 Datum
1620 {
1622  bool locale_is_c = false;
1623  pg_locale_t mylocale = 0;
1624  bool result;
1625 
1627 
1628  if (lc_collate_is_c(collid))
1629  locale_is_c = true;
1630  else
1631  mylocale = pg_newlocale_from_collation(collid);
1632 
1633  if (locale_is_c || pg_locale_deterministic(mylocale))
1634  {
1635  Datum arg1 = PG_GETARG_DATUM(0);
1636  Datum arg2 = PG_GETARG_DATUM(1);
1637  Size len1,
1638  len2;
1639 
1640  /*
1641  * Since we only care about equality or not-equality, we can avoid all
1642  * the expense of strcoll() here, and just do bitwise comparison. In
1643  * fact, we don't even have to do a bitwise comparison if we can show
1644  * the lengths of the strings are unequal; which might save us from
1645  * having to detoast one or both values.
1646  */
1647  len1 = toast_raw_datum_size(arg1);
1648  len2 = toast_raw_datum_size(arg2);
1649  if (len1 != len2)
1650  result = false;
1651  else
1652  {
1653  text *targ1 = DatumGetTextPP(arg1);
1654  text *targ2 = DatumGetTextPP(arg2);
1655 
1656  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1657  len1 - VARHDRSZ) == 0);
1658 
1659  PG_FREE_IF_COPY(targ1, 0);
1660  PG_FREE_IF_COPY(targ2, 1);
1661  }
1662  }
1663  else
1664  {
1665  text *arg1 = PG_GETARG_TEXT_PP(0);
1666  text *arg2 = PG_GETARG_TEXT_PP(1);
1667 
1668  result = (text_cmp(arg1, arg2, collid) == 0);
1669 
1670  PG_FREE_IF_COPY(arg1, 0);
1671  PG_FREE_IF_COPY(arg2, 1);
1672  }
1673 
1674  PG_RETURN_BOOL(result);
1675 }
1676 
1677 Datum
1679 {
1681  bool locale_is_c = false;
1682  pg_locale_t mylocale = 0;
1683  bool result;
1684 
1686 
1687  if (lc_collate_is_c(collid))
1688  locale_is_c = true;
1689  else
1690  mylocale = pg_newlocale_from_collation(collid);
1691 
1692  if (locale_is_c || pg_locale_deterministic(mylocale))
1693  {
1694  Datum arg1 = PG_GETARG_DATUM(0);
1695  Datum arg2 = PG_GETARG_DATUM(1);
1696  Size len1,
1697  len2;
1698 
1699  /* See comment in texteq() */
1700  len1 = toast_raw_datum_size(arg1);
1701  len2 = toast_raw_datum_size(arg2);
1702  if (len1 != len2)
1703  result = true;
1704  else
1705  {
1706  text *targ1 = DatumGetTextPP(arg1);
1707  text *targ2 = DatumGetTextPP(arg2);
1708 
1709  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1710  len1 - VARHDRSZ) != 0);
1711 
1712  PG_FREE_IF_COPY(targ1, 0);
1713  PG_FREE_IF_COPY(targ2, 1);
1714  }
1715  }
1716  else
1717  {
1718  text *arg1 = PG_GETARG_TEXT_PP(0);
1719  text *arg2 = PG_GETARG_TEXT_PP(1);
1720 
1721  result = (text_cmp(arg1, arg2, collid) != 0);
1722 
1723  PG_FREE_IF_COPY(arg1, 0);
1724  PG_FREE_IF_COPY(arg2, 1);
1725  }
1726 
1727  PG_RETURN_BOOL(result);
1728 }
1729 
1730 Datum
1732 {
1733  text *arg1 = PG_GETARG_TEXT_PP(0);
1734  text *arg2 = PG_GETARG_TEXT_PP(1);
1735  bool result;
1736 
1737  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1738 
1739  PG_FREE_IF_COPY(arg1, 0);
1740  PG_FREE_IF_COPY(arg2, 1);
1741 
1742  PG_RETURN_BOOL(result);
1743 }
1744 
1745 Datum
1747 {
1748  text *arg1 = PG_GETARG_TEXT_PP(0);
1749  text *arg2 = PG_GETARG_TEXT_PP(1);
1750  bool result;
1751 
1752  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1753 
1754  PG_FREE_IF_COPY(arg1, 0);
1755  PG_FREE_IF_COPY(arg2, 1);
1756 
1757  PG_RETURN_BOOL(result);
1758 }
1759 
1760 Datum
1762 {
1763  text *arg1 = PG_GETARG_TEXT_PP(0);
1764  text *arg2 = PG_GETARG_TEXT_PP(1);
1765  bool result;
1766 
1767  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1768 
1769  PG_FREE_IF_COPY(arg1, 0);
1770  PG_FREE_IF_COPY(arg2, 1);
1771 
1772  PG_RETURN_BOOL(result);
1773 }
1774 
1775 Datum
1777 {
1778  text *arg1 = PG_GETARG_TEXT_PP(0);
1779  text *arg2 = PG_GETARG_TEXT_PP(1);
1780  bool result;
1781 
1782  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1783 
1784  PG_FREE_IF_COPY(arg1, 0);
1785  PG_FREE_IF_COPY(arg2, 1);
1786 
1787  PG_RETURN_BOOL(result);
1788 }
1789 
1790 Datum
1792 {
1793  Datum arg1 = PG_GETARG_DATUM(0);
1794  Datum arg2 = PG_GETARG_DATUM(1);
1796  pg_locale_t mylocale = 0;
1797  bool result;
1798  Size len1,
1799  len2;
1800 
1802 
1803  if (!lc_collate_is_c(collid))
1804  mylocale = pg_newlocale_from_collation(collid);
1805 
1806  if (!pg_locale_deterministic(mylocale))
1807  ereport(ERROR,
1808  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1809  errmsg("nondeterministic collations are not supported for substring searches")));
1810 
1811  len1 = toast_raw_datum_size(arg1);
1812  len2 = toast_raw_datum_size(arg2);
1813  if (len2 > len1)
1814  result = false;
1815  else
1816  {
1817  text *targ1 = text_substring(arg1, 1, len2, false);
1818  text *targ2 = DatumGetTextPP(arg2);
1819 
1820  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1821  VARSIZE_ANY_EXHDR(targ2)) == 0);
1822 
1823  PG_FREE_IF_COPY(targ1, 0);
1824  PG_FREE_IF_COPY(targ2, 1);
1825  }
1826 
1827  PG_RETURN_BOOL(result);
1828 }
1829 
1830 Datum
1832 {
1833  text *arg1 = PG_GETARG_TEXT_PP(0);
1834  text *arg2 = PG_GETARG_TEXT_PP(1);
1835  int32 result;
1836 
1837  result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1838 
1839  PG_FREE_IF_COPY(arg1, 0);
1840  PG_FREE_IF_COPY(arg2, 1);
1841 
1842  PG_RETURN_INT32(result);
1843 }
1844 
1845 Datum
1847 {
1849  Oid collid = ssup->ssup_collation;
1850  MemoryContext oldcontext;
1851 
1852  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1853 
1854  /* Use generic string SortSupport */
1855  varstr_sortsupport(ssup, TEXTOID, collid);
1856 
1857  MemoryContextSwitchTo(oldcontext);
1858 
1859  PG_RETURN_VOID();
1860 }
1861 
1862 /*
1863  * Generic sortsupport interface for character type's operator classes.
1864  * Includes locale support, and support for BpChar semantics (i.e. removing
1865  * trailing spaces before comparison).
1866  *
1867  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1868  * same representation. Callers that always use the C collation (e.g.
1869  * non-collatable type callers like bytea) may have NUL bytes in their strings;
1870  * this will not work with any other collation, though.
1871  */
1872 void
1874 {
1875  bool abbreviate = ssup->abbreviate;
1876  bool collate_c = false;
1877  VarStringSortSupport *sss;
1878  pg_locale_t locale = 0;
1879 
1881 
1882  /*
1883  * If possible, set ssup->comparator to a function which can be used to
1884  * directly compare two datums. If we can do this, we'll avoid the
1885  * overhead of a trip through the fmgr layer for every comparison, which
1886  * can be substantial.
1887  *
1888  * Most typically, we'll set the comparator to varlenafastcmp_locale,
1889  * which uses strcoll() to perform comparisons. We use that for the
1890  * BpChar case too, but type NAME uses namefastcmp_locale. However, if
1891  * LC_COLLATE = C, we can make things quite a bit faster with
1892  * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
1893  * memcmp() rather than strcoll().
1894  */
1895  if (lc_collate_is_c(collid))
1896  {
1897  if (typid == BPCHAROID)
1898  ssup->comparator = bpcharfastcmp_c;
1899  else if (typid == NAMEOID)
1900  {
1901  ssup->comparator = namefastcmp_c;
1902  /* Not supporting abbreviation with type NAME, for now */
1903  abbreviate = false;
1904  }
1905  else
1906  ssup->comparator = varstrfastcmp_c;
1907 
1908  collate_c = true;
1909  }
1910  else
1911  {
1912  /*
1913  * We need a collation-sensitive comparison. To make things faster,
1914  * we'll figure out the collation based on the locale id and cache the
1915  * result.
1916  */
1918 
1919  /*
1920  * We use varlenafastcmp_locale except for type NAME.
1921  */
1922  if (typid == NAMEOID)
1923  {
1925  /* Not supporting abbreviation with type NAME, for now */
1926  abbreviate = false;
1927  }
1928  else
1930  }
1931 
1932  /*
1933  * Unfortunately, it seems that abbreviation for non-C collations is
1934  * broken on many common platforms; see pg_strxfrm_enabled().
1935  *
1936  * Even apart from the risk of broken locales, it's possible that there
1937  * are platforms where the use of abbreviated keys should be disabled at
1938  * compile time. Having only 4 byte datums could make worst-case
1939  * performance drastically more likely, for example. Moreover, macOS's
1940  * strxfrm() implementation is known to not effectively concentrate a
1941  * significant amount of entropy from the original string in earlier
1942  * transformed blobs. It's possible that other supported platforms are
1943  * similarly encumbered. So, if we ever get past disabling this
1944  * categorically, we may still want or need to disable it for particular
1945  * platforms.
1946  */
1947  if (!collate_c && !pg_strxfrm_enabled(locale))
1948  abbreviate = false;
1949 
1950  /*
1951  * If we're using abbreviated keys, or if we're using a locale-aware
1952  * comparison, we need to initialize a VarStringSortSupport object. Both
1953  * cases will make use of the temporary buffers we initialize here for
1954  * scratch space (and to detect requirement for BpChar semantics from
1955  * caller), and the abbreviation case requires additional state.
1956  */
1957  if (abbreviate || !collate_c)
1958  {
1959  sss = palloc(sizeof(VarStringSortSupport));
1960  sss->buf1 = palloc(TEXTBUFLEN);
1961  sss->buflen1 = TEXTBUFLEN;
1962  sss->buf2 = palloc(TEXTBUFLEN);
1963  sss->buflen2 = TEXTBUFLEN;
1964  /* Start with invalid values */
1965  sss->last_len1 = -1;
1966  sss->last_len2 = -1;
1967  /* Initialize */
1968  sss->last_returned = 0;
1969  sss->locale = locale;
1970 
1971  /*
1972  * To avoid somehow confusing a strxfrm() blob and an original string,
1973  * constantly keep track of the variety of data that buf1 and buf2
1974  * currently contain.
1975  *
1976  * Comparisons may be interleaved with conversion calls. Frequently,
1977  * conversions and comparisons are batched into two distinct phases,
1978  * but the correctness of caching cannot hinge upon this. For
1979  * comparison caching, buffer state is only trusted if cache_blob is
1980  * found set to false, whereas strxfrm() caching only trusts the state
1981  * when cache_blob is found set to true.
1982  *
1983  * Arbitrarily initialize cache_blob to true.
1984  */
1985  sss->cache_blob = true;
1986  sss->collate_c = collate_c;
1987  sss->typid = typid;
1988  ssup->ssup_extra = sss;
1989 
1990  /*
1991  * If possible, plan to use the abbreviated keys optimization. The
1992  * core code may switch back to authoritative comparator should
1993  * abbreviation be aborted.
1994  */
1995  if (abbreviate)
1996  {
1997  sss->prop_card = 0.20;
1998  initHyperLogLog(&sss->abbr_card, 10);
1999  initHyperLogLog(&sss->full_card, 10);
2000  ssup->abbrev_full_comparator = ssup->comparator;
2004  }
2005  }
2006 }
2007 
2008 /*
2009  * sortsupport comparison func (for C locale case)
2010  */
2011 static int
2013 {
2014  VarString *arg1 = DatumGetVarStringPP(x);
2015  VarString *arg2 = DatumGetVarStringPP(y);
2016  char *a1p,
2017  *a2p;
2018  int len1,
2019  len2,
2020  result;
2021 
2022  a1p = VARDATA_ANY(arg1);
2023  a2p = VARDATA_ANY(arg2);
2024 
2025  len1 = VARSIZE_ANY_EXHDR(arg1);
2026  len2 = VARSIZE_ANY_EXHDR(arg2);
2027 
2028  result = memcmp(a1p, a2p, Min(len1, len2));
2029  if ((result == 0) && (len1 != len2))
2030  result = (len1 < len2) ? -1 : 1;
2031 
2032  /* We can't afford to leak memory here. */
2033  if (PointerGetDatum(arg1) != x)
2034  pfree(arg1);
2035  if (PointerGetDatum(arg2) != y)
2036  pfree(arg2);
2037 
2038  return result;
2039 }
2040 
2041 /*
2042  * sortsupport comparison func (for BpChar C locale case)
2043  *
2044  * BpChar outsources its sortsupport to this module. Specialization for the
2045  * varstr_sortsupport BpChar case, modeled on
2046  * internal_bpchar_pattern_compare().
2047  */
2048 static int
2050 {
2051  BpChar *arg1 = DatumGetBpCharPP(x);
2052  BpChar *arg2 = DatumGetBpCharPP(y);
2053  char *a1p,
2054  *a2p;
2055  int len1,
2056  len2,
2057  result;
2058 
2059  a1p = VARDATA_ANY(arg1);
2060  a2p = VARDATA_ANY(arg2);
2061 
2062  len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2063  len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2064 
2065  result = memcmp(a1p, a2p, Min(len1, len2));
2066  if ((result == 0) && (len1 != len2))
2067  result = (len1 < len2) ? -1 : 1;
2068 
2069  /* We can't afford to leak memory here. */
2070  if (PointerGetDatum(arg1) != x)
2071  pfree(arg1);
2072  if (PointerGetDatum(arg2) != y)
2073  pfree(arg2);
2074 
2075  return result;
2076 }
2077 
2078 /*
2079  * sortsupport comparison func (for NAME C locale case)
2080  */
2081 static int
2083 {
2084  Name arg1 = DatumGetName(x);
2085  Name arg2 = DatumGetName(y);
2086 
2087  return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2088 }
2089 
2090 /*
2091  * sortsupport comparison func (for locale case with all varlena types)
2092  */
2093 static int
2095 {
2096  VarString *arg1 = DatumGetVarStringPP(x);
2097  VarString *arg2 = DatumGetVarStringPP(y);
2098  char *a1p,
2099  *a2p;
2100  int len1,
2101  len2,
2102  result;
2103 
2104  a1p = VARDATA_ANY(arg1);
2105  a2p = VARDATA_ANY(arg2);
2106 
2107  len1 = VARSIZE_ANY_EXHDR(arg1);
2108  len2 = VARSIZE_ANY_EXHDR(arg2);
2109 
2110  result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2111 
2112  /* We can't afford to leak memory here. */
2113  if (PointerGetDatum(arg1) != x)
2114  pfree(arg1);
2115  if (PointerGetDatum(arg2) != y)
2116  pfree(arg2);
2117 
2118  return result;
2119 }
2120 
2121 /*
2122  * sortsupport comparison func (for locale case with NAME type)
2123  */
2124 static int
2126 {
2127  Name arg1 = DatumGetName(x);
2128  Name arg2 = DatumGetName(y);
2129 
2130  return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2131  NameStr(*arg2), strlen(NameStr(*arg2)),
2132  ssup);
2133 }
2134 
2135 /*
2136  * sortsupport comparison func for locale cases
2137  */
2138 static int
2139 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2140 {
2142  int result;
2143  bool arg1_match;
2144 
2145  /* Fast pre-check for equality, as discussed in varstr_cmp() */
2146  if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2147  {
2148  /*
2149  * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2150  * last_len2. Existing contents of buffers might still be used by
2151  * next call.
2152  *
2153  * It's fine to allow the comparison of BpChar padding bytes here,
2154  * even though that implies that the memcmp() will usually be
2155  * performed for BpChar callers (though multibyte characters could
2156  * still prevent that from occurring). The memcmp() is still very
2157  * cheap, and BpChar's funny semantics have us remove trailing spaces
2158  * (not limited to padding), so we need make no distinction between
2159  * padding space characters and "real" space characters.
2160  */
2161  return 0;
2162  }
2163 
2164  if (sss->typid == BPCHAROID)
2165  {
2166  /* Get true number of bytes, ignoring trailing spaces */
2167  len1 = bpchartruelen(a1p, len1);
2168  len2 = bpchartruelen(a2p, len2);
2169  }
2170 
2171  if (len1 >= sss->buflen1)
2172  {
2173  sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2174  sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2175  }
2176  if (len2 >= sss->buflen2)
2177  {
2178  sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2179  sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2180  }
2181 
2182  /*
2183  * We're likely to be asked to compare the same strings repeatedly, and
2184  * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2185  * comparisons, even though in general there is no reason to think that
2186  * that will work out (every string datum may be unique). Caching does
2187  * not slow things down measurably when it doesn't work out, and can speed
2188  * things up by rather a lot when it does. In part, this is because the
2189  * memcmp() compares data from cachelines that are needed in L1 cache even
2190  * when the last comparison's result cannot be reused.
2191  */
2192  arg1_match = true;
2193  if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2194  {
2195  arg1_match = false;
2196  memcpy(sss->buf1, a1p, len1);
2197  sss->buf1[len1] = '\0';
2198  sss->last_len1 = len1;
2199  }
2200 
2201  /*
2202  * If we're comparing the same two strings as last time, we can return the
2203  * same answer without calling strcoll() again. This is more likely than
2204  * it seems (at least with moderate to low cardinality sets), because
2205  * quicksort compares the same pivot against many values.
2206  */
2207  if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2208  {
2209  memcpy(sss->buf2, a2p, len2);
2210  sss->buf2[len2] = '\0';
2211  sss->last_len2 = len2;
2212  }
2213  else if (arg1_match && !sss->cache_blob)
2214  {
2215  /* Use result cached following last actual strcoll() call */
2216  return sss->last_returned;
2217  }
2218 
2219  result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
2220 
2221  /* Break tie if necessary. */
2222  if (result == 0 && pg_locale_deterministic(sss->locale))
2223  result = strcmp(sss->buf1, sss->buf2);
2224 
2225  /* Cache result, perhaps saving an expensive strcoll() call next time */
2226  sss->cache_blob = false;
2227  sss->last_returned = result;
2228  return result;
2229 }
2230 
2231 /*
2232  * Conversion routine for sortsupport. Converts original to abbreviated key
2233  * representation. Our encoding strategy is simple -- pack the first 8 bytes
2234  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2235  * stored in reverse order), and treat it as an unsigned integer. When the "C"
2236  * locale is used, or in case of bytea, just memcpy() from original instead.
2237  */
2238 static Datum
2240 {
2241  const size_t max_prefix_bytes = sizeof(Datum);
2243  VarString *authoritative = DatumGetVarStringPP(original);
2244  char *authoritative_data = VARDATA_ANY(authoritative);
2245 
2246  /* working state */
2247  Datum res;
2248  char *pres;
2249  int len;
2250  uint32 hash;
2251 
2252  pres = (char *) &res;
2253  /* memset(), so any non-overwritten bytes are NUL */
2254  memset(pres, 0, max_prefix_bytes);
2255  len = VARSIZE_ANY_EXHDR(authoritative);
2256 
2257  /* Get number of bytes, ignoring trailing spaces */
2258  if (sss->typid == BPCHAROID)
2259  len = bpchartruelen(authoritative_data, len);
2260 
2261  /*
2262  * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2263  * abbreviate keys. The full comparator for the C locale is always
2264  * memcmp(). It would be incorrect to allow bytea callers (callers that
2265  * always force the C collation -- bytea isn't a collatable type, but this
2266  * approach is convenient) to use strxfrm(). This is because bytea
2267  * strings may contain NUL bytes. Besides, this should be faster, too.
2268  *
2269  * More generally, it's okay that bytea callers can have NUL bytes in
2270  * strings because abbreviated cmp need not make a distinction between
2271  * terminating NUL bytes, and NUL bytes representing actual NULs in the
2272  * authoritative representation. Hopefully a comparison at or past one
2273  * abbreviated key's terminating NUL byte will resolve the comparison
2274  * without consulting the authoritative representation; specifically, some
2275  * later non-NUL byte in the longer string can resolve the comparison
2276  * against a subsequent terminating NUL in the shorter string. There will
2277  * usually be what is effectively a "length-wise" resolution there and
2278  * then.
2279  *
2280  * If that doesn't work out -- if all bytes in the longer string
2281  * positioned at or past the offset of the smaller string's (first)
2282  * terminating NUL are actually representative of NUL bytes in the
2283  * authoritative binary string (perhaps with some *terminating* NUL bytes
2284  * towards the end of the longer string iff it happens to still be small)
2285  * -- then an authoritative tie-breaker will happen, and do the right
2286  * thing: explicitly consider string length.
2287  */
2288  if (sss->collate_c)
2289  memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
2290  else
2291  {
2292  Size bsize;
2293 
2294  /*
2295  * We're not using the C collation, so fall back on strxfrm or ICU
2296  * analogs.
2297  */
2298 
2299  /* By convention, we use buffer 1 to store and NUL-terminate */
2300  if (len >= sss->buflen1)
2301  {
2302  sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2303  sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2304  }
2305 
2306  /* Might be able to reuse strxfrm() blob from last call */
2307  if (sss->last_len1 == len && sss->cache_blob &&
2308  memcmp(sss->buf1, authoritative_data, len) == 0)
2309  {
2310  memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
2311  /* No change affecting cardinality, so no hashing required */
2312  goto done;
2313  }
2314 
2315  memcpy(sss->buf1, authoritative_data, len);
2316 
2317  /*
2318  * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated strings.
2319  */
2320  sss->buf1[len] = '\0';
2321  sss->last_len1 = len;
2322 
2324  {
2325  if (sss->buflen2 < max_prefix_bytes)
2326  {
2327  sss->buflen2 = Max(max_prefix_bytes,
2328  Min(sss->buflen2 * 2, MaxAllocSize));
2329  sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2330  }
2331 
2332  bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
2333  max_prefix_bytes, sss->locale);
2334  sss->last_len2 = bsize;
2335  }
2336  else
2337  {
2338  /*
2339  * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
2340  * again. The pg_strxfrm() function leaves the result buffer
2341  * content undefined if the result did not fit, so we need to
2342  * retry until everything fits, even though we only need the first
2343  * few bytes in the end.
2344  */
2345  for (;;)
2346  {
2347  bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
2348  sss->locale);
2349 
2350  sss->last_len2 = bsize;
2351  if (bsize < sss->buflen2)
2352  break;
2353 
2354  /*
2355  * Grow buffer and retry.
2356  */
2357  sss->buflen2 = Max(bsize + 1,
2358  Min(sss->buflen2 * 2, MaxAllocSize));
2359  sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2360  }
2361  }
2362 
2363  /*
2364  * Every Datum byte is always compared. This is safe because the
2365  * strxfrm() blob is itself NUL terminated, leaving no danger of
2366  * misinterpreting any NUL bytes not intended to be interpreted as
2367  * logically representing termination.
2368  *
2369  * (Actually, even if there were NUL bytes in the blob it would be
2370  * okay. See remarks on bytea case above.)
2371  */
2372  memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
2373  }
2374 
2375  /*
2376  * Maintain approximate cardinality of both abbreviated keys and original,
2377  * authoritative keys using HyperLogLog. Used as cheap insurance against
2378  * the worst case, where we do many string transformations for no saving
2379  * in full strcoll()-based comparisons. These statistics are used by
2380  * varstr_abbrev_abort().
2381  *
2382  * First, Hash key proper, or a significant fraction of it. Mix in length
2383  * in order to compensate for cases where differences are past
2384  * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2385  */
2386  hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2388 
2389  if (len > PG_CACHE_LINE_SIZE)
2391 
2392  addHyperLogLog(&sss->full_card, hash);
2393 
2394  /* Hash abbreviated key */
2395 #if SIZEOF_DATUM == 8
2396  {
2397  uint32 lohalf,
2398  hihalf;
2399 
2400  lohalf = (uint32) res;
2401  hihalf = (uint32) (res >> 32);
2402  hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2403  }
2404 #else /* SIZEOF_DATUM != 8 */
2406 #endif
2407 
2408  addHyperLogLog(&sss->abbr_card, hash);
2409 
2410  /* Cache result, perhaps saving an expensive strxfrm() call next time */
2411  sss->cache_blob = true;
2412 done:
2413 
2414  /*
2415  * Byteswap on little-endian machines.
2416  *
2417  * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
2418  * 3-way comparator) works correctly on all platforms. If we didn't do
2419  * this, the comparator would have to call memcmp() with a pair of
2420  * pointers to the first byte of each abbreviated key, which is slower.
2421  */
2422  res = DatumBigEndianToNative(res);
2423 
2424  /* Don't leak memory here */
2425  if (PointerGetDatum(authoritative) != original)
2426  pfree(authoritative);
2427 
2428  return res;
2429 }
2430 
2431 /*
2432  * Callback for estimating effectiveness of abbreviated key optimization, using
2433  * heuristic rules. Returns value indicating if the abbreviation optimization
2434  * should be aborted, based on its projected effectiveness.
2435  */
2436 static bool
2437 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2438 {
2440  double abbrev_distinct,
2441  key_distinct;
2442 
2443  Assert(ssup->abbreviate);
2444 
2445  /* Have a little patience */
2446  if (memtupcount < 100)
2447  return false;
2448 
2449  abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2450  key_distinct = estimateHyperLogLog(&sss->full_card);
2451 
2452  /*
2453  * Clamp cardinality estimates to at least one distinct value. While
2454  * NULLs are generally disregarded, if only NULL values were seen so far,
2455  * that might misrepresent costs if we failed to clamp.
2456  */
2457  if (abbrev_distinct <= 1.0)
2458  abbrev_distinct = 1.0;
2459 
2460  if (key_distinct <= 1.0)
2461  key_distinct = 1.0;
2462 
2463  /*
2464  * In the worst case all abbreviated keys are identical, while at the same
2465  * time there are differences within full key strings not captured in
2466  * abbreviations.
2467  */
2468 #ifdef TRACE_SORT
2469  if (trace_sort)
2470  {
2471  double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2472 
2473  elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2474  "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2475  memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2476  sss->prop_card);
2477  }
2478 #endif
2479 
2480  /*
2481  * If the number of distinct abbreviated keys approximately matches the
2482  * number of distinct authoritative original keys, that's reason enough to
2483  * proceed. We can win even with a very low cardinality set if most
2484  * tie-breakers only memcmp(). This is by far the most important
2485  * consideration.
2486  *
2487  * While comparisons that are resolved at the abbreviated key level are
2488  * considerably cheaper than tie-breakers resolved with memcmp(), both of
2489  * those two outcomes are so much cheaper than a full strcoll() once
2490  * sorting is underway that it doesn't seem worth it to weigh abbreviated
2491  * cardinality against the overall size of the set in order to more
2492  * accurately model costs. Assume that an abbreviated comparison, and an
2493  * abbreviated comparison with a cheap memcmp()-based authoritative
2494  * resolution are equivalent.
2495  */
2496  if (abbrev_distinct > key_distinct * sss->prop_card)
2497  {
2498  /*
2499  * When we have exceeded 10,000 tuples, decay required cardinality
2500  * aggressively for next call.
2501  *
2502  * This is useful because the number of comparisons required on
2503  * average increases at a linearithmic rate, and at roughly 10,000
2504  * tuples that factor will start to dominate over the linear costs of
2505  * string transformation (this is a conservative estimate). The decay
2506  * rate is chosen to be a little less aggressive than halving -- which
2507  * (since we're called at points at which memtupcount has doubled)
2508  * would never see the cost model actually abort past the first call
2509  * following a decay. This decay rate is mostly a precaution against
2510  * a sudden, violent swing in how well abbreviated cardinality tracks
2511  * full key cardinality. The decay also serves to prevent a marginal
2512  * case from being aborted too late, when too much has already been
2513  * invested in string transformation.
2514  *
2515  * It's possible for sets of several million distinct strings with
2516  * mere tens of thousands of distinct abbreviated keys to still
2517  * benefit very significantly. This will generally occur provided
2518  * each abbreviated key is a proxy for a roughly uniform number of the
2519  * set's full keys. If it isn't so, we hope to catch that early and
2520  * abort. If it isn't caught early, by the time the problem is
2521  * apparent it's probably not worth aborting.
2522  */
2523  if (memtupcount > 10000)
2524  sss->prop_card *= 0.65;
2525 
2526  return false;
2527  }
2528 
2529  /*
2530  * Abort abbreviation strategy.
2531  *
2532  * The worst case, where all abbreviated keys are identical while all
2533  * original strings differ will typically only see a regression of about
2534  * 10% in execution time for small to medium sized lists of strings.
2535  * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2536  * often expect very large improvements, particularly with sets of strings
2537  * of moderately high to high abbreviated cardinality. There is little to
2538  * lose but much to gain, which our strategy reflects.
2539  */
2540 #ifdef TRACE_SORT
2541  if (trace_sort)
2542  elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2543  "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2544  memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2545 #endif
2546 
2547  return true;
2548 }
2549 
2550 /*
2551  * Generic equalimage support function for character type's operator classes.
2552  * Disables the use of deduplication with nondeterministic collations.
2553  */
2554 Datum
2556 {
2557  /* Oid opcintype = PG_GETARG_OID(0); */
2559 
2561 
2562  if (lc_collate_is_c(collid) ||
2563  collid == DEFAULT_COLLATION_OID ||
2565  PG_RETURN_BOOL(true);
2566  else
2567  PG_RETURN_BOOL(false);
2568 }
2569 
2570 Datum
2572 {
2573  text *arg1 = PG_GETARG_TEXT_PP(0);
2574  text *arg2 = PG_GETARG_TEXT_PP(1);
2575  text *result;
2576 
2577  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2578 
2579  PG_RETURN_TEXT_P(result);
2580 }
2581 
2582 Datum
2584 {
2585  text *arg1 = PG_GETARG_TEXT_PP(0);
2586  text *arg2 = PG_GETARG_TEXT_PP(1);
2587  text *result;
2588 
2589  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2590 
2591  PG_RETURN_TEXT_P(result);
2592 }
2593 
2594 
2595 /*
2596  * Cross-type comparison functions for types text and name.
2597  */
2598 
2599 Datum
2601 {
2602  Name arg1 = PG_GETARG_NAME(0);
2603  text *arg2 = PG_GETARG_TEXT_PP(1);
2604  size_t len1 = strlen(NameStr(*arg1));
2605  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2607  bool result;
2608 
2610 
2611  if (collid == C_COLLATION_OID)
2612  result = (len1 == len2 &&
2613  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2614  else
2615  result = (varstr_cmp(NameStr(*arg1), len1,
2616  VARDATA_ANY(arg2), len2,
2617  collid) == 0);
2618 
2619  PG_FREE_IF_COPY(arg2, 1);
2620 
2621  PG_RETURN_BOOL(result);
2622 }
2623 
2624 Datum
2626 {
2627  text *arg1 = PG_GETARG_TEXT_PP(0);
2628  Name arg2 = PG_GETARG_NAME(1);
2629  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2630  size_t len2 = strlen(NameStr(*arg2));
2632  bool result;
2633 
2635 
2636  if (collid == C_COLLATION_OID)
2637  result = (len1 == len2 &&
2638  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2639  else
2640  result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2641  NameStr(*arg2), len2,
2642  collid) == 0);
2643 
2644  PG_FREE_IF_COPY(arg1, 0);
2645 
2646  PG_RETURN_BOOL(result);
2647 }
2648 
2649 Datum
2651 {
2652  Name arg1 = PG_GETARG_NAME(0);
2653  text *arg2 = PG_GETARG_TEXT_PP(1);
2654  size_t len1 = strlen(NameStr(*arg1));
2655  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2657  bool result;
2658 
2660 
2661  if (collid == C_COLLATION_OID)
2662  result = !(len1 == len2 &&
2663  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2664  else
2665  result = !(varstr_cmp(NameStr(*arg1), len1,
2666  VARDATA_ANY(arg2), len2,
2667  collid) == 0);
2668 
2669  PG_FREE_IF_COPY(arg2, 1);
2670 
2671  PG_RETURN_BOOL(result);
2672 }
2673 
2674 Datum
2676 {
2677  text *arg1 = PG_GETARG_TEXT_PP(0);
2678  Name arg2 = PG_GETARG_NAME(1);
2679  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2680  size_t len2 = strlen(NameStr(*arg2));
2682  bool result;
2683 
2685 
2686  if (collid == C_COLLATION_OID)
2687  result = !(len1 == len2 &&
2688  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2689  else
2690  result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2691  NameStr(*arg2), len2,
2692  collid) == 0);
2693 
2694  PG_FREE_IF_COPY(arg1, 0);
2695 
2696  PG_RETURN_BOOL(result);
2697 }
2698 
2699 Datum
2701 {
2702  Name arg1 = PG_GETARG_NAME(0);
2703  text *arg2 = PG_GETARG_TEXT_PP(1);
2704  int32 result;
2705 
2706  result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2707  VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2708  PG_GET_COLLATION());
2709 
2710  PG_FREE_IF_COPY(arg2, 1);
2711 
2712  PG_RETURN_INT32(result);
2713 }
2714 
2715 Datum
2717 {
2718  text *arg1 = PG_GETARG_TEXT_PP(0);
2719  Name arg2 = PG_GETARG_NAME(1);
2720  int32 result;
2721 
2722  result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2723  NameStr(*arg2), strlen(NameStr(*arg2)),
2724  PG_GET_COLLATION());
2725 
2726  PG_FREE_IF_COPY(arg1, 0);
2727 
2728  PG_RETURN_INT32(result);
2729 }
2730 
2731 #define CmpCall(cmpfunc) \
2732  DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2733  PG_GET_COLLATION(), \
2734  PG_GETARG_DATUM(0), \
2735  PG_GETARG_DATUM(1)))
2736 
2737 Datum
2739 {
2741 }
2742 
2743 Datum
2745 {
2747 }
2748 
2749 Datum
2751 {
2753 }
2754 
2755 Datum
2757 {
2759 }
2760 
2761 Datum
2763 {
2765 }
2766 
2767 Datum
2769 {
2771 }
2772 
2773 Datum
2775 {
2777 }
2778 
2779 Datum
2781 {
2783 }
2784 
2785 #undef CmpCall
2786 
2787 
2788 /*
2789  * The following operators support character-by-character comparison
2790  * of text datums, to allow building indexes suitable for LIKE clauses.
2791  * Note that the regular texteq/textne comparison operators, and regular
2792  * support functions 1 and 2 with "C" collation are assumed to be
2793  * compatible with these!
2794  */
2795 
2796 static int
2798 {
2799  int result;
2800  int len1,
2801  len2;
2802 
2803  len1 = VARSIZE_ANY_EXHDR(arg1);
2804  len2 = VARSIZE_ANY_EXHDR(arg2);
2805 
2806  result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2807  if (result != 0)
2808  return result;
2809  else if (len1 < len2)
2810  return -1;
2811  else if (len1 > len2)
2812  return 1;
2813  else
2814  return 0;
2815 }
2816 
2817 
2818 Datum
2820 {
2821  text *arg1 = PG_GETARG_TEXT_PP(0);
2822  text *arg2 = PG_GETARG_TEXT_PP(1);
2823  int result;
2824 
2825  result = internal_text_pattern_compare(arg1, arg2);
2826 
2827  PG_FREE_IF_COPY(arg1, 0);
2828  PG_FREE_IF_COPY(arg2, 1);
2829 
2830  PG_RETURN_BOOL(result < 0);
2831 }
2832 
2833 
2834 Datum
2836 {
2837  text *arg1 = PG_GETARG_TEXT_PP(0);
2838  text *arg2 = PG_GETARG_TEXT_PP(1);
2839  int result;
2840 
2841  result = internal_text_pattern_compare(arg1, arg2);
2842 
2843  PG_FREE_IF_COPY(arg1, 0);
2844  PG_FREE_IF_COPY(arg2, 1);
2845 
2846  PG_RETURN_BOOL(result <= 0);
2847 }
2848 
2849 
2850 Datum
2852 {
2853  text *arg1 = PG_GETARG_TEXT_PP(0);
2854  text *arg2 = PG_GETARG_TEXT_PP(1);
2855  int result;
2856 
2857  result = internal_text_pattern_compare(arg1, arg2);
2858 
2859  PG_FREE_IF_COPY(arg1, 0);
2860  PG_FREE_IF_COPY(arg2, 1);
2861 
2862  PG_RETURN_BOOL(result >= 0);
2863 }
2864 
2865 
2866 Datum
2868 {
2869  text *arg1 = PG_GETARG_TEXT_PP(0);
2870  text *arg2 = PG_GETARG_TEXT_PP(1);
2871  int result;
2872 
2873  result = internal_text_pattern_compare(arg1, arg2);
2874 
2875  PG_FREE_IF_COPY(arg1, 0);
2876  PG_FREE_IF_COPY(arg2, 1);
2877 
2878  PG_RETURN_BOOL(result > 0);
2879 }
2880 
2881 
2882 Datum
2884 {
2885  text *arg1 = PG_GETARG_TEXT_PP(0);
2886  text *arg2 = PG_GETARG_TEXT_PP(1);
2887  int result;
2888 
2889  result = internal_text_pattern_compare(arg1, arg2);
2890 
2891  PG_FREE_IF_COPY(arg1, 0);
2892  PG_FREE_IF_COPY(arg2, 1);
2893 
2894  PG_RETURN_INT32(result);
2895 }
2896 
2897 
2898 Datum
2900 {
2902  MemoryContext oldcontext;
2903 
2904  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2905 
2906  /* Use generic string SortSupport, forcing "C" collation */
2907  varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
2908 
2909  MemoryContextSwitchTo(oldcontext);
2910 
2911  PG_RETURN_VOID();
2912 }
2913 
2914 
2915 /*-------------------------------------------------------------
2916  * byteaoctetlen
2917  *
2918  * get the number of bytes contained in an instance of type 'bytea'
2919  *-------------------------------------------------------------
2920  */
2921 Datum
2923 {
2924  Datum str = PG_GETARG_DATUM(0);
2925 
2926  /* We need not detoast the input at all */
2928 }
2929 
2930 /*
2931  * byteacat -
2932  * takes two bytea* and returns a bytea* that is the concatenation of
2933  * the two.
2934  *
2935  * Cloned from textcat and modified as required.
2936  */
2937 Datum
2939 {
2940  bytea *t1 = PG_GETARG_BYTEA_PP(0);
2941  bytea *t2 = PG_GETARG_BYTEA_PP(1);
2942 
2944 }
2945 
2946 /*
2947  * bytea_catenate
2948  * Guts of byteacat(), broken out so it can be used by other functions
2949  *
2950  * Arguments can be in short-header form, but not compressed or out-of-line
2951  */
2952 static bytea *
2954 {
2955  bytea *result;
2956  int len1,
2957  len2,
2958  len;
2959  char *ptr;
2960 
2961  len1 = VARSIZE_ANY_EXHDR(t1);
2962  len2 = VARSIZE_ANY_EXHDR(t2);
2963 
2964  /* paranoia ... probably should throw error instead? */
2965  if (len1 < 0)
2966  len1 = 0;
2967  if (len2 < 0)
2968  len2 = 0;
2969 
2970  len = len1 + len2 + VARHDRSZ;
2971  result = (bytea *) palloc(len);
2972 
2973  /* Set size of result string... */
2974  SET_VARSIZE(result, len);
2975 
2976  /* Fill data field of result string... */
2977  ptr = VARDATA(result);
2978  if (len1 > 0)
2979  memcpy(ptr, VARDATA_ANY(t1), len1);
2980  if (len2 > 0)
2981  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
2982 
2983  return result;
2984 }
2985 
/* Convert a C string to a bytea pointer by running it through byteain. */
#define PG_STR_GET_BYTEA(cstr_) \
    DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(cstr_)))
2988 
2989 /*
2990  * bytea_substr()
2991  * Return a substring starting at the specified position.
2992  * Cloned from text_substr and modified as required.
2993  *
2994  * Input:
2995  * - string
2996  * - starting position (is one-based)
2997  * - string length (optional)
2998  *
2999  * If the starting position is zero or less, then return from the start of the string
3000  * adjusting the length to be consistent with the "negative start" per SQL.
3001  * If the length is less than zero, an ERROR is thrown. If no third argument
3002  * (length) is provided, the length to the end of the string is assumed.
3003  */
3004 Datum
3006 {
3008  PG_GETARG_INT32(1),
3009  PG_GETARG_INT32(2),
3010  false));
3011 }
3012 
3013 /*
3014  * bytea_substr_no_len -
3015  * Wrapper to avoid opr_sanity failure due to
3016  * one function accepting a different number of args.
3017  */
3018 Datum
3020 {
3022  PG_GETARG_INT32(1),
3023  -1,
3024  true));
3025 }
3026 
3027 static bytea *
3029  int S,
3030  int L,
3031  bool length_not_specified)
3032 {
3033  int32 S1; /* adjusted start position */
3034  int32 L1; /* adjusted substring length */
3035  int32 E; /* end position */
3036 
3037  /*
3038  * The logic here should generally match text_substring().
3039  */
3040  S1 = Max(S, 1);
3041 
3042  if (length_not_specified)
3043  {
3044  /*
3045  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3046  * end of the string if we pass it a negative value for length.
3047  */
3048  L1 = -1;
3049  }
3050  else if (L < 0)
3051  {
3052  /* SQL99 says to throw an error for E < S, i.e., negative length */
3053  ereport(ERROR,
3054  (errcode(ERRCODE_SUBSTRING_ERROR),
3055  errmsg("negative substring length not allowed")));
3056  L1 = -1; /* silence stupider compilers */
3057  }
3058  else if (pg_add_s32_overflow(S, L, &E))
3059  {
3060  /*
3061  * L could be large enough for S + L to overflow, in which case the
3062  * substring must run to end of string.
3063  */
3064  L1 = -1;
3065  }
3066  else
3067  {
3068  /*
3069  * A zero or negative value for the end position can happen if the
3070  * start was negative or one. SQL99 says to return a zero-length
3071  * string.
3072  */
3073  if (E < 1)
3074  return PG_STR_GET_BYTEA("");
3075 
3076  L1 = E - S1;
3077  }
3078 
3079  /*
3080  * If the start position is past the end of the string, SQL99 says to
3081  * return a zero-length string -- DatumGetByteaPSlice() will do that for
3082  * us. We need only convert S1 to zero-based starting position.
3083  */
3084  return DatumGetByteaPSlice(str, S1 - 1, L1);
3085 }
3086 
3087 /*
3088  * byteaoverlay
3089  * Replace specified substring of first string with second
3090  *
3091  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3092  * This code is a direct implementation of what the standard says.
3093  */
3094 Datum
3096 {
3097  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3098  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3099  int sp = PG_GETARG_INT32(2); /* substring start position */
3100  int sl = PG_GETARG_INT32(3); /* substring length */
3101 
3102  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3103 }
3104 
3105 Datum
3107 {
3108  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3109  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3110  int sp = PG_GETARG_INT32(2); /* substring start position */
3111  int sl;
3112 
3113  sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3114  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3115 }
3116 
3117 static bytea *
3118 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3119 {
3120  bytea *result;
3121  bytea *s1;
3122  bytea *s2;
3123  int sp_pl_sl;
3124 
3125  /*
3126  * Check for possible integer-overflow cases. For negative sp, throw a
3127  * "substring length" error because that's what should be expected
3128  * according to the spec's definition of OVERLAY().
3129  */
3130  if (sp <= 0)
3131  ereport(ERROR,
3132  (errcode(ERRCODE_SUBSTRING_ERROR),
3133  errmsg("negative substring length not allowed")));
3134  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3135  ereport(ERROR,
3136  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3137  errmsg("integer out of range")));
3138 
3139  s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3140  s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3141  result = bytea_catenate(s1, t2);
3142  result = bytea_catenate(result, s2);
3143 
3144  return result;
3145 }
3146 
3147 /*
3148  * bit_count
3149  */
3150 Datum
3152 {
3153  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3154 
3156 }
3157 
3158 /*
3159  * byteapos -
3160  * Return the position of the specified substring.
3161  * Implements the SQL POSITION() function.
3162  * Cloned from textpos and modified as required.
3163  */
3164 Datum
3166 {
3167  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3168  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3169  int pos;
3170  int px,
3171  p;
3172  int len1,
3173  len2;
3174  char *p1,
3175  *p2;
3176 
3177  len1 = VARSIZE_ANY_EXHDR(t1);
3178  len2 = VARSIZE_ANY_EXHDR(t2);
3179 
3180  if (len2 <= 0)
3181  PG_RETURN_INT32(1); /* result for empty pattern */
3182 
3183  p1 = VARDATA_ANY(t1);
3184  p2 = VARDATA_ANY(t2);
3185 
3186  pos = 0;
3187  px = (len1 - len2);
3188  for (p = 0; p <= px; p++)
3189  {
3190  if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3191  {
3192  pos = p + 1;
3193  break;
3194  };
3195  p1++;
3196  };
3197 
3198  PG_RETURN_INT32(pos);
3199 }
3200 
3201 /*-------------------------------------------------------------
3202  * byteaGetByte
3203  *
3204  * this routine treats "bytea" as an array of bytes.
3205  * It returns the Nth byte (a number between 0 and 255).
3206  *-------------------------------------------------------------
3207  */
3208 Datum
3210 {
3211  bytea *v = PG_GETARG_BYTEA_PP(0);
3212  int32 n = PG_GETARG_INT32(1);
3213  int len;
3214  int byte;
3215 
3216  len = VARSIZE_ANY_EXHDR(v);
3217 
3218  if (n < 0 || n >= len)
3219  ereport(ERROR,
3220  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3221  errmsg("index %d out of valid range, 0..%d",
3222  n, len - 1)));
3223 
3224  byte = ((unsigned char *) VARDATA_ANY(v))[n];
3225 
3226  PG_RETURN_INT32(byte);
3227 }
3228 
3229 /*-------------------------------------------------------------
3230  * byteaGetBit
3231  *
3232  * This routine treats a "bytea" type like an array of bits.
3233  * It returns the value of the Nth bit (0 or 1).
3234  *
3235  *-------------------------------------------------------------
3236  */
3237 Datum
3239 {
3240  bytea *v = PG_GETARG_BYTEA_PP(0);
3241  int64 n = PG_GETARG_INT64(1);
3242  int byteNo,
3243  bitNo;
3244  int len;
3245  int byte;
3246 
3247  len = VARSIZE_ANY_EXHDR(v);
3248 
3249  if (n < 0 || n >= (int64) len * 8)
3250  ereport(ERROR,
3251  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3252  errmsg("index %lld out of valid range, 0..%lld",
3253  (long long) n, (long long) len * 8 - 1)));
3254 
3255  /* n/8 is now known < len, so safe to cast to int */
3256  byteNo = (int) (n / 8);
3257  bitNo = (int) (n % 8);
3258 
3259  byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3260 
3261  if (byte & (1 << bitNo))
3262  PG_RETURN_INT32(1);
3263  else
3264  PG_RETURN_INT32(0);
3265 }
3266 
3267 /*-------------------------------------------------------------
3268  * byteaSetByte
3269  *
3270  * Given an instance of type 'bytea' creates a new one with
3271  * the Nth byte set to the given value.
3272  *
3273  *-------------------------------------------------------------
3274  */
3275 Datum
3277 {
3279  int32 n = PG_GETARG_INT32(1);
3280  int32 newByte = PG_GETARG_INT32(2);
3281  int len;
3282 
3283  len = VARSIZE(res) - VARHDRSZ;
3284 
3285  if (n < 0 || n >= len)
3286  ereport(ERROR,
3287  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3288  errmsg("index %d out of valid range, 0..%d",
3289  n, len - 1)));
3290 
3291  /*
3292  * Now set the byte.
3293  */
3294  ((unsigned char *) VARDATA(res))[n] = newByte;
3295 
3297 }
3298 
3299 /*-------------------------------------------------------------
3300  * byteaSetBit
3301  *
3302  * Given an instance of type 'bytea' creates a new one with
3303  * the Nth bit set to the given value.
3304  *
3305  *-------------------------------------------------------------
3306  */
3307 Datum
3309 {
3311  int64 n = PG_GETARG_INT64(1);
3312  int32 newBit = PG_GETARG_INT32(2);
3313  int len;
3314  int oldByte,
3315  newByte;
3316  int byteNo,
3317  bitNo;
3318 
3319  len = VARSIZE(res) - VARHDRSZ;
3320 
3321  if (n < 0 || n >= (int64) len * 8)
3322  ereport(ERROR,
3323  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3324  errmsg("index %lld out of valid range, 0..%lld",
3325  (long long) n, (long long) len * 8 - 1)));
3326 
3327  /* n/8 is now known < len, so safe to cast to int */
3328  byteNo = (int) (n / 8);
3329  bitNo = (int) (n % 8);
3330 
3331  /*
3332  * sanity check!
3333  */
3334  if (newBit != 0 && newBit != 1)
3335  ereport(ERROR,
3336  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3337  errmsg("new bit must be 0 or 1")));
3338 
3339  /*
3340  * Update the byte.
3341  */
3342  oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3343 
3344  if (newBit == 0)
3345  newByte = oldByte & (~(1 << bitNo));
3346  else
3347  newByte = oldByte | (1 << bitNo);
3348 
3349  ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3350 
3352 }
3353 
3354 
3355 /* text_name()
3356  * Converts a text type to a Name type.
3357  */
3358 Datum
3360 {
3361  text *s = PG_GETARG_TEXT_PP(0);
3362  Name result;
3363  int len;
3364 
3365  len = VARSIZE_ANY_EXHDR(s);
3366 
3367  /* Truncate oversize input */
3368  if (len >= NAMEDATALEN)
3370 
3371  /* We use palloc0 here to ensure result is zero-padded */
3372  result = (Name) palloc0(NAMEDATALEN);
3373  memcpy(NameStr(*result), VARDATA_ANY(s), len);
3374 
3375  PG_RETURN_NAME(result);
3376 }
3377 
3378 /* name_text()
3379  * Converts a Name type to a text type.
3380  */
3381 Datum
3383 {
3384  Name s = PG_GETARG_NAME(0);
3385 
3387 }
3388 
3389 
3390 /*
3391  * textToQualifiedNameList - convert a text object to list of names
3392  *
3393  * This implements the input parsing needed by nextval() and other
3394  * functions that take a text parameter representing a qualified name.
3395  * We split the name at dots, downcase if not double-quoted, and
3396  * truncate names if they're too long.
3397  */
3398 List *
3400 {
3401  char *rawname;
3402  List *result = NIL;
3403  List *namelist;
3404  ListCell *l;
3405 
3406  /* Convert to C string (handles possible detoasting). */
3407  /* Note we rely on being able to modify rawname below. */
3408  rawname = text_to_cstring(textval);
3409 
3410  if (!SplitIdentifierString(rawname, '.', &namelist))
3411  ereport(ERROR,
3412  (errcode(ERRCODE_INVALID_NAME),
3413  errmsg("invalid name syntax")));
3414 
3415  if (namelist == NIL)
3416  ereport(ERROR,
3417  (errcode(ERRCODE_INVALID_NAME),
3418  errmsg("invalid name syntax")));
3419 
3420  foreach(l, namelist)
3421  {
3422  char *curname = (char *) lfirst(l);
3423 
3424  result = lappend(result, makeString(pstrdup(curname)));
3425  }
3426 
3427  pfree(rawname);
3428  list_free(namelist);
3429 
3430  return result;
3431 }
3432 
3433 /*
3434  * SplitIdentifierString --- parse a string containing identifiers
3435  *
3436  * This is the guts of textToQualifiedNameList, and is exported for use in
3437  * other situations such as parsing GUC variables. In the GUC case, it's
3438  * important to avoid memory leaks, so the API is designed to minimize the
3439  * amount of stuff that needs to be allocated and freed.
3440  *
3441  * Inputs:
3442  * rawstring: the input string; must be overwritable! On return, it's
3443  * been modified to contain the separated identifiers.
3444  * separator: the separator punctuation expected between identifiers
3445  * (typically '.' or ','). Whitespace may also appear around
3446  * identifiers.
3447  * Outputs:
3448  * namelist: filled with a palloc'd list of pointers to identifiers within
3449  * rawstring. Caller should list_free() this even on error return.
3450  *
3451  * Returns true if okay, false if there is a syntax error in the string.
3452  *
3453  * Note that an empty string is considered okay here, though not in
3454  * textToQualifiedNameList.
3455  */
3456 bool
3457 SplitIdentifierString(char *rawstring, char separator,
3458  List **namelist)
3459 {
3460  char *nextp = rawstring;
3461  bool done = false;
3462 
3463  *namelist = NIL;
3464 
3465  while (scanner_isspace(*nextp))
3466  nextp++; /* skip leading whitespace */
3467 
3468  if (*nextp == '\0')
3469  return true; /* allow empty string */
3470 
3471  /* At the top of the loop, we are at start of a new identifier. */
3472  do
3473  {
3474  char *curname;
3475  char *endp;
3476 
3477  if (*nextp == '"')
3478  {
3479  /* Quoted name --- collapse quote-quote pairs, no downcasing */
3480  curname = nextp + 1;
3481  for (;;)
3482  {
3483  endp = strchr(nextp + 1, '"');
3484  if (endp == NULL)
3485  return false; /* mismatched quotes */
3486  if (endp[1] != '"')
3487  break; /* found end of quoted name */
3488  /* Collapse adjacent quotes into one quote, and look again */
3489  memmove(endp, endp + 1, strlen(endp));
3490  nextp = endp;
3491  }
3492  /* endp now points at the terminating quote */
3493  nextp = endp + 1;
3494  }
3495  else
3496  {
3497  /* Unquoted name --- extends to separator or whitespace */
3498  char *downname;
3499  int len;
3500 
3501  curname = nextp;
3502  while (*nextp && *nextp != separator &&
3503  !scanner_isspace(*nextp))
3504  nextp++;
3505  endp = nextp;
3506  if (curname == nextp)
3507  return false; /* empty unquoted name not allowed */
3508 
3509  /*
3510  * Downcase the identifier, using same code as main lexer does.
3511  *
3512  * XXX because we want to overwrite the input in-place, we cannot
3513  * support a downcasing transformation that increases the string
3514  * length. This is not a problem given the current implementation
3515  * of downcase_truncate_identifier, but we'll probably have to do
3516  * something about this someday.
3517  */
3518  len = endp - curname;
3519  downname = downcase_truncate_identifier(curname, len, false);
3520  Assert(strlen(downname) <= len);
3521  strncpy(curname, downname, len); /* strncpy is required here */
3522  pfree(downname);
3523  }
3524 
3525  while (scanner_isspace(*nextp))
3526  nextp++; /* skip trailing whitespace */
3527 
3528  if (*nextp == separator)
3529  {
3530  nextp++;
3531  while (scanner_isspace(*nextp))
3532  nextp++; /* skip leading whitespace for next */
3533  /* we expect another name, so done remains false */
3534  }
3535  else if (*nextp == '\0')
3536  done = true;
3537  else
3538  return false; /* invalid syntax */
3539 
3540  /* Now safe to overwrite separator with a null */
3541  *endp = '\0';
3542 
3543  /* Truncate name if it's overlength */
3544  truncate_identifier(curname, strlen(curname), false);
3545 
3546  /*
3547  * Finished isolating current name --- add it to list
3548  */
3549  *namelist = lappend(*namelist, curname);
3550 
3551  /* Loop back if we didn't reach end of string */
3552  } while (!done);
3553 
3554  return true;
3555 }
3556 
3557 
3558 /*
3559  * SplitDirectoriesString --- parse a string containing file/directory names
3560  *
3561  * This works fine on file names too; the function name is historical.
3562  *
3563  * This is similar to SplitIdentifierString, except that the parsing
3564  * rules are meant to handle pathnames instead of identifiers: there is
3565  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3566  * and we apply canonicalize_path() to each extracted string. Because of the
3567  * last, the returned strings are separately palloc'd rather than being
3568  * pointers into rawstring --- but we still scribble on rawstring.
3569  *
3570  * Inputs:
3571  * rawstring: the input string; must be modifiable!
3572  * separator: the separator punctuation expected between directories
3573  * (typically ',' or ';'). Whitespace may also appear around
3574  * directories.
3575  * Outputs:
3576  * namelist: filled with a palloc'd list of directory names.
3577  * Caller should list_free_deep() this even on error return.
3578  *
3579  * Returns true if okay, false if there is a syntax error in the string.
3580  *
3581  * Note that an empty string is considered okay here.
3582  */
3583 bool
3584 SplitDirectoriesString(char *rawstring, char separator,
3585  List **namelist)
3586 {
3587  char *nextp = rawstring;
3588  bool done = false;
3589 
3590  *namelist = NIL;
3591 
3592  while (scanner_isspace(*nextp))
3593  nextp++; /* skip leading whitespace */
3594 
3595  if (*nextp == '\0')
3596  return true; /* allow empty string */
3597 
3598  /* At the top of the loop, we are at start of a new directory. */
3599  do
3600  {
3601  char *curname;
3602  char *endp;
3603 
3604  if (*nextp == '"')
3605  {
3606  /* Quoted name --- collapse quote-quote pairs */
3607  curname = nextp + 1;
3608  for (;;)
3609  {
3610  endp = strchr(nextp + 1, '"');
3611  if (endp == NULL)
3612  return false; /* mismatched quotes */
3613  if (endp[1] != '"')
3614  break; /* found end of quoted name */
3615  /* Collapse adjacent quotes into one quote, and look again */
3616  memmove(endp, endp + 1, strlen(endp));
3617  nextp = endp;
3618  }
3619  /* endp now points at the terminating quote */
3620  nextp = endp + 1;
3621  }
3622  else
3623  {
3624  /* Unquoted name --- extends to separator or end of string */
3625  curname = endp = nextp;
3626  while (*nextp && *nextp != separator)
3627  {
3628  /* trailing whitespace should not be included in name */
3629  if (!scanner_isspace(*nextp))
3630  endp = nextp + 1;
3631  nextp++;
3632  }
3633  if (curname == endp)
3634  return false; /* empty unquoted name not allowed */
3635  }
3636 
3637  while (scanner_isspace(*nextp))
3638  nextp++; /* skip trailing whitespace */
3639 
3640  if (*nextp == separator)
3641  {
3642  nextp++;
3643  while (scanner_isspace(*nextp))
3644  nextp++; /* skip leading whitespace for next */
3645  /* we expect another name, so done remains false */
3646  }
3647  else if (*nextp == '\0')
3648  done = true;
3649  else
3650  return false; /* invalid syntax */
3651 
3652  /* Now safe to overwrite separator with a null */
3653  *endp = '\0';
3654 
3655  /* Truncate path if it's overlength */
3656  if (strlen(curname) >= MAXPGPATH)
3657  curname[MAXPGPATH - 1] = '\0';
3658 
3659  /*
3660  * Finished isolating current name --- add it to list
3661  */
3662  curname = pstrdup(curname);
3663  canonicalize_path(curname);
3664  *namelist = lappend(*namelist, curname);
3665 
3666  /* Loop back if we didn't reach end of string */
3667  } while (!done);
3668 
3669  return true;
3670 }
3671 
3672 
3673 /*
3674  * SplitGUCList --- parse a string containing identifiers or file names
3675  *
3676  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3677  * presuming whether the elements will be taken as identifiers or file names.
3678  * We assume the input has already been through flatten_set_variable_args(),
3679  * so that we need never downcase (if appropriate, that was done already).
3680  * Nor do we ever truncate, since we don't know the correct max length.
3681  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3682  * because any embedded whitespace should have led to double-quoting).
3683  * Otherwise the API is identical to SplitIdentifierString.
3684  *
3685  * XXX it's annoying to have so many copies of this string-splitting logic.
3686  * However, it's not clear that having one function with a bunch of option
3687  * flags would be much better.
3688  *
3689  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3690  * Be sure to update that if you have to change this.
3691  *
3692  * Inputs:
3693  * rawstring: the input string; must be overwritable! On return, it's
3694  * been modified to contain the separated identifiers.
3695  * separator: the separator punctuation expected between identifiers
3696  * (typically '.' or ','). Whitespace may also appear around
3697  * identifiers.
3698  * Outputs:
3699  * namelist: filled with a palloc'd list of pointers to identifiers within
3700  * rawstring. Caller should list_free() this even on error return.
3701  *
3702  * Returns true if okay, false if there is a syntax error in the string.
3703  */
3704 bool
3705 SplitGUCList(char *rawstring, char separator,
3706  List **namelist)
3707 {
3708  char *nextp = rawstring;
3709  bool done = false;
3710 
3711  *namelist = NIL;
3712 
3713  while (scanner_isspace(*nextp))
3714  nextp++; /* skip leading whitespace */
3715 
3716  if (*nextp == '\0')
3717  return true; /* allow empty string */
3718 
3719  /* At the top of the loop, we are at start of a new identifier. */
3720  do
3721  {
3722  char *curname;
3723  char *endp;
3724 
3725  if (*nextp == '"')
3726  {
3727  /* Quoted name --- collapse quote-quote pairs */
3728  curname = nextp + 1;
3729  for (;;)
3730  {
3731  endp = strchr(nextp + 1, '"');
3732  if (endp == NULL)
3733  return false; /* mismatched quotes */
3734  if (endp[1] != '"')
3735  break; /* found end of quoted name */
3736  /* Collapse adjacent quotes into one quote, and look again */
3737  memmove(endp, endp + 1, strlen(endp));
3738  nextp = endp;
3739  }
3740  /* endp now points at the terminating quote */
3741  nextp = endp + 1;
3742  }
3743  else
3744  {
3745  /* Unquoted name --- extends to separator or whitespace */
3746  curname = nextp;
3747  while (*nextp && *nextp != separator &&
3748  !scanner_isspace(*nextp))
3749  nextp++;
3750  endp = nextp;
3751  if (curname == nextp)
3752  return false; /* empty unquoted name not allowed */
3753  }
3754 
3755  while (scanner_isspace(*nextp))
3756  nextp++; /* skip trailing whitespace */
3757 
3758  if (*nextp == separator)
3759  {
3760  nextp++;
3761  while (scanner_isspace(*nextp))
3762  nextp++; /* skip leading whitespace for next */
3763  /* we expect another name, so done remains false */
3764  }
3765  else if (*nextp == '\0')
3766  done = true;
3767  else
3768  return false; /* invalid syntax */
3769 
3770  /* Now safe to overwrite separator with a null */
3771  *endp = '\0';
3772 
3773  /*
3774  * Finished isolating current name --- add it to list
3775  */
3776  *namelist = lappend(*namelist, curname);
3777 
3778  /* Loop back if we didn't reach end of string */
3779  } while (!done);
3780 
3781  return true;
3782 }
3783 
3784 
3785 /*****************************************************************************
3786  * Comparison Functions used for bytea
3787  *
3788  * Note: btree indexes need these routines not to leak memory; therefore,
3789  * be careful to free working copies of toasted datums. Most places don't
3790  * need to be so careful.
3791  *****************************************************************************/
3792 
3793 Datum
3795 {
3796  Datum arg1 = PG_GETARG_DATUM(0);
3797  Datum arg2 = PG_GETARG_DATUM(1);
3798  bool result;
3799  Size len1,
3800  len2;
3801 
3802  /*
3803  * We can use a fast path for unequal lengths, which might save us from
3804  * having to detoast one or both values.
3805  */
3806  len1 = toast_raw_datum_size(arg1);
3807  len2 = toast_raw_datum_size(arg2);
3808  if (len1 != len2)
3809  result = false;
3810  else
3811  {
3812  bytea *barg1 = DatumGetByteaPP(arg1);
3813  bytea *barg2 = DatumGetByteaPP(arg2);
3814 
3815  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3816  len1 - VARHDRSZ) == 0);
3817 
3818  PG_FREE_IF_COPY(barg1, 0);
3819  PG_FREE_IF_COPY(barg2, 1);
3820  }
3821 
3822  PG_RETURN_BOOL(result);
3823 }
3824 
3825 Datum
3827 {
3828  Datum arg1 = PG_GETARG_DATUM(0);
3829  Datum arg2 = PG_GETARG_DATUM(1);
3830  bool result;
3831  Size len1,
3832  len2;
3833 
3834  /*
3835  * We can use a fast path for unequal lengths, which might save us from
3836  * having to detoast one or both values.
3837  */
3838  len1 = toast_raw_datum_size(arg1);
3839  len2 = toast_raw_datum_size(arg2);
3840  if (len1 != len2)
3841  result = true;
3842  else
3843  {
3844  bytea *barg1 = DatumGetByteaPP(arg1);
3845  bytea *barg2 = DatumGetByteaPP(arg2);
3846 
3847  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3848  len1 - VARHDRSZ) != 0);
3849 
3850  PG_FREE_IF_COPY(barg1, 0);
3851  PG_FREE_IF_COPY(barg2, 1);
3852  }
3853 
3854  PG_RETURN_BOOL(result);
3855 }
3856 
3857 Datum
3859 {
3860  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3861  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3862  int len1,
3863  len2;
3864  int cmp;
3865 
3866  len1 = VARSIZE_ANY_EXHDR(arg1);
3867  len2 = VARSIZE_ANY_EXHDR(arg2);
3868 
3869  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3870 
3871  PG_FREE_IF_COPY(arg1, 0);
3872  PG_FREE_IF_COPY(arg2, 1);
3873 
3874  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3875 }
3876 
3877 Datum
3879 {
3880  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3881  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3882  int len1,
3883  len2;
3884  int cmp;
3885 
3886  len1 = VARSIZE_ANY_EXHDR(arg1);
3887  len2 = VARSIZE_ANY_EXHDR(arg2);
3888 
3889  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3890 
3891  PG_FREE_IF_COPY(arg1, 0);
3892  PG_FREE_IF_COPY(arg2, 1);
3893 
3894  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3895 }
3896 
3897 Datum
3899 {
3900  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3901  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3902  int len1,
3903  len2;
3904  int cmp;
3905 
3906  len1 = VARSIZE_ANY_EXHDR(arg1);
3907  len2 = VARSIZE_ANY_EXHDR(arg2);
3908 
3909  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3910 
3911  PG_FREE_IF_COPY(arg1, 0);
3912  PG_FREE_IF_COPY(arg2, 1);
3913 
3914  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3915 }
3916 
3917 Datum
3919 {
3920  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3921  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3922  int len1,
3923  len2;
3924  int cmp;
3925 
3926  len1 = VARSIZE_ANY_EXHDR(arg1);
3927  len2 = VARSIZE_ANY_EXHDR(arg2);
3928 
3929  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3930 
3931  PG_FREE_IF_COPY(arg1, 0);
3932  PG_FREE_IF_COPY(arg2, 1);
3933 
3934  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3935 }
3936 
3937 Datum
3939 {
3940  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3941  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3942  int len1,
3943  len2;
3944  int cmp;
3945 
3946  len1 = VARSIZE_ANY_EXHDR(arg1);
3947  len2 = VARSIZE_ANY_EXHDR(arg2);
3948 
3949  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3950  if ((cmp == 0) && (len1 != len2))
3951  cmp = (len1 < len2) ? -1 : 1;
3952 
3953  PG_FREE_IF_COPY(arg1, 0);
3954  PG_FREE_IF_COPY(arg2, 1);
3955 
3957 }
3958 
3959 Datum
3961 {
3963  MemoryContext oldcontext;
3964 
3965  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3966 
3967  /* Use generic string SortSupport, forcing "C" collation */
3968  varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
3969 
3970  MemoryContextSwitchTo(oldcontext);
3971 
3972  PG_RETURN_VOID();
3973 }
3974 
3975 /*
3976  * appendStringInfoText
3977  *
3978  * Append a text to str.
3979  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3980  */
3981 static void
3983 {
3985 }
3986 
3987 /*
3988  * replace_text
3989  * replace all occurrences of 'old_sub_str' in 'orig_str'
3990  * with 'new_sub_str' to form 'new_str'
3991  *
3992  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
3993  * otherwise returns 'new_str'
3994  */
3995 Datum
3997 {
3998  text *src_text = PG_GETARG_TEXT_PP(0);
3999  text *from_sub_text = PG_GETARG_TEXT_PP(1);
4000  text *to_sub_text = PG_GETARG_TEXT_PP(2);
4001  int src_text_len;
4002  int from_sub_text_len;
4004  text *ret_text;
4005  int chunk_len;
4006  char *curr_ptr;
4007  char *start_ptr;
4009  bool found;
4010 
4011  src_text_len = VARSIZE_ANY_EXHDR(src_text);
4012  from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4013 
4014  /* Return unmodified source string if empty source or pattern */
4015  if (src_text_len < 1 || from_sub_text_len < 1)
4016  {
4017  PG_RETURN_TEXT_P(src_text);
4018  }
4019 
4020  text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4021 
4022  found = text_position_next(&state);
4023 
4024  /* When the from_sub_text is not found, there is nothing to do. */
4025  if (!found)
4026  {
4028  PG_RETURN_TEXT_P(src_text);
4029  }
4030  curr_ptr = text_position_get_match_ptr(&state);
4031  start_ptr = VARDATA_ANY(src_text);
4032 
4033  initStringInfo(&str);
4034 
4035  do
4036  {
4038 
4039  /* copy the data skipped over by last text_position_next() */
4040  chunk_len = curr_ptr - start_ptr;
4041  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4042 
4043  appendStringInfoText(&str, to_sub_text);
4044 
4045  start_ptr = curr_ptr + from_sub_text_len;
4046 
4047  found = text_position_next(&state);
4048  if (found)
4049  curr_ptr = text_position_get_match_ptr(&state);
4050  }
4051  while (found);
4052 
4053  /* copy trailing data */
4054  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4055  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4056 
4058 
4059  ret_text = cstring_to_text_with_len(str.data, str.len);
4060  pfree(str.data);
4061 
4062  PG_RETURN_TEXT_P(ret_text);
4063 }
4064 
4065 /*
4066  * check_replace_text_has_escape
4067  *
4068  * Returns 0 if text contains no backslashes that need processing.
4069  * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4070  * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4071  */
4072 static int
4074 {
4075  int result = 0;
4076  const char *p = VARDATA_ANY(replace_text);
4077  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4078 
4079  while (p < p_end)
4080  {
4081  /* Find next escape char, if any. */
4082  p = memchr(p, '\\', p_end - p);
4083  if (p == NULL)
4084  break;
4085  p++;
4086  /* Note: a backslash at the end doesn't require extra processing. */
4087  if (p < p_end)
4088  {
4089  if (*p >= '1' && *p <= '9')
4090  return 2; /* Found a submatch specifier, so done */
4091  result = 1; /* Found some other sequence, keep looking */
4092  p++;
4093  }
4094  }
4095  return result;
4096 }
4097 
4098 /*
4099  * appendStringInfoRegexpSubstr
4100  *
4101  * Append replace_text to str, substituting regexp back references for
4102  * \n escapes. start_ptr is the start of the match in the source string,
4103  * at logical character position data_pos.
4104  */
4105 static void
4107  regmatch_t *pmatch,
4108  char *start_ptr, int data_pos)
4109 {
4110  const char *p = VARDATA_ANY(replace_text);
4111  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4112 
4113  while (p < p_end)
4114  {
4115  const char *chunk_start = p;
4116  int so;
4117  int eo;
4118 
4119  /* Find next escape char, if any. */
4120  p = memchr(p, '\\', p_end - p);
4121  if (p == NULL)
4122  p = p_end;
4123 
4124  /* Copy the text we just scanned over, if any. */
4125  if (p > chunk_start)
4126  appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4127 
4128  /* Done if at end of string, else advance over escape char. */
4129  if (p >= p_end)
4130  break;
4131  p++;
4132 
4133  if (p >= p_end)
4134  {
4135  /* Escape at very end of input. Treat same as unexpected char */
4136  appendStringInfoChar(str, '\\');
4137  break;
4138  }
4139 
4140  if (*p >= '1' && *p <= '9')
4141  {
4142  /* Use the back reference of regexp. */
4143  int idx = *p - '0';
4144 
4145  so = pmatch[idx].rm_so;
4146  eo = pmatch[idx].rm_eo;
4147  p++;
4148  }
4149  else if (*p == '&')
4150  {
4151  /* Use the entire matched string. */
4152  so = pmatch[0].rm_so;
4153  eo = pmatch[0].rm_eo;
4154  p++;
4155  }
4156  else if (*p == '\\')
4157  {
4158  /* \\ means transfer one \ to output. */
4159  appendStringInfoChar(str, '\\');
4160  p++;
4161  continue;
4162  }
4163  else
4164  {
4165  /*
4166  * If escape char is not followed by any expected char, just treat
4167  * it as ordinary data to copy. (XXX would it be better to throw
4168  * an error?)
4169  */
4170  appendStringInfoChar(str, '\\');
4171  continue;
4172  }
4173 
4174  if (so >= 0 && eo >= 0)
4175  {
4176  /*
4177  * Copy the text that is back reference of regexp. Note so and eo
4178  * are counted in characters not bytes.
4179  */
4180  char *chunk_start;
4181  int chunk_len;
4182 
4183  Assert(so >= data_pos);
4184  chunk_start = start_ptr;
4185  chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4186  chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4187  appendBinaryStringInfo(str, chunk_start, chunk_len);
4188  }
4189  }
4190 }
4191 
4192 /*
4193  * replace_text_regexp
4194  *
4195  * replace substring(s) in src_text that match pattern with replace_text.
4196  * The replace_text can contain backslash markers to substitute
4197  * (parts of) the matched text.
4198  *
4199  * cflags: regexp compile flags.
4200  * collation: collation to use.
4201  * search_start: the character (not byte) offset in src_text at which to
4202  * begin searching.
4203  * n: if 0, replace all matches; if > 0, replace only the N'th match.
4204  */
4205 text *
4206 replace_text_regexp(text *src_text, text *pattern_text,
4207  text *replace_text,
4208  int cflags, Oid collation,
4209  int search_start, int n)
4210 {
4211  text *ret_text;
4212  regex_t *re;
4213  int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4214  int nmatches = 0;
4216  regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
4217  int nmatch = lengthof(pmatch);
4218  pg_wchar *data;
4219  size_t data_len;
4220  int data_pos;
4221  char *start_ptr;
4222  int escape_status;
4223 
4224  initStringInfo(&buf);
4225 
4226  /* Convert data string to wide characters. */
4227  data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4228  data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4229 
4230  /* Check whether replace_text has escapes, especially regexp submatches. */
4232 
4233  /* If no regexp submatches, we can use REG_NOSUB. */
4234  if (escape_status < 2)
4235  {
4236  cflags |= REG_NOSUB;
4237  /* Also tell pg_regexec we only want the whole-match location. */
4238  nmatch = 1;
4239  }
4240 
4241  /* Prepare the regexp. */
4242  re = RE_compile_and_cache(pattern_text, cflags, collation);
4243 
4244  /* start_ptr points to the data_pos'th character of src_text */
4245  start_ptr = (char *) VARDATA_ANY(src_text);
4246  data_pos = 0;
4247 
4248  while (search_start <= data_len)
4249  {
4250  int regexec_result;
4251 
4253 
4254  regexec_result = pg_regexec(re,
4255  data,
4256  data_len,
4257  search_start,
4258  NULL, /* no details */
4259  nmatch,
4260  pmatch,
4261  0);
4262 
4263  if (regexec_result == REG_NOMATCH)
4264  break;
4265 
4266  if (regexec_result != REG_OKAY)
4267  {
4268  char errMsg[100];
4269 
4270  pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4271  ereport(ERROR,
4272  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4273  errmsg("regular expression failed: %s", errMsg)));
4274  }
4275 
4276  /*
4277  * Count matches, and decide whether to replace this match.
4278  */
4279  nmatches++;
4280  if (n > 0 && nmatches != n)
4281  {
4282  /*
4283  * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4284  * we treat the matched text as if it weren't matched, and copy it
4285  * to the output later.)
4286  */
4287  search_start = pmatch[0].rm_eo;
4288  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4289  search_start++;
4290  continue;
4291  }
4292 
4293  /*
4294  * Copy the text to the left of the match position. Note we are given
4295  * character not byte indexes.
4296  */
4297  if (pmatch[0].rm_so - data_pos > 0)
4298  {
4299  int chunk_len;
4300 
4301  chunk_len = charlen_to_bytelen(start_ptr,
4302  pmatch[0].rm_so - data_pos);
4303  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4304 
4305  /*
4306  * Advance start_ptr over that text, to avoid multiple rescans of
4307  * it if the replace_text contains multiple back-references.
4308  */
4309  start_ptr += chunk_len;
4310  data_pos = pmatch[0].rm_so;
4311  }
4312 
4313  /*
4314  * Copy the replace_text, processing escapes if any are present.
4315  */
4316  if (escape_status > 0)
4318  start_ptr, data_pos);
4319  else
4321 
4322  /* Advance start_ptr and data_pos over the matched text. */
4323  start_ptr += charlen_to_bytelen(start_ptr,
4324  pmatch[0].rm_eo - data_pos);
4325  data_pos = pmatch[0].rm_eo;
4326 
4327  /*
4328  * If we only want to replace one occurrence, we're done.
4329  */
4330  if (n > 0)
4331  break;
4332 
4333  /*
4334  * Advance search position. Normally we start the next search at the
4335  * end of the previous match; but if the match was of zero length, we
4336  * have to advance by one character, or we'd just find the same match
4337  * again.
4338  */
4339  search_start = data_pos;
4340  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4341  search_start++;
4342  }
4343 
4344  /*
4345  * Copy the text to the right of the last match.
4346  */
4347  if (data_pos < data_len)
4348  {
4349  int chunk_len;
4350 
4351  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4352  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4353  }
4354 
4355  ret_text = cstring_to_text_with_len(buf.data, buf.len);
4356  pfree(buf.data);
4357  pfree(data);
4358 
4359  return ret_text;
4360 }
4361 
4362 /*
4363  * split_part
4364  * parse input string based on provided field separator
4365  * return N'th item (1 based, negative counts from end)
4366  */
4367 Datum
4369 {
4370  text *inputstring = PG_GETARG_TEXT_PP(0);
4371  text *fldsep = PG_GETARG_TEXT_PP(1);
4372  int fldnum = PG_GETARG_INT32(2);
4373  int inputstring_len;
4374  int fldsep_len;
4376  char *start_ptr;
4377  char *end_ptr;
4378  text *result_text;
4379  bool found;
4380 
4381  /* field number is 1 based */
4382  if (fldnum == 0)
4383  ereport(ERROR,
4384  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4385  errmsg("field position must not be zero")));
4386 
4387  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4388  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4389 
4390  /* return empty string for empty input string */
4391  if (inputstring_len < 1)
4393 
4394  /* handle empty field separator */
4395  if (fldsep_len < 1)
4396  {
4397  /* if first or last field, return input string, else empty string */
4398  if (fldnum == 1 || fldnum == -1)
4399  PG_RETURN_TEXT_P(inputstring);
4400  else
4402  }
4403 
4404  /* find the first field separator */
4405  text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4406 
4407  found = text_position_next(&state);
4408 
4409  /* special case if fldsep not found at all */
4410  if (!found)
4411  {
4413  /* if first or last field, return input string, else empty string */
4414  if (fldnum == 1 || fldnum == -1)
4415  PG_RETURN_TEXT_P(inputstring);
4416  else
4418  }
4419 
4420  /*
4421  * take care of a negative field number (i.e. count from the right) by
4422  * converting to a positive field number; we need total number of fields
4423  */
4424  if (fldnum < 0)
4425  {
4426  /* we found a fldsep, so there are at least two fields */
4427  int numfields = 2;
4428 
4429  while (text_position_next(&state))
4430  numfields++;
4431 
4432  /* special case of last field does not require an extra pass */
4433  if (fldnum == -1)
4434  {
4435  start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4436  end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4439  end_ptr - start_ptr));
4440  }
4441 
4442  /* else, convert fldnum to positive notation */
4443  fldnum += numfields + 1;
4444 
4445  /* if nonexistent field, return empty string */
4446  if (fldnum <= 0)
4447  {
4450  }
4451 
4452  /* reset to pointing at first match, but now with positive fldnum */
4454  found = text_position_next(&state);
4455  Assert(found);
4456  }
4457 
4458  /* identify bounds of first field */
4459  start_ptr = VARDATA_ANY(inputstring);
4460  end_ptr = text_position_get_match_ptr(&state);
4461 
4462  while (found && --fldnum > 0)
4463  {
4464  /* identify bounds of next field */
4465  start_ptr = end_ptr + fldsep_len;
4466  found = text_position_next(&state);
4467  if (found)
4468  end_ptr = text_position_get_match_ptr(&state);
4469  }
4470 
4472 
4473  if (fldnum > 0)
4474  {
4475  /* N'th field separator not found */
4476  /* if last field requested, return it, else empty string */
4477  if (fldnum == 1)
4478  {
4479  int last_len = start_ptr - VARDATA_ANY(inputstring);
4480 
4481  result_text = cstring_to_text_with_len(start_ptr,
4482  inputstring_len - last_len);
4483  }
4484  else
4485  result_text = cstring_to_text("");
4486  }
4487  else
4488  {
4489  /* non-last field requested */
4490  result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4491  }
4492 
4493  PG_RETURN_TEXT_P(result_text);
4494 }
4495 
4496 /*
4497  * Convenience function to return true when two text params are equal.
4498  */
4499 static bool
4501 {
4503  collid,
4504  PointerGetDatum(txt1),
4505  PointerGetDatum(txt2)));
4506 }
4507 
4508 /*
4509  * text_to_array
4510  * parse input string and return text array of elements,
4511  * based on provided field separator
4512  */
4513 Datum
4515 {
4516  SplitTextOutputData tstate;
4517 
4518  /* For array output, tstate should start as all zeroes */
4519  memset(&tstate, 0, sizeof(tstate));
4520 
4521  if (!split_text(fcinfo, &tstate))
4522  PG_RETURN_NULL();
4523 
4524  if (tstate.astate == NULL)
4526 
4529 }
4530 
4531 /*
4532  * text_to_array_null
4533  * parse input string and return text array of elements,
4534  * based on provided field separator and null string
4535  *
4536  * This is a separate entry point only to prevent the regression tests from
4537  * complaining about different argument sets for the same internal function.
4538  */
4539 Datum
4541 {
4542  return text_to_array(fcinfo);
4543 }
4544 
4545 /*
4546  * text_to_table
4547  * parse input string and return table of elements,
4548  * based on provided field separator
4549  */
4550 Datum
4552 {
4553  ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4554  SplitTextOutputData tstate;
4555 
4556  tstate.astate = NULL;
4558  tstate.tupstore = rsi->setResult;
4559  tstate.tupdesc = rsi->setDesc;
4560 
4561  (void) split_text(fcinfo, &tstate);
4562 
4563  return (Datum) 0;
4564 }
4565 
4566 /*
4567  * text_to_table_null
4568  * parse input string and return table of elements,
4569  * based on provided field separator and null string
4570  *
4571  * This is a separate entry point only to prevent the regression tests from
4572  * complaining about different argument sets for the same internal function.
4573  */
4574 Datum
4576 {
4577  return text_to_table(fcinfo);
4578 }
4579 
4580 /*
4581  * Common code for text_to_array, text_to_array_null, text_to_table
4582  * and text_to_table_null functions.
4583  *
4584  * These are not strict so we have to test for null inputs explicitly.
4585  * Returns false if result is to be null, else returns true.
4586  *
4587  * Note that if the result is valid but empty (zero elements), we return
4588  * without changing *tstate --- caller must handle that case, too.
4589  */
4590 static bool
4592 {
4593  text *inputstring;
4594  text *fldsep;
4595  text *null_string;
4596  Oid collation = PG_GET_COLLATION();
4597  int inputstring_len;
4598  int fldsep_len;
4599  char *start_ptr;
4600  text *result_text;
4601 
4602  /* when input string is NULL, then result is NULL too */
4603  if (PG_ARGISNULL(0))
4604  return false;
4605 
4606  inputstring = PG_GETARG_TEXT_PP(0);
4607 
4608  /* fldsep can be NULL */
4609  if (!PG_ARGISNULL(1))
4610  fldsep = PG_GETARG_TEXT_PP(1);
4611  else
4612  fldsep = NULL;
4613 
4614  /* null_string can be NULL or omitted */
4615  if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4616  null_string = PG_GETARG_TEXT_PP(2);
4617  else
4618  null_string = NULL;
4619 
4620  if (fldsep != NULL)
4621  {
4622  /*
4623  * Normal case with non-null fldsep. Use the text_position machinery
4624  * to search for occurrences of fldsep.
4625  */
4627 
4628  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4629  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4630 
4631  /* return empty set for empty input string */
4632  if (inputstring_len < 1)
4633  return true;
4634 
4635  /* empty field separator: return input string as a one-element set */
4636  if (fldsep_len < 1)
4637  {
4638  split_text_accum_result(tstate, inputstring,
4639  null_string, collation);
4640  return true;
4641  }
4642 
4643  text_position_setup(inputstring, fldsep, collation, &state);
4644 
4645  start_ptr = VARDATA_ANY(inputstring);
4646 
4647  for (;;)
4648  {
4649  bool found;
4650  char *end_ptr;
4651  int chunk_len;
4652 
4654 
4655  found = text_position_next(&state);
4656  if (!found)
4657  {
4658  /* fetch last field */
4659  chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4660  end_ptr = NULL; /* not used, but some compilers complain */
4661  }
4662  else
4663  {
4664  /* fetch non-last field */
4665  end_ptr = text_position_get_match_ptr(&state);
4666  chunk_len = end_ptr - start_ptr;
4667  }
4668 
4669  /* build a temp text datum to pass to split_text_accum_result */
4670  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4671 
4672  /* stash away this field */
4673  split_text_accum_result(tstate, result_text,
4674  null_string, collation);
4675 
4676  pfree(result_text);
4677 
4678  if (!found)
4679  break;
4680 
4681  start_ptr = end_ptr + fldsep_len;
4682  }
4683 
4685  }
4686  else
4687  {
4688  /*
4689  * When fldsep is NULL, each character in the input string becomes a
4690  * separate element in the result set. The separator is effectively
4691  * the space between characters.
4692  */
4693  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4694 
4695  start_ptr = VARDATA_ANY(inputstring);
4696 
4697  while (inputstring_len > 0)
4698  {
4699  int chunk_len = pg_mblen(start_ptr);
4700 
4702 
4703  /* build a temp text datum to pass to split_text_accum_result */
4704  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4705 
4706  /* stash away this field */
4707  split_text_accum_result(tstate, result_text,
4708  null_string, collation);
4709 
4710  pfree(result_text);
4711 
4712  start_ptr += chunk_len;
4713  inputstring_len -= chunk_len;
4714  }
4715  }
4716 
4717  return true;
4718 }
4719 
4720 /*
4721  * Add text item to result set (table or array).
4722  *
4723  * This is also responsible for checking to see if the item matches
4724  * the null_string, in which case we should emit NULL instead.
4725  */
4726 static void
4728  text *field_value,
4729  text *null_string,
4730  Oid collation)
4731 {
4732  bool is_null = false;
4733 
4734  if (null_string && text_isequal(field_value, null_string, collation))
4735  is_null = true;
4736 
4737  if (tstate->tupstore)
4738  {
4739  Datum values[1];
4740  bool nulls[1];
4741 
4742  values[0] = PointerGetDatum(field_value);
4743  nulls[0] = is_null;
4744 
4746  tstate->tupdesc,
4747  values,
4748  nulls);
4749  }
4750  else
4751  {
4752  tstate->astate = accumArrayResult(tstate->astate,
4753  PointerGetDatum(field_value),
4754  is_null,
4755  TEXTOID,
4757  }
4758 }
4759 
4760 /*
4761  * array_to_text
4762  * Concatenate the C-string representations of the input array's elements,
4763  * using the provided field separator.
4764  */
4765 Datum
4767 {
4769  char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4770 
4771  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4772 }
4773 
4774 /*
4775  * array_to_text_null
4776  * concatenate Cstring representation of input array elements
4777  * using provided field separator and null string
4778  *
4779  * This version is not strict so we have to test for null inputs explicitly.
4780  */
4781 Datum
4783 {
4784  ArrayType *v;
4785  char *fldsep;
4786  char *null_string;
4787 
4788  /* returns NULL when first or second parameter is NULL */
4789  if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4790  PG_RETURN_NULL();
4791 
4792  v = PG_GETARG_ARRAYTYPE_P(0);
4793  fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4794 
4795  /* NULL null string is passed through as a null pointer */
4796  if (!PG_ARGISNULL(2))
4797  null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4798  else
4799  null_string = NULL;
4800 
4801  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4802 }
4803 
4804 /*
4805  * common code for array_to_text and array_to_text_null functions
4806  */
4807 static text *
4809  const char *fldsep, const char *null_string)
4810 {
4811  text *result;
4812  int nitems,
4813  *dims,
4814  ndims;
4815  Oid element_type;
4816  int typlen;
4817  bool typbyval;
4818  char typalign;
4820  bool printed = false;
4821  char *p;
4822  bits8 *bitmap;
4823  int bitmask;
4824  int i;
4825  ArrayMetaState *my_extra;
4826 
4827  ndims = ARR_NDIM(v);
4828  dims = ARR_DIMS(v);
4829  nitems = ArrayGetNItems(ndims, dims);
4830 
4831  /* if there are no elements, return an empty string */
4832  if (nitems == 0)
4833  return cstring_to_text_with_len("", 0);
4834 
4835  element_type = ARR_ELEMTYPE(v);
4836  initStringInfo(&buf);
4837 
4838  /*
4839  * We arrange to look up info about element type, including its output
4840  * conversion proc, only once per series of calls, assuming the element
4841  * type doesn't change underneath us.
4842  */
4843  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4844  if (my_extra == NULL)
4845  {
4846  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4847  sizeof(ArrayMetaState));
4848  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4849  my_extra->element_type = ~element_type;
4850  }
4851 
4852  if (my_extra->element_type != element_type)
4853  {
4854  /*
4855  * Get info about element type, including its output conversion proc
4856  */
4857  get_type_io_data(element_type, IOFunc_output,
4858  &my_extra->typlen, &my_extra->typbyval,
4859  &my_extra->typalign, &my_extra->typdelim,
4860  &my_extra->typioparam, &my_extra->typiofunc);
4861  fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4862  fcinfo->flinfo->fn_mcxt);
4863  my_extra->element_type = element_type;
4864  }
4865  typlen = my_extra->typlen;
4866  typbyval = my_extra->typbyval;
4867  typalign = my_extra->typalign;
4868 
4869  p = ARR_DATA_PTR(v);
4870  bitmap = ARR_NULLBITMAP(v);
4871  bitmask = 1;
4872 
4873  for (i = 0; i < nitems; i++)
4874  {
4875  Datum itemvalue;
4876  char *value;
4877 
4878  /* Get source element, checking for NULL */
4879  if (bitmap && (*bitmap & bitmask) == 0)
4880  {
4881  /* if null_string is NULL, we just ignore null elements */
4882  if (null_string != NULL)
4883  {
4884  if (printed)
4885  appendStringInfo(&buf, "%s%s", fldsep, null_string);
4886  else
4887  appendStringInfoString(&buf, null_string);
4888  printed = true;
4889  }
4890  }
4891  else
4892  {
4893  itemvalue = fetch_att(p, typbyval, typlen);
4894 
4895  value = OutputFunctionCall(&my_extra->proc, itemvalue);
4896 
4897  if (printed)
4898  appendStringInfo(&buf, "%s%s", fldsep, value);
4899  else
4901  printed = true;
4902 
4903  p = att_addlength_pointer(p, typlen, p);
4904  p = (char *) att_align_nominal(p, typalign);
4905  }
4906 
4907  /* advance bitmap pointer if any */
4908  if (bitmap)
4909  {
4910  bitmask <<= 1;
4911  if (bitmask == 0x100)
4912  {
4913  bitmap++;
4914  bitmask = 1;
4915  }
4916  }
4917  }
4918 
4919  result = cstring_to_text_with_len(buf.data, buf.len);
4920  pfree(buf.data);
4921 
4922  return result;
4923 }
4924 
4925 /*
4926  * Workhorse for to_bin, to_oct, and to_hex. Note that base must be > 1 and <=
4927  * 16.
4928  */
4929 static inline text *
4930 convert_to_base(uint64 value, int base)
4931 {
4932  const char *digits = "0123456789abcdef";
4933 
4934  /* We size the buffer for to_bin's longest possible return value. */
4935  char buf[sizeof(uint64) * BITS_PER_BYTE];
4936  char *const end = buf + sizeof(buf);
4937  char *ptr = end;
4938 
4939  Assert(base > 1);
4940  Assert(base <= 16);
4941 
4942  do
4943  {
4944  *--ptr = digits[value % base];
4945  value /= base;
4946  } while (ptr > buf && value);
4947 
4948  return cstring_to_text_with_len(ptr, end - ptr);
4949 }
4950 
4951 /*
4952  * Convert an integer to a string containing a base-2 (binary) representation
4953  * of the number.
4954  */
4955 Datum
4957 {
4958  uint64 value = (uint32) PG_GETARG_INT32(0);
4959 
4961 }
4962 Datum
4964 {
4965  uint64 value = (uint64) PG_GETARG_INT64(0);
4966 
4968 }
4969 
4970 /*
4971  * Convert an integer to a string containing a base-8 (oct) representation of
4972  * the number.
4973  */
4974 Datum
4976 {
4977  uint64 value = (uint32) PG_GETARG_INT32(0);
4978 
4980 }
4981 Datum
4983 {
4984  uint64 value = (uint64) PG_GETARG_INT64(0);
4985 
4987 }
4988 
4989 /*
4990  * Convert an integer to a string containing a base-16 (hex) representation of
4991  * the number.
4992  */
4993 Datum
4995 {
4996  uint64 value = (uint32) PG_GETARG_INT32(0);
4997 
4999 }
5000 Datum
5002 {
5003  uint64 value = (uint64) PG_GETARG_INT64(0);
5004 
5006 }
5007 
5008 /*
5009  * Return the size of a datum, possibly compressed
5010  *
5011  * Works on any data type
5012  */
5013 Datum
5015 {
5017  int32 result;
5018  int typlen;
5019 
5020  /* On first call, get the input type's typlen, and save at *fn_extra */
5021  if (fcinfo->flinfo->fn_extra == NULL)
5022  {
5023  /* Lookup the datatype of the supplied argument */
5024  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5025 
5026  typlen = get_typlen(argtypeid);
5027  if (typlen == 0) /* should not happen */
5028  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5029 
5030  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5031  sizeof(int));
5032  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5033  }
5034  else
5035  typlen = *((int *) fcinfo->flinfo->fn_extra);
5036 
5037  if (typlen == -1)
5038  {
5039  /* varlena type, possibly toasted */
5040  result = toast_datum_size(value);
5041  }
5042  else if (typlen == -2)
5043  {
5044  /* cstring */
5045  result = strlen(DatumGetCString(value)) + 1;
5046  }
5047  else
5048  {
5049  /* ordinary fixed-width type */
5050  result = typlen;
5051  }
5052 
5053  PG_RETURN_INT32(result);
5054 }
5055 
5056 /*
5057  * Return the compression method stored in the compressed attribute. Return
5058  * NULL for non-varlena types or uncompressed data.
5059  */
5060 Datum
5062 {
5063  int typlen;
5064  char *result;
5065  ToastCompressionId cmid;
5066 
5067  /* On first call, get the input type's typlen, and save at *fn_extra */
5068  if (fcinfo->flinfo->fn_extra == NULL)
5069  {
5070  /* Lookup the datatype of the supplied argument */
5071  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5072 
5073  typlen = get_typlen(argtypeid);
5074  if (typlen == 0) /* should not happen */
5075  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5076 
5077  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5078  sizeof(int));
5079  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5080  }
5081  else
5082  typlen = *((int *) fcinfo->flinfo->fn_extra);
5083 
5084  if (typlen != -1)
5085  PG_RETURN_NULL();
5086 
5087  /* get the compression method id stored in the compressed varlena */
5088  cmid = toast_get_compression_id((struct varlena *)
5090  if (cmid == TOAST_INVALID_COMPRESSION_ID)
5091  PG_RETURN_NULL();
5092 
5093  /* convert compression method id to compression method name */
5094  switch (cmid)
5095  {
5097  result = "pglz";
5098  break;
5100  result = "lz4";
5101  break;
5102  default:
5103  elog(ERROR, "invalid compression method id %d", cmid);
5104  }
5105 
5107 }
5108 
5109 /*
5110  * Return the chunk_id of the on-disk TOASTed value. Return NULL if the value
5111  * is un-TOASTed or not on-disk.
5112  */
5113 Datum
5115 {
5116  int typlen;
5117  struct varlena *attr;
5118  struct varatt_external toast_pointer;
5119 
5120  /* On first call, get the input type's typlen, and save at *fn_extra */
5121  if (fcinfo->flinfo->fn_extra == NULL)
5122  {
5123  /* Lookup the datatype of the supplied argument */
5124  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5125 
5126  typlen = get_typlen(argtypeid);
5127  if (typlen == 0) /* should not happen */
5128  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5129 
5130  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5131  sizeof(int));
5132  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5133  }
5134  else
5135  typlen = *((int *) fcinfo->flinfo->fn_extra);
5136 
5137  if (typlen != -1)
5138  PG_RETURN_NULL();
5139 
5140  attr = (struct varlena *) DatumGetPointer(PG_GETARG_DATUM(0));
5141 
5142  if (!VARATT_IS_EXTERNAL_ONDISK(attr))
5143  PG_RETURN_NULL();
5144 
5145  VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
5146 
5147  PG_RETURN_OID(toast_pointer.va_valueid);
5148 }
5149 
5150 /*
5151  * string_agg - Concatenates values and returns string.
5152  *
5153  * Syntax: string_agg(value text, delimiter text) RETURNS text
5154  *
5155  * Note: Any NULL values are ignored. The delimiter supplied with the first
5156  * value does not appear in the result (the final function strips it), and
5157  * on subsequent calls the delimiter precedes the associated value.
5158  */
5159 
5160 /* subroutine to initialize state */
5161 static StringInfo
5163 {
5164  StringInfo state;
5165  MemoryContext aggcontext;
5166  MemoryContext oldcontext;
5167 
5168  if (!AggCheckCallContext(fcinfo, &aggcontext))
5169  {
5170  /* cannot be called directly because of internal-type argument */
5171  elog(ERROR, "string_agg_transfn called in non-aggregate context");
5172  }
5173 
5174  /*
5175  * Create state in aggregate context. It'll stay there across subsequent
5176  * calls.
5177  */
5178  oldcontext = MemoryContextSwitchTo(aggcontext);
5179  state = makeStringInfo();
5180  MemoryContextSwitchTo(oldcontext);
5181 
5182  return state;
5183 }
5184 
5185 Datum
5187 {
5188  StringInfo state;
5189 
5190  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5191 
5192  /* Append the value unless null, preceding it with the delimiter. */
5193  if (!PG_ARGISNULL(1))
5194  {
5196  bool isfirst = false;
5197 
5198  /*
5199  * You might think we can just throw away the first delimiter, however
5200  * we must keep it as we may be a parallel worker doing partial
5201  * aggregation building a state to send to the main process. We need
5202  * to keep the delimiter of every aggregation so that the combine
5203  * function can properly join up the strings of two separately
5204  * partially aggregated results. The first delimiter is only stripped
5205  * off in the final function. To know how much to strip off the front
5206  * of the string, we store the length of the first delimiter in the
5207  * StringInfo's cursor field, which we don't otherwise need here.
5208  */
5209  if (state == NULL)
5210  {
5211  state = makeStringAggState(fcinfo);
5212  isfirst = true;
5213  }
5214 
5215  if (!PG_ARGISNULL(2))
5216  {
5217  text *delim = PG_GETARG_TEXT_PP(2);
5218 
5219  appendStringInfoText(state, delim);
5220  if (isfirst)
5221  state->cursor = VARSIZE_ANY_EXHDR(delim);
5222  }
5223 
5225  }
5226 
5227  /*
5228  * The transition type for string_agg() is declared to be "internal",
5229  * which is a pass-by-value type the same size as a pointer.
5230  */
5231  if (state)
5233  PG_RETURN_NULL();
5234 }
5235 
5236 /*
5237  * string_agg_combine
5238  * Aggregate combine function for string_agg(text) and string_agg(bytea)
5239  */
5240 Datum
5242 {
5243  StringInfo state1;
5244  StringInfo state2;
5245  MemoryContext agg_context;
5246 
5247  if (!AggCheckCallContext(fcinfo, &agg_context))
5248  elog(ERROR, "aggregate function called in non-aggregate context");
5249 
5250  state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5251  state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1);
5252 
5253  if (state2 == NULL)
5254  {
5255  /*
5256  * NULL state2 is easy, just return state1, which we know is already
5257  * in the agg_context
5258  */
5259  if (state1 == NULL)
5260  PG_RETURN_NULL();
5261  PG_RETURN_POINTER(state1);
5262  }
5263 
5264  if (state1 == NULL)
5265  {
5266  /* We must copy state2's data into the agg_context */
5267  MemoryContext old_context;
5268 
5269  old_context = MemoryContextSwitchTo(agg_context);
5270  state1 = makeStringAggState(fcinfo);
5271  appendBinaryStringInfo(state1, state2->data, state2->len);
5272  state1->cursor = state2->cursor;
5273  MemoryContextSwitchTo(old_context);
5274  }
5275  else if (state2->len > 0)
5276  {
5277  /* Combine ... state1->cursor does not change in this case */
5278  appendBinaryStringInfo(state1, state2->data, state2->len);
5279  }
5280 
5281  PG_RETURN_POINTER(state1);
5282 }
5283 
5284 /*
5285  * string_agg_serialize
5286  * Aggregate serialization function for string_agg(text) and string_agg(bytea)
5287  *
5288  * This is strict, so we need not handle NULL input
5289  */
5290 Datum
5292 {
5293  StringInfo state;
5295  bytea *result;
5296 
5297  /* cannot be called directly because of internal-type argument */
5298  Assert(AggCheckCallContext(fcinfo, NULL));
5299 
5301 
5302  pq_begintypsend(&buf);
5303 
5304  /* cursor */
5305  pq_sendint(&buf, state->cursor, 4);
5306 
5307  /* data */
5308  pq_sendbytes(&buf, state->data, state->len);
5309 
5310  result = pq_endtypsend(&buf);
5311 
5312  PG_RETURN_BYTEA_P(result);
5313 }
5314 
5315 /*
5316  * string_agg_deserialize
5317  * Aggregate deserialization function for string_agg(text) and string_agg(bytea)
5318  *
5319  * This is strict, so we need not handle NULL input
5320  */
5321 Datum
5323 {
5324  bytea *sstate;
5325  StringInfo result;
5327  char *data;
5328  int datalen;
5329 
5330  /* cannot be called directly because of internal-type argument */
5331  Assert(AggCheckCallContext(fcinfo, NULL));
5332 
5333  sstate = PG_GETARG_BYTEA_PP(0);
5334 
5335  /*
5336  * Initialize a StringInfo so that we can "receive" it using the standard
5337  * recv-function infrastructure.
5338  */
5340  VARSIZE_ANY_EXHDR(sstate));
5341 
5342  result = makeStringAggState(fcinfo);
5343 
5344  /* cursor */
5345  result->cursor = pq_getmsgint(&buf, 4);
5346 
5347  /* data */
5348  datalen = VARSIZE_ANY_EXHDR(sstate) - 4;
5349  data = (char *) pq_getmsgbytes(&buf, datalen);
5350  appendBinaryStringInfo(result, data, datalen);
5351 
5352  pq_getmsgend(&buf);
5353 
5354  PG_RETURN_POINTER(result);
5355 }
5356 
5357 Datum
5359 {
5360  StringInfo state;
5361 
5362  /* cannot be called directly because of internal-type argument */
5363  Assert(AggCheckCallContext(fcinfo, NULL));
5364 
5365  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5366 
5367  if (state != NULL)
5368  {
5369  /* As per comment in transfn, strip data before the cursor position */
5371  state->len - state->cursor));
5372  }
5373  else
5374  PG_RETURN_NULL();
5375 }
5376 
5377 /*
5378  * Prepare cache with fmgr info for the output functions of the datatypes of
5379  * the arguments of a concat-like function, beginning with argument "argidx".
5380  * (Arguments before that will have corresponding slots in the resulting
5381  * FmgrInfo array, but we don't fill those slots.)
5382  */
5383 static FmgrInfo *
5385 {
5386  FmgrInfo *foutcache;
5387  int i;
5388 
5389  /* We keep the info in fn_mcxt so it survives across calls */
5390  foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5391  PG_NARGS() * sizeof(FmgrInfo));
5392 
5393  for (i = argidx; i < PG_NARGS(); i++)
5394  {
5395  Oid valtype;
5396  Oid typOutput;
5397  bool typIsVarlena;
5398 
5399  valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5400  if (!OidIsValid(valtype))
5401  elog(ERROR, "could not determine data type of concat() input");
5402 
5403  getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5404  fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5405  }
5406 
5407  fcinfo->flinfo->fn_extra = foutcache;
5408 
5409  return foutcache;
5410 }
5411 
5412 /*
5413  * Implementation of both concat() and concat_ws().
5414  *
5415  * sepstr is the separator string to place between values.
5416  * argidx identifies the first argument to concatenate (counting from zero);
5417  * note that this must be constant across any one series of calls.
5418  *
5419  * Returns NULL if result should be NULL, else text value.
5420  */
5421 static text *
5422 concat_internal(const char *sepstr, int argidx,
5423  FunctionCallInfo fcinfo)
5424 {
5425  text *result;
5427  FmgrInfo *foutcache;
5428  bool first_arg = true;
5429  int i;
5430 
5431  /*
5432  * concat(VARIADIC some-array) is essentially equivalent to
5433  * array_to_text(), ie concat the array elements with the given separator.
5434  * So we just pass the case off to that code.
5435  */
5436  if (get_fn_expr_variadic(fcinfo->flinfo))
5437  {
5438  ArrayType *arr;
5439 
5440  /* Should have just the one argument */
5441  Assert(argidx == PG_NARGS() - 1);
5442 
5443  /* concat(VARIADIC NULL) is defined as NULL */
5444  if (PG_ARGISNULL(argidx))
5445  return NULL;
5446 
5447  /*
5448  * Non-null argument had better be an array. We assume that any call
5449  * context that could let get_fn_expr_variadic return true will have
5450  * checked that a VARIADIC-labeled parameter actually is an array. So
5451  * it should be okay to just Assert that it's an array rather than
5452  * doing a full-fledged error check.
5453  */
5455 
5456  /* OK, safe to fetch the array value */
5457  arr = PG_GETARG_ARRAYTYPE_P(argidx);
5458 
5459  /*
5460  * And serialize the array. We tell array_to_text to ignore null
5461  * elements, which matches the behavior of the loop below.
5462  */
5463  return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5464  }
5465 
5466  /* Normal case without explicit VARIADIC marker */
5467  initStringInfo(&str);
5468 
5469  /* Get output function info, building it if first time through */
5470  foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5471  if (foutcache == NULL)
5472  foutcache = build_concat_foutcache(fcinfo, argidx);
5473 
5474  for (i = argidx; i < PG_NARGS(); i++)
5475  {
5476  if (!PG_ARGISNULL(i))
5477  {
5479 
5480  /* add separator if appropriate */
5481  if (first_arg)
5482  first_arg = false;
5483  else
5484  appendStringInfoString(&str, sepstr);
5485 
5486  /* call the appropriate type output function, append the result */
5488  OutputFunctionCall(&foutcache[i], value));
5489  }
5490  }
5491 
5492  result = cstring_to_text_with_len(str.data, str.len);
5493  pfree(str.data);
5494 
5495  return result;
5496 }
5497 
5498 /*
5499  * Concatenate all arguments. NULL arguments are ignored.
5500  */
5501 Datum
5503 {
5504  text *result;
5505 
5506  result = concat_internal("", 0, fcinfo);
5507  if (result == NULL)
5508  PG_RETURN_NULL();
5509  PG_RETURN_TEXT_P(result);
5510 }
5511 
5512 /*
5513  * Concatenate all but first argument value with separators. The first
5514  * parameter is used as the separator. NULL arguments are ignored.
5515  */
5516 Datum
5518 {
5519  char *sep;
5520  text *result;
5521 
5522  /* return NULL when separator is NULL */
5523  if (PG_ARGISNULL(0))
5524  PG_RETURN_NULL();
5526 
5527  result = concat_internal(sep, 1, fcinfo);
5528  if (result == NULL)
5529  PG_RETURN_NULL();
5530  PG_RETURN_TEXT_P(result);
5531 }
5532 
5533 /*
5534  * Return the first n characters of the string. When n is negative,
5535  * return all but the last |n| characters.
5536  */
5537 Datum
5539 {
5540  int n = PG_GETARG_INT32(1);
5541 
5542  if (n < 0)
5543  {
5544  text *str = PG_GETARG_TEXT_PP(0);
5545  const char *p = VARDATA_ANY(str);
5546  int len = VARSIZE_ANY_EXHDR(str);
5547  int rlen;
5548 
5549  n = pg_mbstrlen_with_len(p, len) + n;
5550  rlen = pg_mbcharcliplen(p, len, n);
5552  }
5553  else
5555 }
5556 
5557 /*
5558  * Return last n characters in the string. When n is negative,
5559  * return all but first |n| characters.
5560  */
5561 Datum
5563 {
5564  text *str = PG_GETARG_TEXT_PP(0);
5565  const char *p = VARDATA_ANY(str);
5566  int len = VARSIZE_ANY_EXHDR(str);
5567  int n = PG_GETARG_INT32(1);
5568  int off;
5569 
5570  if (n < 0)
5571  n = -n;
5572  else
5573  n = pg_mbstrlen_with_len(p, len) - n;
5574  off = pg_mbcharcliplen(p, len, n);
5575 
5577 }
5578 
5579 /*
5580  * Return reversed string
5581  */
5582 Datum
5584 {
5585  text *str = PG_GETARG_TEXT_PP(0);
5586  const char *p = VARDATA_ANY(str);
5587  int len = VARSIZE_ANY_EXHDR(str);
5588  const char *endp = p + len;
5589  text *result;
5590  char *dst;
5591 
5592  result = palloc(len + VARHDRSZ);
5593  dst = (char *) VARDATA(result) + len;
5594  SET_VARSIZE(result, len + VARHDRSZ);
5595 
5597  {
5598  /* multibyte version */
5599  while (p < endp)
5600  {
5601  int sz;
5602 
5603  sz = pg_mblen(p);
5604  dst -= sz;
5605  memcpy(dst, p, sz);
5606  p += sz;
5607  }
5608  }
5609  else
5610  {
5611  /* single byte version */
5612  while (p < endp)
5613  *(--dst) = *p++;
5614  }
5615 
5616  PG_RETURN_TEXT_P(result);
5617 }
5618 
5619 
/*
 * Helper macros for text_format()
 */

/* Flag bit: set when the minus flag is present in a format specifier */
#define TEXT_FORMAT_FLAG_MINUS	0x0001

/*
 * Pre-increment "ptr" and raise an "unterminated format() type specifier"
 * error if it has thereby reached (or passed) "end_ptr".  "ptr" must be a
 * modifiable lvalue; it is evaluated exactly once.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (!(++(ptr) < (end_ptr))) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5633 
5634 /*
5635  * Returns a formatted string
5636  */
5637 Datum
5639 {
5640  text *fmt;
5642  const char *cp;
5643  const char *start_ptr;
5644  const char *end_ptr;
5645  text *result;
5646  int arg;
5647  bool funcvariadic;
5648  int nargs;
5649  Datum *elements = NULL;
5650  bool *nulls = NULL;
5651  Oid element_type = InvalidOid;
5652  Oid prev_type = InvalidOid;
5653  Oid prev_width_type = InvalidOid;
5654  FmgrInfo typoutputfinfo;
5655  FmgrInfo typoutputinfo_width;
5656 
5657  /* When format string is null, immediately return null */
5658  if (PG_ARGISNULL(0))
5659  PG_RETURN_NULL();
5660 
5661  /* If argument is marked VARIADIC, expand array into elements */
5662  if (get_fn_expr_variadic(fcinfo->flinfo))
5663  {
5664  ArrayType *arr;
5665  int16 elmlen;
5666  bool elmbyval;
5667  char elmalign;
5668  int nitems;
5669 
5670  /* Should have just the one argument */
5671  Assert(PG_NARGS() == 2);
5672 
5673  /* If argument is NULL, we treat it as zero-length array */
5674  if (PG_ARGISNULL(1))
5675  nitems = 0;
5676  else
5677  {
5678  /*
5679  * Non-null argument had better be an array. We assume that any
5680  * call context that could let get_fn_expr_variadic return true
5681  * will have checked that a VARIADIC-labeled parameter actually is
5682  * an array. So it should be okay to just Assert that it's an
5683  * array rather than doing a full-fledged error check.
5684  */
5686 
5687  /* OK, safe to fetch the array value */
5688  arr = PG_GETARG_ARRAYTYPE_P(1);
5689 
5690  /* Get info about array element type */
5691  element_type = ARR_ELEMTYPE(arr);
5692  get_typlenbyvalalign(element_type,
5693  &elmlen, &elmbyval, &elmalign);
5694 
5695  /* Extract all array elements */
5696  deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5697  &elements, &nulls, &nitems);
5698  }
5699 
5700  nargs = nitems + 1;
5701  funcvariadic = true;
5702  }
5703  else
5704  {
5705  /* Non-variadic case, we'll process the arguments individually */
5706  nargs = PG_NARGS();
5707  funcvariadic = false;
5708  }
5709 
5710  /* Setup for main loop. */
5711  fmt = PG_GETARG_TEXT_PP(0);
5712  start_ptr = VARDATA_ANY(fmt);
5713  end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5714  initStringInfo(&str);
5715  arg = 1; /* next argument position to print */
5716 
5717  /* Scan format string, looking for conversion specifiers. */
5718  for (cp = start_ptr; cp < end_ptr; cp++)
5719  {
5720  int argpos;
5721  int widthpos;
5722  int flags;
5723  int width;
5724  Datum value;
5725  bool isNull;
5726  Oid typid;
5727 
5728  /*
5729  * If it's not the start of a conversion specifier, just copy it to
5730  * the output buffer.
5731  */
5732  if (*cp != '%')
5733  {
5735  continue;
5736  }
5737 
5738  ADVANCE_PARSE_POINTER(cp, end_ptr);
5739 
5740  /* Easy case: %% outputs a single % */
5741  if (*cp == '%')
5742  {
5744  continue;
5745  }
5746 
5747  /* Parse the optional portions of the format specifier */
5748  cp = text_format_parse_format(cp, end_ptr,
5749  &argpos, &widthpos,
5750  &flags, &width);
5751 
5752  /*
5753  * Next we should see the main conversion specifier. Whether or not
5754  * an argument position was present, it's known that at least one
5755  * character remains in the string at this point. Experience suggests
5756  * that it's worth checking that that character is one of the expected
5757  * ones before we try to fetch arguments, so as to produce the least
5758  * confusing response to a mis-formatted specifier.
5759  */
5760  if (strchr("sIL", *cp) == NULL)
5761  ereport(ERROR,
5762  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5763  errmsg("unrecognized format() type specifier \"%.*s\"",
5764  pg_mblen(cp), cp),
5765  errhint("For a single \"%%\" use \"%%%%\".")));
5766 
5767  /* If indirect width was specified, get its value */
5768  if (widthpos >= 0)
5769  {
5770  /* Collect the specified or next argument position */
5771  if (widthpos > 0)
5772  arg = widthpos;
5773  if (arg >= nargs)
5774  ereport(ERROR,
5775  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5776  errmsg("too few arguments for format()")));
5777 
5778  /* Get the value and type of the selected argument */
5779  if (!funcvariadic)
5780  {
5782  isNull = PG_ARGISNULL(arg);
5783  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5784  }
5785  else
5786  {
5787  value = elements[arg - 1];
5788  isNull = nulls[arg - 1];
5789  typid = element_type;
5790  }
5791  if (!OidIsValid(typid))
5792  elog(ERROR, "could not determine data type of format() input");
5793 
5794  arg++;
5795 
5796  /* We can treat NULL width the same as zero */
5797  if (isNull)
5798  width = 0;
5799  else if (typid == INT4OID)
5800  width = DatumGetInt32(value);
5801  else if (typid == INT2OID)
5802  width = DatumGetInt16(value);
5803  else
5804  {
5805  /* For less-usual datatypes, convert to text then to int */
5806  char *str;
5807 
5808  if (typid != prev_width_type)
5809  {
5810  Oid typoutputfunc;
5811  bool typIsVarlena;
5812 
5813  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5814  fmgr_info(typoutputfunc, &typoutputinfo_width);
5815  prev_width_type = typid;
5816  }
5817 
5818  str = OutputFunctionCall(&typoutputinfo_width, value);
5819 
5820  /* pg_strtoint32 will complain about bad data or overflow */
5821  width = pg_strtoint32(str);
5822 
5823  pfree(str);
5824  }
5825  }
5826 
5827  /* Collect the specified or next argument position */
5828  if (argpos > 0)
5829  arg = argpos;
5830  if (arg >= nargs)
5831  ereport(ERROR,
5832  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5833  errmsg("too few arguments for format()")));
5834 
5835  /* Get the value and type of the selected argument */
5836  if (!funcvariadic)
5837  {
5839  isNull = PG_ARGISNULL(arg);
5840  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5841  }
5842  else
5843  {
5844  value = elements[arg - 1];
5845  isNull = nulls[arg - 1];
5846  typid = element_type;
5847  }
5848  if (!OidIsValid(typid))
5849  elog(ERROR, "could not determine data type of format() input");
5850 
5851  arg++;
5852 
5853  /*
5854  * Get the appropriate typOutput function, reusing previous one if
5855  * same type as previous argument. That's particularly useful in the
5856  * variadic-array case, but often saves work even for ordinary calls.
5857  */
5858  if (typid != prev_type)
5859  {
5860  Oid typoutputfunc;
5861  bool typIsVarlena;
5862 
5863  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5864  fmgr_info(typoutputfunc, &typoutputfinfo);
5865  prev_type = typid;
5866  }
5867 
5868  /*
5869  * And now we can format the value.
5870  */
5871  switch (*cp)
5872  {
5873  case 's':
5874  case 'I':
5875  case 'L':
5876  text_format_string_conversion(&str, *cp, &typoutputfinfo,
5877  value, isNull,
5878  flags, width);
5879  break;
5880  default:
5881  /* should not get here, because of previous check */
5882  ereport(ERROR,
5883  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5884  errmsg("unrecognized format() type specifier \"%.*s\"",
5885  pg_mblen(cp), cp),
5886  errhint("For a single \"%%\" use \"%%%%\".")));
5887  break;
5888  }
5889  }
5890 
5891  /* Don't need deconstruct_array results anymore. */
5892  if (elements != NULL)
5893  pfree(elements);
5894  if (nulls != NULL)
5895  pfree(nulls);
5896 
5897  /* Generate results. */
5898  result = cstring_to_text_with_len(str.data, str.len);
5899  pfree(str.data);
5900 
5901  PG_RETURN_TEXT_P(result);
5902 }
5903 
5904 /*
5905  * Parse contiguous digits as a decimal number.
5906  *
5907  * Returns true if some digits could be parsed.
5908  * The value is returned into *value, and *ptr is advanced to the next
5909  * character to be parsed.
5910  *
5911  * Note parsing invariant: at least one character is known available before
5912  * string end (end_ptr) at entry, and this is still true at exit.
5913  */
5914 static bool
5915 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5916 {
5917  bool found = false;
5918  const char *cp = *ptr;
5919  int val = 0;
5920 
5921  while (*cp >= '0' && *cp <= '9')
5922  {
5923  int8 digit = (*cp - '0');
5924 
5925  if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5926  unlikely(pg_add_s32_overflow(val, digit, &val)))
5927  ereport(ERROR,
5928  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5929  errmsg("number is out of range")));
5930  ADVANCE_PARSE_POINTER(cp, end_ptr);
5931  found = true;
5932  }
5933 
5934  *ptr = cp;
5935  *value = val;
5936 
5937  return found;
5938 }
5939 
5940 /*
5941  * Parse a format specifier (generally following the SUS printf spec).
5942  *
5943  * We have already advanced over the initial '%', and we are looking for
5944  * [argpos][flags][width]type (but the type character is not consumed here).
5945  *
5946  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5947  * Output parameters:
5948  * argpos: argument position for value to be printed. -1 means unspecified.
5949  * widthpos: argument position for width. Zero means the argument position
5950  * was unspecified (ie, take the next arg) and -1 means no width
5951  * argument (width was omitted or specified as a constant).
5952  * flags: bitmask of flags.
5953  * width: directly-specified width value. Zero means the width was omitted
5954  * (note it's not necessary to distinguish this case from an explicit
5955  * zero width value).
5956  *
5957  * The function result is the next character position to be parsed, ie, the
5958  * location where the type character is/should be.
5959  *
5960  * Note parsing invariant: at least one character is known available before
5961  * string end (end_ptr) at entry, and this is still true at exit.
5962  */
5963 static const char *
5964 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5965  int *argpos, int *widthpos,
5966  int *flags, int *width)
5967 {
5968  const char *cp = start_ptr;
5969  int n;
5970 
5971  /* set defaults for output parameters */
5972  *argpos = -1;
5973  *widthpos = -1;
5974  *flags = 0;
5975  *width = 0;
5976 
5977  /* try to identify first number */
5978  if (text_format_parse_digits(&cp, end_ptr, &n))
5979  {
5980  if (*cp != '$')
5981  {
5982  /* Must be just a width and a type, so we're done */
5983  *width = n;
5984  return cp;
5985  }
5986  /* The number was argument position */
5987  *argpos = n;
5988  /* Explicit 0 for argument index is immediately refused */
5989  if (n == 0)
5990  ereport(ERROR,
5991  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5992  errmsg("format specifies argument 0, but arguments are numbered from 1")));
5993  ADVANCE_PARSE_POINTER(cp, end_ptr);
5994  }
5995 
5996  /* Handle flags (only minus is supported now) */
5997  while (*cp == '-')
5998  {
5999  *flags |= TEXT_FORMAT_FLAG_MINUS;
6000  ADVANCE_PARSE_POINTER(cp, end_ptr);
6001  }
6002 
6003  if (*cp == '*')
6004  {
6005  /* Handle indirect width */
6006  ADVANCE_PARSE_POINTER(cp, end_ptr);
6007  if (text_format_parse_digits(&cp, end_ptr, &n))
6008  {
6009  /* number in this position must be closed by $ */
6010  if (*cp != '$')
6011  ereport(ERROR,
6012  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6013