PostgreSQL Source Code  git master
varlena.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  * Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/detoast.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "common/hashfn.h"
25 #include "common/int.h"
26 #include "common/unicode_norm.h"
27 #include "funcapi.h"
28 #include "lib/hyperloglog.h"
29 #include "libpq/pqformat.h"
30 #include "miscadmin.h"
31 #include "nodes/execnodes.h"
32 #include "parser/scansup.h"
33 #include "port/pg_bswap.h"
34 #include "regex/regex.h"
35 #include "utils/builtins.h"
36 #include "utils/bytea.h"
37 #include "utils/guc.h"
38 #include "utils/lsyscache.h"
39 #include "utils/memutils.h"
40 #include "utils/pg_locale.h"
41 #include "utils/sortsupport.h"
42 #include "utils/varlena.h"
43 
44 
45 /* GUC variable */
47 
48 typedef struct varlena VarString;
49 
50 /*
51  * State for text_position_* functions.
52  */
53 typedef struct
54 {
55  bool is_multibyte_char_in_char; /* need to check char boundaries? */
56 
57  char *str1; /* haystack string */
58  char *str2; /* needle string */
59  int len1; /* string lengths in bytes */
60  int len2;
61 
62  /* Skip table for Boyer-Moore-Horspool search algorithm: */
63  int skiptablemask; /* mask for ANDing with skiptable subscripts */
64  int skiptable[256]; /* skip distance for given mismatched char */
65 
66  char *last_match; /* pointer to last match in 'str1' */
67 
68  /*
69  * Sometimes we need to convert the byte position of a match to a
70  * character position. These store the last position that was converted,
71  * so that on the next call, we can continue from that point, rather than
72  * count characters from the very beginning.
73  */
74  char *refpoint; /* pointer within original haystack string */
75  int refpos; /* 0-based character offset of the same point */
77 
78 typedef struct
79 {
80  char *buf1; /* 1st string, or abbreviation original string
81  * buf */
82  char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
83  int buflen1; /* Allocated length of buf1 */
84  int buflen2; /* Allocated length of buf2 */
85  int last_len1; /* Length of last buf1 string/strxfrm() input */
86  int last_len2; /* Length of last buf2 string/strxfrm() blob */
87  int last_returned; /* Last comparison result (cache) */
88  bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
89  bool collate_c;
90  Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
91  hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
92  hyperLogLogState full_card; /* Full key cardinality state */
93  double prop_card; /* Required cardinality proportion */
96 
97 /*
98  * Output data for split_text(): we output either to an array or a table.
99  * tupstore and tupdesc must be set up in advance to output to a table.
100  */
101 typedef struct
102 {
107 
108 /*
109  * This should be large enough that most strings will fit, but small enough
110  * that we feel comfortable putting it on the stack
111  */
112 #define TEXTBUFLEN 1024
113 
114 #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
115 #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
116 
117 static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
118 static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
119 static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
120 static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
121 static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
122 static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
123 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
124 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
125 static int32 text_length(Datum str);
126 static text *text_catenate(text *t1, text *t2);
127 static text *text_substring(Datum str,
128  int32 start,
129  int32 length,
130  bool length_not_specified);
131 static text *text_overlay(text *t1, text *t2, int sp, int sl);
132 static int text_position(text *t1, text *t2, Oid collid);
135 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
139 static void check_collation_set(Oid collid);
140 static int text_cmp(text *arg1, text *arg2, Oid collid);
141 static bytea *bytea_catenate(bytea *t1, bytea *t2);
143  int S,
144  int L,
145  bool length_not_specified);
146 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
147 static void appendStringInfoText(StringInfo str, const text *t);
148 static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
149 static void split_text_accum_result(SplitTextOutputData *tstate,
150  text *field_value,
151  text *null_string,
152  Oid collation);
154  const char *fldsep, const char *null_string);
156 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
157  int *value);
158 static const char *text_format_parse_format(const char *start_ptr,
159  const char *end_ptr,
160  int *argpos, int *widthpos,
161  int *flags, int *width);
162 static void text_format_string_conversion(StringInfo buf, char conversion,
163  FmgrInfo *typOutputInfo,
164  Datum value, bool isNull,
165  int flags, int width);
166 static void text_format_append_string(StringInfo buf, const char *str,
167  int flags, int width);
168 
169 
170 /*****************************************************************************
171  * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
172  *****************************************************************************/
173 
174 /*
175  * cstring_to_text
176  *
177  * Create a text value from a null-terminated C string.
178  *
179  * The new text value is freshly palloc'd with a full-size VARHDR.
180  */
181 text *
182 cstring_to_text(const char *s)
183 {
184  return cstring_to_text_with_len(s, strlen(s));
185 }
186 
187 /*
188  * cstring_to_text_with_len
189  *
190  * Same as cstring_to_text except the caller specifies the string length;
191  * the string need not be null_terminated.
192  */
193 text *
194 cstring_to_text_with_len(const char *s, int len)
195 {
196  text *result = (text *) palloc(len + VARHDRSZ);
197 
198  SET_VARSIZE(result, len + VARHDRSZ);
199  memcpy(VARDATA(result), s, len);
200 
201  return result;
202 }
203 
204 /*
205  * text_to_cstring
206  *
207  * Create a palloc'd, null-terminated C string from a text value.
208  *
209  * We support being passed a compressed or toasted text value.
210  * This is a bit bogus since such values shouldn't really be referred to as
211  * "text *", but it seems useful for robustness. If we didn't handle that
212  * case here, we'd need another routine that did, anyway.
213  */
214 char *
216 {
217  /* must cast away the const, unfortunately */
218  text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
219  int len = VARSIZE_ANY_EXHDR(tunpacked);
220  char *result;
221 
222  result = (char *) palloc(len + 1);
223  memcpy(result, VARDATA_ANY(tunpacked), len);
224  result[len] = '\0';
225 
226  if (tunpacked != t)
227  pfree(tunpacked);
228 
229  return result;
230 }
231 
232 /*
233  * text_to_cstring_buffer
234  *
235  * Copy a text value into a caller-supplied buffer of size dst_len.
236  *
237  * The text string is truncated if necessary to fit. The result is
238  * guaranteed null-terminated (unless dst_len == 0).
239  *
240  * We support being passed a compressed or toasted text value.
241  * This is a bit bogus since such values shouldn't really be referred to as
242  * "text *", but it seems useful for robustness. If we didn't handle that
243  * case here, we'd need another routine that did, anyway.
244  */
245 void
246 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
247 {
248  /* must cast away the const, unfortunately */
249  text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
250  size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
251 
252  if (dst_len > 0)
253  {
254  dst_len--;
255  if (dst_len >= src_len)
256  dst_len = src_len;
257  else /* ensure truncation is encoding-safe */
258  dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
259  memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
260  dst[dst_len] = '\0';
261  }
262 
263  if (srcunpacked != src)
264  pfree(srcunpacked);
265 }
266 
267 
268 /*****************************************************************************
269  * USER I/O ROUTINES *
270  *****************************************************************************/
271 
272 
/* Map an ASCII digit character to its numeric value, and back again. */
#define VAL(ch)		((ch) - '0')
#define DIG(num)	((num) + '0')
275 
276 /*
277  * byteain - converts from printable representation of byte array
278  *
279  * Non-printable characters must be passed as '\nnn' (octal) and are
280  * converted to internal form. '\' must be passed as '\\'.
281  * ereport(ERROR, ...) if bad form.
282  *
283  * BUGS:
284  * The input is scanned twice.
285  * The error checking of input is minimal.
286  */
287 Datum
289 {
290  char *inputText = PG_GETARG_CSTRING(0);
291  Node *escontext = fcinfo->context;
292  char *tp;
293  char *rp;
294  int bc;
295  bytea *result;
296 
297  /* Recognize hex input */
298  if (inputText[0] == '\\' && inputText[1] == 'x')
299  {
300  size_t len = strlen(inputText);
301 
302  bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
303  result = palloc(bc);
304  bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
305  escontext);
306  SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
307 
308  PG_RETURN_BYTEA_P(result);
309  }
310 
311  /* Else, it's the traditional escaped style */
312  for (bc = 0, tp = inputText; *tp != '\0'; bc++)
313  {
314  if (tp[0] != '\\')
315  tp++;
316  else if ((tp[0] == '\\') &&
317  (tp[1] >= '0' && tp[1] <= '3') &&
318  (tp[2] >= '0' && tp[2] <= '7') &&
319  (tp[3] >= '0' && tp[3] <= '7'))
320  tp += 4;
321  else if ((tp[0] == '\\') &&
322  (tp[1] == '\\'))
323  tp += 2;
324  else
325  {
326  /*
327  * one backslash, not followed by another or ### valid octal
328  */
329  ereturn(escontext, (Datum) 0,
330  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
331  errmsg("invalid input syntax for type %s", "bytea")));
332  }
333  }
334 
335  bc += VARHDRSZ;
336 
337  result = (bytea *) palloc(bc);
338  SET_VARSIZE(result, bc);
339 
340  tp = inputText;
341  rp = VARDATA(result);
342  while (*tp != '\0')
343  {
344  if (tp[0] != '\\')
345  *rp++ = *tp++;
346  else if ((tp[0] == '\\') &&
347  (tp[1] >= '0' && tp[1] <= '3') &&
348  (tp[2] >= '0' && tp[2] <= '7') &&
349  (tp[3] >= '0' && tp[3] <= '7'))
350  {
351  bc = VAL(tp[1]);
352  bc <<= 3;
353  bc += VAL(tp[2]);
354  bc <<= 3;
355  *rp++ = bc + VAL(tp[3]);
356 
357  tp += 4;
358  }
359  else if ((tp[0] == '\\') &&
360  (tp[1] == '\\'))
361  {
362  *rp++ = '\\';
363  tp += 2;
364  }
365  else
366  {
367  /*
368  * We should never get here. The first pass should not allow it.
369  */
370  ereturn(escontext, (Datum) 0,
371  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
372  errmsg("invalid input syntax for type %s", "bytea")));
373  }
374  }
375 
376  PG_RETURN_BYTEA_P(result);
377 }
378 
379 /*
380  * byteaout - converts to printable representation of byte array
381  *
382  * In the traditional escaped format, non-printable characters are
383  * printed as '\nnn' (octal) and '\' as '\\'.
384  */
385 Datum
387 {
388  bytea *vlena = PG_GETARG_BYTEA_PP(0);
389  char *result;
390  char *rp;
391 
393  {
394  /* Print hex format */
395  rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
396  *rp++ = '\\';
397  *rp++ = 'x';
398  rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
399  }
400  else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
401  {
402  /* Print traditional escaped format */
403  char *vp;
404  uint64 len;
405  int i;
406 
407  len = 1; /* empty string has 1 char */
408  vp = VARDATA_ANY(vlena);
409  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
410  {
411  if (*vp == '\\')
412  len += 2;
413  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
414  len += 4;
415  else
416  len++;
417  }
418 
419  /*
420  * In principle len can't overflow uint32 if the input fit in 1GB, but
421  * for safety let's check rather than relying on palloc's internal
422  * check.
423  */
424  if (len > MaxAllocSize)
425  ereport(ERROR,
426  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
427  errmsg_internal("result of bytea output conversion is too large")));
428  rp = result = (char *) palloc(len);
429 
430  vp = VARDATA_ANY(vlena);
431  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
432  {
433  if (*vp == '\\')
434  {
435  *rp++ = '\\';
436  *rp++ = '\\';
437  }
438  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
439  {
440  int val; /* holds unprintable chars */
441 
442  val = *vp;
443  rp[0] = '\\';
444  rp[3] = DIG(val & 07);
445  val >>= 3;
446  rp[2] = DIG(val & 07);
447  val >>= 3;
448  rp[1] = DIG(val & 03);
449  rp += 4;
450  }
451  else
452  *rp++ = *vp;
453  }
454  }
455  else
456  {
457  elog(ERROR, "unrecognized bytea_output setting: %d",
458  bytea_output);
459  rp = result = NULL; /* keep compiler quiet */
460  }
461  *rp = '\0';
462  PG_RETURN_CSTRING(result);
463 }
464 
465 /*
466  * bytearecv - converts external binary format to bytea
467  */
468 Datum
470 {
472  bytea *result;
473  int nbytes;
474 
475  nbytes = buf->len - buf->cursor;
476  result = (bytea *) palloc(nbytes + VARHDRSZ);
477  SET_VARSIZE(result, nbytes + VARHDRSZ);
478  pq_copymsgbytes(buf, VARDATA(result), nbytes);
479  PG_RETURN_BYTEA_P(result);
480 }
481 
482 /*
483  * byteasend - converts bytea to binary format
484  *
485  * This is a special case: just copy the input...
486  */
487 Datum
489 {
490  bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
491 
492  PG_RETURN_BYTEA_P(vlena);
493 }
494 
495 Datum
497 {
499 
500  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
501 
502  /* Append the value unless null, preceding it with the delimiter. */
503  if (!PG_ARGISNULL(1))
504  {
506  bool isfirst = false;
507 
508  /*
509  * You might think we can just throw away the first delimiter, however
510  * we must keep it as we may be a parallel worker doing partial
511  * aggregation building a state to send to the main process. We need
512  * to keep the delimiter of every aggregation so that the combine
513  * function can properly join up the strings of two separately
514  * partially aggregated results. The first delimiter is only stripped
515  * off in the final function. To know how much to strip off the front
516  * of the string, we store the length of the first delimiter in the
517  * StringInfo's cursor field, which we don't otherwise need here.
518  */
519  if (state == NULL)
520  {
521  state = makeStringAggState(fcinfo);
522  isfirst = true;
523  }
524 
525  if (!PG_ARGISNULL(2))
526  {
527  bytea *delim = PG_GETARG_BYTEA_PP(2);
528 
530  VARSIZE_ANY_EXHDR(delim));
531  if (isfirst)
532  state->cursor = VARSIZE_ANY_EXHDR(delim);
533  }
534 
537  }
538 
539  /*
540  * The transition type for string_agg() is declared to be "internal",
541  * which is a pass-by-value type the same size as a pointer.
542  */
543  if (state)
545  PG_RETURN_NULL();
546 }
547 
548 Datum
550 {
552 
553  /* cannot be called directly because of internal-type argument */
554  Assert(AggCheckCallContext(fcinfo, NULL));
555 
556  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
557 
558  if (state != NULL)
559  {
560  /* As per comment in transfn, strip data before the cursor position */
561  bytea *result;
562  int strippedlen = state->len - state->cursor;
563 
564  result = (bytea *) palloc(strippedlen + VARHDRSZ);
565  SET_VARSIZE(result, strippedlen + VARHDRSZ);
566  memcpy(VARDATA(result), &state->data[state->cursor], strippedlen);
567  PG_RETURN_BYTEA_P(result);
568  }
569  else
570  PG_RETURN_NULL();
571 }
572 
573 /*
574  * textin - converts cstring to internal representation
575  */
576 Datum
578 {
579  char *inputText = PG_GETARG_CSTRING(0);
580 
581  PG_RETURN_TEXT_P(cstring_to_text(inputText));
582 }
583 
584 /*
585  * textout - converts internal representation to cstring
586  */
587 Datum
589 {
590  Datum txt = PG_GETARG_DATUM(0);
591 
593 }
594 
595 /*
596  * textrecv - converts external binary format to text
597  */
598 Datum
600 {
602  text *result;
603  char *str;
604  int nbytes;
605 
606  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
607 
608  result = cstring_to_text_with_len(str, nbytes);
609  pfree(str);
610  PG_RETURN_TEXT_P(result);
611 }
612 
613 /*
614  * textsend - converts text to binary format
615  */
616 Datum
618 {
619  text *t = PG_GETARG_TEXT_PP(0);
621 
625 }
626 
627 
628 /*
629  * unknownin - converts cstring to internal representation
630  */
631 Datum
633 {
634  char *str = PG_GETARG_CSTRING(0);
635 
636  /* representation is same as cstring */
638 }
639 
640 /*
641  * unknownout - converts internal representation to cstring
642  */
643 Datum
645 {
646  /* representation is same as cstring */
647  char *str = PG_GETARG_CSTRING(0);
648 
650 }
651 
652 /*
653  * unknownrecv - converts external binary format to unknown
654  */
655 Datum
657 {
659  char *str;
660  int nbytes;
661 
662  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
663  /* representation is same as cstring */
665 }
666 
667 /*
668  * unknownsend - converts unknown to binary format
669  */
670 Datum
672 {
673  /* representation is same as cstring */
674  char *str = PG_GETARG_CSTRING(0);
676 
678  pq_sendtext(&buf, str, strlen(str));
680 }
681 
682 
683 /* ========== PUBLIC ROUTINES ========== */
684 
685 /*
686  * textlen -
687  * returns the logical length of a text*
688  * (which is less than the VARSIZE of the text*)
689  */
690 Datum
692 {
694 
695  /* try to avoid decompressing argument */
697 }
698 
699 /*
700  * text_length -
701  * Does the real work for textlen()
702  *
703  * This is broken out so it can be called directly by other string processing
704  * functions. Note that the argument is passed as a Datum, to indicate that
705  * it may still be in compressed form. We can avoid decompressing it at all
706  * in some cases.
707  */
708 static int32
710 {
711  /* fastpath when max encoding length is one */
714  else
715  {
716  text *t = DatumGetTextPP(str);
717 
719  VARSIZE_ANY_EXHDR(t)));
720  }
721 }
722 
723 /*
724  * textoctetlen -
725  * returns the physical length of a text*
726  * (which is less than the VARSIZE of the text*)
727  */
728 Datum
730 {
732 
733  /* We need not detoast the input at all */
735 }
736 
737 /*
738  * textcat -
739  * takes two text* and returns a text* that is the concatenation of
740  * the two.
741  *
742  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
743  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
744  * Allocate space for output in all cases.
745  * XXX - thomas 1997-07-10
746  */
747 Datum
749 {
750  text *t1 = PG_GETARG_TEXT_PP(0);
751  text *t2 = PG_GETARG_TEXT_PP(1);
752 
754 }
755 
756 /*
757  * text_catenate
758  * Guts of textcat(), broken out so it can be used by other functions
759  *
760  * Arguments can be in short-header form, but not compressed or out-of-line
761  */
762 static text *
764 {
765  text *result;
766  int len1,
767  len2,
768  len;
769  char *ptr;
770 
771  len1 = VARSIZE_ANY_EXHDR(t1);
772  len2 = VARSIZE_ANY_EXHDR(t2);
773 
774  /* paranoia ... probably should throw error instead? */
775  if (len1 < 0)
776  len1 = 0;
777  if (len2 < 0)
778  len2 = 0;
779 
780  len = len1 + len2 + VARHDRSZ;
781  result = (text *) palloc(len);
782 
783  /* Set size of result string... */
784  SET_VARSIZE(result, len);
785 
786  /* Fill data field of result string... */
787  ptr = VARDATA(result);
788  if (len1 > 0)
789  memcpy(ptr, VARDATA_ANY(t1), len1);
790  if (len2 > 0)
791  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
792 
793  return result;
794 }
795 
796 /*
797  * charlen_to_bytelen()
798  * Compute the number of bytes occupied by n characters starting at *p
799  *
800  * It is caller's responsibility that there actually are n characters;
801  * the string need not be null-terminated.
802  */
803 static int
804 charlen_to_bytelen(const char *p, int n)
805 {
807  {
808  /* Optimization for single-byte encodings */
809  return n;
810  }
811  else
812  {
813  const char *s;
814 
815  for (s = p; n > 0; n--)
816  s += pg_mblen(s);
817 
818  return s - p;
819  }
820 }
821 
822 /*
823  * text_substr()
824  * Return a substring starting at the specified position.
825  * - thomas 1997-12-31
826  *
827  * Input:
828  * - string
829  * - starting position (is one-based)
830  * - string length
831  *
832  * If the starting position is zero or less, then return from the start of the string
833  * adjusting the length to be consistent with the "negative start" per SQL.
834  * If the length is less than zero, an error is raised.
835  *
836  * Added multibyte support.
837  * - Tatsuo Ishii 1998-4-21
838  * Changed behavior if starting position is less than one to conform to SQL behavior.
839  * Formerly returned the entire string; now returns a portion.
840  * - Thomas Lockhart 1998-12-10
841  * Now uses faster TOAST-slicing interface
842  * - John Gray 2002-02-22
843  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
844  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
845  * error; if E < 1, return '', not entire string). Fixed MB related bug when
846  * S > LC and < LC + 4 sometimes garbage characters are returned.
847  * - Joe Conway 2002-08-10
848  */
849 Datum
851 {
853  PG_GETARG_INT32(1),
854  PG_GETARG_INT32(2),
855  false));
856 }
857 
858 /*
859  * text_substr_no_len -
860  * Wrapper to avoid opr_sanity failure due to
861  * one function accepting a different number of args.
862  */
863 Datum
865 {
867  PG_GETARG_INT32(1),
868  -1, true));
869 }
870 
871 /*
872  * text_substring -
873  * Does the real work for text_substr() and text_substr_no_len()
874  *
875  * This is broken out so it can be called directly by other string processing
876  * functions. Note that the argument is passed as a Datum, to indicate that
877  * it may still be in compressed/toasted form. We can avoid detoasting all
878  * of it in some cases.
879  *
880  * The result is always a freshly palloc'd datum.
881  */
882 static text *
883 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
884 {
886  int32 S = start; /* start position */
887  int32 S1; /* adjusted start position */
888  int32 L1; /* adjusted substring length */
889  int32 E; /* end position */
890 
891  /*
892  * SQL99 says S can be zero or negative, but we still must fetch from the
893  * start of the string.
894  */
895  S1 = Max(S, 1);
896 
897  /* life is easy if the encoding max length is 1 */
898  if (eml == 1)
899  {
900  if (length_not_specified) /* special case - get length to end of
901  * string */
902  L1 = -1;
903  else if (length < 0)
904  {
905  /* SQL99 says to throw an error for E < S, i.e., negative length */
906  ereport(ERROR,
907  (errcode(ERRCODE_SUBSTRING_ERROR),
908  errmsg("negative substring length not allowed")));
909  L1 = -1; /* silence stupider compilers */
910  }
911  else if (pg_add_s32_overflow(S, length, &E))
912  {
913  /*
914  * L could be large enough for S + L to overflow, in which case
915  * the substring must run to end of string.
916  */
917  L1 = -1;
918  }
919  else
920  {
921  /*
922  * A zero or negative value for the end position can happen if the
923  * start was negative or one. SQL99 says to return a zero-length
924  * string.
925  */
926  if (E < 1)
927  return cstring_to_text("");
928 
929  L1 = E - S1;
930  }
931 
932  /*
933  * If the start position is past the end of the string, SQL99 says to
934  * return a zero-length string -- DatumGetTextPSlice() will do that
935  * for us. We need only convert S1 to zero-based starting position.
936  */
937  return DatumGetTextPSlice(str, S1 - 1, L1);
938  }
939  else if (eml > 1)
940  {
941  /*
942  * When encoding max length is > 1, we can't get LC without
943  * detoasting, so we'll grab a conservatively large slice now and go
944  * back later to do the right thing
945  */
946  int32 slice_start;
947  int32 slice_size;
948  int32 slice_strlen;
949  text *slice;
950  int32 E1;
951  int32 i;
952  char *p;
953  char *s;
954  text *ret;
955 
956  /*
957  * We need to start at position zero because there is no way to know
958  * in advance which byte offset corresponds to the supplied start
959  * position.
960  */
961  slice_start = 0;
962 
963  if (length_not_specified) /* special case - get length to end of
964  * string */
965  slice_size = L1 = -1;
966  else if (length < 0)
967  {
968  /* SQL99 says to throw an error for E < S, i.e., negative length */
969  ereport(ERROR,
970  (errcode(ERRCODE_SUBSTRING_ERROR),
971  errmsg("negative substring length not allowed")));
972  slice_size = L1 = -1; /* silence stupider compilers */
973  }
974  else if (pg_add_s32_overflow(S, length, &E))
975  {
976  /*
977  * L could be large enough for S + L to overflow, in which case
978  * the substring must run to end of string.
979  */
980  slice_size = L1 = -1;
981  }
982  else
983  {
984  /*
985  * A zero or negative value for the end position can happen if the
986  * start was negative or one. SQL99 says to return a zero-length
987  * string.
988  */
989  if (E < 1)
990  return cstring_to_text("");
991 
992  /*
993  * if E is past the end of the string, the tuple toaster will
994  * truncate the length for us
995  */
996  L1 = E - S1;
997 
998  /*
999  * Total slice size in bytes can't be any longer than the start
1000  * position plus substring length times the encoding max length.
1001  * If that overflows, we can just use -1.
1002  */
1003  if (pg_mul_s32_overflow(E, eml, &slice_size))
1004  slice_size = -1;
1005  }
1006 
1007  /*
1008  * If we're working with an untoasted source, no need to do an extra
1009  * copying step.
1010  */
1013  slice = DatumGetTextPSlice(str, slice_start, slice_size);
1014  else
1015  slice = (text *) DatumGetPointer(str);
1016 
1017  /* see if we got back an empty string */
1018  if (VARSIZE_ANY_EXHDR(slice) == 0)
1019  {
1020  if (slice != (text *) DatumGetPointer(str))
1021  pfree(slice);
1022  return cstring_to_text("");
1023  }
1024 
1025  /* Now we can get the actual length of the slice in MB characters */
1026  slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1027  VARSIZE_ANY_EXHDR(slice));
1028 
1029  /*
1030  * Check that the start position wasn't > slice_strlen. If so, SQL99
1031  * says to return a zero-length string.
1032  */
1033  if (S1 > slice_strlen)
1034  {
1035  if (slice != (text *) DatumGetPointer(str))
1036  pfree(slice);
1037  return cstring_to_text("");
1038  }
1039 
1040  /*
1041  * Adjust L1 and E1 now that we know the slice string length. Again
1042  * remember that S1 is one based, and slice_start is zero based.
1043  */
1044  if (L1 > -1)
1045  E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1046  else
1047  E1 = slice_start + 1 + slice_strlen;
1048 
1049  /*
1050  * Find the start position in the slice; remember S1 is not zero based
1051  */
1052  p = VARDATA_ANY(slice);
1053  for (i = 0; i < S1 - 1; i++)
1054  p += pg_mblen(p);
1055 
1056  /* hang onto a pointer to our start position */
1057  s = p;
1058 
1059  /*
1060  * Count the actual bytes used by the substring of the requested
1061  * length.
1062  */
1063  for (i = S1; i < E1; i++)
1064  p += pg_mblen(p);
1065 
1066  ret = (text *) palloc(VARHDRSZ + (p - s));
1067  SET_VARSIZE(ret, VARHDRSZ + (p - s));
1068  memcpy(VARDATA(ret), s, (p - s));
1069 
1070  if (slice != (text *) DatumGetPointer(str))
1071  pfree(slice);
1072 
1073  return ret;
1074  }
1075  else
1076  elog(ERROR, "invalid backend encoding: encoding max length < 1");
1077 
1078  /* not reached: suppress compiler warning */
1079  return NULL;
1080 }
1081 
1082 /*
1083  * textoverlay
1084  * Replace specified substring of first string with second
1085  *
1086  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1087  * This code is a direct implementation of what the standard says.
1088  */
1089 Datum
1091 {
1092  text *t1 = PG_GETARG_TEXT_PP(0);
1093  text *t2 = PG_GETARG_TEXT_PP(1);
1094  int sp = PG_GETARG_INT32(2); /* substring start position */
1095  int sl = PG_GETARG_INT32(3); /* substring length */
1096 
1097  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1098 }
1099 
1100 Datum
1102 {
1103  text *t1 = PG_GETARG_TEXT_PP(0);
1104  text *t2 = PG_GETARG_TEXT_PP(1);
1105  int sp = PG_GETARG_INT32(2); /* substring start position */
1106  int sl;
1107 
1108  sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1109  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1110 }
1111 
1112 static text *
1113 text_overlay(text *t1, text *t2, int sp, int sl)
1114 {
1115  text *result;
1116  text *s1;
1117  text *s2;
1118  int sp_pl_sl;
1119 
1120  /*
1121  * Check for possible integer-overflow cases. For negative sp, throw a
1122  * "substring length" error because that's what should be expected
1123  * according to the spec's definition of OVERLAY().
1124  */
1125  if (sp <= 0)
1126  ereport(ERROR,
1127  (errcode(ERRCODE_SUBSTRING_ERROR),
1128  errmsg("negative substring length not allowed")));
1129  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1130  ereport(ERROR,
1131  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1132  errmsg("integer out of range")));
1133 
1134  s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1135  s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1136  result = text_catenate(s1, t2);
1137  result = text_catenate(result, s2);
1138 
1139  return result;
1140 }
1141 
1142 /*
1143  * textpos -
1144  * Return the position of the specified substring.
1145  * Implements the SQL POSITION() function.
1146  * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1147  * - thomas 1997-07-27
1148  */
1149 Datum
1151 {
1152  text *str = PG_GETARG_TEXT_PP(0);
1153  text *search_str = PG_GETARG_TEXT_PP(1);
1154 
1156 }
1157 
1158 /*
1159  * text_position -
1160  * Does the real work for textpos()
1161  *
1162  * Inputs:
1163  * t1 - string to be searched
1164  * t2 - pattern to match within t1
1165  * Result:
1166  * Character index of the first matched char, starting from 1,
1167  * or 0 if no match.
1168  *
1169  * This is broken out so it can be called directly by other string processing
1170  * functions.
1171  */
1172 static int
1174 {
1176  int result;
1177 
1178  /* Empty needle always matches at position 1 */
1179  if (VARSIZE_ANY_EXHDR(t2) < 1)
1180  return 1;
1181 
1182  /* Otherwise, can't match if haystack is shorter than needle */
1183  if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1184  return 0;
1185 
1186  text_position_setup(t1, t2, collid, &state);
1187  if (!text_position_next(&state))
1188  result = 0;
1189  else
1192  return result;
1193 }
1194 
1195 
1196 /*
1197  * text_position_setup, text_position_next, text_position_cleanup -
1198  * Component steps of text_position()
1199  *
1200  * These are broken out so that a string can be efficiently searched for
1201  * multiple occurrences of the same pattern. text_position_next may be
1202  * called multiple times, and it advances to the next match on each call.
1203  * text_position_get_match_ptr() and text_position_get_match_pos() return
1204  * a pointer or 1-based character position of the last match, respectively.
1205  *
1206  * The "state" variable is normally just a local variable in the caller.
1207  *
1208  * NOTE: text_position_next skips over the matched portion. For example,
1209  * searching for "xx" in "xxx" returns only one match, not two.
1210  */
1211 
1212 static void
1214 {
1215  int len1 = VARSIZE_ANY_EXHDR(t1);
1216  int len2 = VARSIZE_ANY_EXHDR(t2);
1217  pg_locale_t mylocale = 0;
1218 
1220 
1221  if (!lc_collate_is_c(collid))
1222  mylocale = pg_newlocale_from_collation(collid);
1223 
1224  if (!pg_locale_deterministic(mylocale))
1225  ereport(ERROR,
1226  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1227  errmsg("nondeterministic collations are not supported for substring searches")));
1228 
1229  Assert(len1 > 0);
1230  Assert(len2 > 0);
1231 
1232  /*
1233  * Even with a multi-byte encoding, we perform the search using the raw
1234  * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1235  * because in UTF-8 the byte sequence of one character cannot contain
1236  * another character. For other multi-byte encodings, we do the search
1237  * initially as a simple byte search, ignoring multibyte issues, but
1238  * verify afterwards that the match we found is at a character boundary,
1239  * and continue the search if it was a false match.
1240  */
1242  state->is_multibyte_char_in_char = false;
1243  else if (GetDatabaseEncoding() == PG_UTF8)
1244  state->is_multibyte_char_in_char = false;
1245  else
1246  state->is_multibyte_char_in_char = true;
1247 
1248  state->str1 = VARDATA_ANY(t1);
1249  state->str2 = VARDATA_ANY(t2);
1250  state->len1 = len1;
1251  state->len2 = len2;
1252  state->last_match = NULL;
1253  state->refpoint = state->str1;
1254  state->refpos = 0;
1255 
1256  /*
1257  * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1258  * notes we use the terminology that the "haystack" is the string to be
1259  * searched (t1) and the "needle" is the pattern being sought (t2).
1260  *
1261  * If the needle is empty or bigger than the haystack then there is no
1262  * point in wasting cycles initializing the table. We also choose not to
1263  * use B-M-H for needles of length 1, since the skip table can't possibly
1264  * save anything in that case.
1265  */
1266  if (len1 >= len2 && len2 > 1)
1267  {
1268  int searchlength = len1 - len2;
1269  int skiptablemask;
1270  int last;
1271  int i;
1272  const char *str2 = state->str2;
1273 
1274  /*
1275  * First we must determine how much of the skip table to use. The
1276  * declaration of TextPositionState allows up to 256 elements, but for
1277  * short search problems we don't really want to have to initialize so
1278  * many elements --- it would take too long in comparison to the
1279  * actual search time. So we choose a useful skip table size based on
1280  * the haystack length minus the needle length. The closer the needle
1281  * length is to the haystack length the less useful skipping becomes.
1282  *
1283  * Note: since we use bit-masking to select table elements, the skip
1284  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1285  */
1286  if (searchlength < 16)
1287  skiptablemask = 3;
1288  else if (searchlength < 64)
1289  skiptablemask = 7;
1290  else if (searchlength < 128)
1291  skiptablemask = 15;
1292  else if (searchlength < 512)
1293  skiptablemask = 31;
1294  else if (searchlength < 2048)
1295  skiptablemask = 63;
1296  else if (searchlength < 4096)
1297  skiptablemask = 127;
1298  else
1299  skiptablemask = 255;
1300  state->skiptablemask = skiptablemask;
1301 
1302  /*
1303  * Initialize the skip table. We set all elements to the needle
1304  * length, since this is the correct skip distance for any character
1305  * not found in the needle.
1306  */
1307  for (i = 0; i <= skiptablemask; i++)
1308  state->skiptable[i] = len2;
1309 
1310  /*
1311  * Now examine the needle. For each character except the last one,
1312  * set the corresponding table element to the appropriate skip
1313  * distance. Note that when two characters share the same skip table
1314  * entry, the one later in the needle must determine the skip
1315  * distance.
1316  */
1317  last = len2 - 1;
1318 
1319  for (i = 0; i < last; i++)
1320  state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1321  }
1322 }
1323 
1324 /*
1325  * Advance to the next match, starting from the end of the previous match
1326  * (or the beginning of the string, on first call). Returns true if a match
1327  * is found.
1328  *
1329  * Note that this refuses to match an empty-string needle. Most callers
1330  * will have handled that case specially and we'll never see it here.
1331  */
1332 static bool
1334 {
1335  int needle_len = state->len2;
1336  char *start_ptr;
1337  char *matchptr;
1338 
1339  if (needle_len <= 0)
1340  return false; /* result for empty pattern */
1341 
1342  /* Start from the point right after the previous match. */
1343  if (state->last_match)
1344  start_ptr = state->last_match + needle_len;
1345  else
1346  start_ptr = state->str1;
1347 
1348 retry:
1349  matchptr = text_position_next_internal(start_ptr, state);
1350 
1351  if (!matchptr)
1352  return false;
1353 
1354  /*
1355  * Found a match for the byte sequence. If this is a multibyte encoding,
1356  * where one character's byte sequence can appear inside a longer
1357  * multi-byte character, we need to verify that the match was at a
1358  * character boundary, not in the middle of a multi-byte character.
1359  */
1360  if (state->is_multibyte_char_in_char)
1361  {
1362  /* Walk one character at a time, until we reach the match. */
1363 
1364  /* the search should never move backwards. */
1365  Assert(state->refpoint <= matchptr);
1366 
1367  while (state->refpoint < matchptr)
1368  {
1369  /* step to next character. */
1370  state->refpoint += pg_mblen(state->refpoint);
1371  state->refpos++;
1372 
1373  /*
1374  * If we stepped over the match's start position, then it was a
1375  * false positive, where the byte sequence appeared in the middle
1376  * of a multi-byte character. Skip it, and continue the search at
1377  * the next character boundary.
1378  */
1379  if (state->refpoint > matchptr)
1380  {
1381  start_ptr = state->refpoint;
1382  goto retry;
1383  }
1384  }
1385  }
1386 
1387  state->last_match = matchptr;
1388  return true;
1389 }
1390 
1391 /*
1392  * Subroutine of text_position_next(). This searches for the raw byte
1393  * sequence, ignoring any multi-byte encoding issues. Returns the first
1394  * match starting at 'start_ptr', or NULL if no match is found.
1395  */
1396 static char *
1398 {
1399  int haystack_len = state->len1;
1400  int needle_len = state->len2;
1401  int skiptablemask = state->skiptablemask;
1402  const char *haystack = state->str1;
1403  const char *needle = state->str2;
1404  const char *haystack_end = &haystack[haystack_len];
1405  const char *hptr;
1406 
1407  Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1408 
1409  if (needle_len == 1)
1410  {
1411  /* No point in using B-M-H for a one-character needle */
1412  char nchar = *needle;
1413 
1414  hptr = start_ptr;
1415  while (hptr < haystack_end)
1416  {
1417  if (*hptr == nchar)
1418  return (char *) hptr;
1419  hptr++;
1420  }
1421  }
1422  else
1423  {
1424  const char *needle_last = &needle[needle_len - 1];
1425 
1426  /* Start at startpos plus the length of the needle */
1427  hptr = start_ptr + needle_len - 1;
1428  while (hptr < haystack_end)
1429  {
1430  /* Match the needle scanning *backward* */
1431  const char *nptr;
1432  const char *p;
1433 
1434  nptr = needle_last;
1435  p = hptr;
1436  while (*nptr == *p)
1437  {
1438  /* Matched it all? If so, return 1-based position */
1439  if (nptr == needle)
1440  return (char *) p;
1441  nptr--, p--;
1442  }
1443 
1444  /*
1445  * No match, so use the haystack char at hptr to decide how far to
1446  * advance. If the needle had any occurrence of that character
1447  * (or more precisely, one sharing the same skiptable entry)
1448  * before its last character, then we advance far enough to align
1449  * the last such needle character with that haystack position.
1450  * Otherwise we can advance by the whole needle length.
1451  */
1452  hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1453  }
1454  }
1455 
1456  return 0; /* not found */
1457 }
1458 
1459 /*
1460  * Return a pointer to the current match.
1461  *
1462  * The returned pointer points into the original haystack string.
1463  */
1464 static char *
1466 {
1467  return state->last_match;
1468 }
1469 
1470 /*
1471  * Return the offset of the current match.
1472  *
1473  * The offset is in characters, 1-based.
1474  */
1475 static int
1477 {
1478  /* Convert the byte position to char position. */
1479  state->refpos += pg_mbstrlen_with_len(state->refpoint,
1480  state->last_match - state->refpoint);
1481  state->refpoint = state->last_match;
1482  return state->refpos + 1;
1483 }
1484 
1485 /*
1486  * Reset search state to the initial state installed by text_position_setup.
1487  *
1488  * The next call to text_position_next will search from the beginning
1489  * of the string.
1490  */
1491 static void
1493 {
1494  state->last_match = NULL;
1495  state->refpoint = state->str1;
1496  state->refpos = 0;
1497 }
1498 
1499 static void
1501 {
1502  /* no cleanup needed */
1503 }
1504 
1505 
1506 static void
1508 {
1509  if (!OidIsValid(collid))
1510  {
1511  /*
1512  * This typically means that the parser could not resolve a conflict
1513  * of implicit collations, so report it that way.
1514  */
1515  ereport(ERROR,
1516  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1517  errmsg("could not determine which collation to use for string comparison"),
1518  errhint("Use the COLLATE clause to set the collation explicitly.")));
1519  }
1520 }
1521 
1522 /* varstr_cmp()
1523  * Comparison function for text strings with given lengths.
1524  * Includes locale support, but must copy strings to temporary memory
1525  * to allow null-termination for inputs to strcoll().
1526  * Returns an integer less than, equal to, or greater than zero, indicating
1527  * whether arg1 is less than, equal to, or greater than arg2.
1528  *
1529  * Note: many functions that depend on this are marked leakproof; therefore,
1530  * avoid reporting the actual contents of the input when throwing errors.
1531  * All errors herein should be things that can't happen except on corrupt
1532  * data, anyway; otherwise we will have trouble with indexing strings that
1533  * would cause them.
1534  */
1535 int
1536 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1537 {
1538  int result;
1539 
1541 
1542  /*
1543  * Unfortunately, there is no strncoll(), so in the non-C locale case we
1544  * have to do some memory copying. This turns out to be significantly
1545  * slower, so we optimize the case where LC_COLLATE is C. We also try to
1546  * optimize relatively-short strings by avoiding palloc/pfree overhead.
1547  */
1548  if (lc_collate_is_c(collid))
1549  {
1550  result = memcmp(arg1, arg2, Min(len1, len2));
1551  if ((result == 0) && (len1 != len2))
1552  result = (len1 < len2) ? -1 : 1;
1553  }
1554  else
1555  {
1556  pg_locale_t mylocale;
1557 
1558  mylocale = pg_newlocale_from_collation(collid);
1559 
1560  /*
1561  * memcmp() can't tell us which of two unequal strings sorts first,
1562  * but it's a cheap way to tell if they're equal. Testing shows that
1563  * memcmp() followed by strcoll() is only trivially slower than
1564  * strcoll() by itself, so we don't lose much if this doesn't work out
1565  * very often, and if it does - for example, because there are many
1566  * equal strings in the input - then we win big by avoiding expensive
1567  * collation-aware comparisons.
1568  */
1569  if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1570  return 0;
1571 
1572  result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
1573 
1574  /* Break tie if necessary. */
1575  if (result == 0 && pg_locale_deterministic(mylocale))
1576  {
1577  result = memcmp(arg1, arg2, Min(len1, len2));
1578  if ((result == 0) && (len1 != len2))
1579  result = (len1 < len2) ? -1 : 1;
1580  }
1581  }
1582 
1583  return result;
1584 }
1585 
1586 /* text_cmp()
1587  * Internal comparison function for text strings.
1588  * Returns -1, 0 or 1
1589  */
1590 static int
1591 text_cmp(text *arg1, text *arg2, Oid collid)
1592 {
1593  char *a1p,
1594  *a2p;
1595  int len1,
1596  len2;
1597 
1598  a1p = VARDATA_ANY(arg1);
1599  a2p = VARDATA_ANY(arg2);
1600 
1601  len1 = VARSIZE_ANY_EXHDR(arg1);
1602  len2 = VARSIZE_ANY_EXHDR(arg2);
1603 
1604  return varstr_cmp(a1p, len1, a2p, len2, collid);
1605 }
1606 
1607 /*
1608  * Comparison functions for text strings.
1609  *
1610  * Note: btree indexes need these routines not to leak memory; therefore,
1611  * be careful to free working copies of toasted datums. Most places don't
1612  * need to be so careful.
1613  */
1614 
1615 Datum
1617 {
1619  bool locale_is_c = false;
1620  pg_locale_t mylocale = 0;
1621  bool result;
1622 
1624 
1625  if (lc_collate_is_c(collid))
1626  locale_is_c = true;
1627  else
1628  mylocale = pg_newlocale_from_collation(collid);
1629 
1630  if (locale_is_c || pg_locale_deterministic(mylocale))
1631  {
1632  Datum arg1 = PG_GETARG_DATUM(0);
1633  Datum arg2 = PG_GETARG_DATUM(1);
1634  Size len1,
1635  len2;
1636 
1637  /*
1638  * Since we only care about equality or not-equality, we can avoid all
1639  * the expense of strcoll() here, and just do bitwise comparison. In
1640  * fact, we don't even have to do a bitwise comparison if we can show
1641  * the lengths of the strings are unequal; which might save us from
1642  * having to detoast one or both values.
1643  */
1644  len1 = toast_raw_datum_size(arg1);
1645  len2 = toast_raw_datum_size(arg2);
1646  if (len1 != len2)
1647  result = false;
1648  else
1649  {
1650  text *targ1 = DatumGetTextPP(arg1);
1651  text *targ2 = DatumGetTextPP(arg2);
1652 
1653  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1654  len1 - VARHDRSZ) == 0);
1655 
1656  PG_FREE_IF_COPY(targ1, 0);
1657  PG_FREE_IF_COPY(targ2, 1);
1658  }
1659  }
1660  else
1661  {
1662  text *arg1 = PG_GETARG_TEXT_PP(0);
1663  text *arg2 = PG_GETARG_TEXT_PP(1);
1664 
1665  result = (text_cmp(arg1, arg2, collid) == 0);
1666 
1667  PG_FREE_IF_COPY(arg1, 0);
1668  PG_FREE_IF_COPY(arg2, 1);
1669  }
1670 
1671  PG_RETURN_BOOL(result);
1672 }
1673 
1674 Datum
1676 {
1678  bool locale_is_c = false;
1679  pg_locale_t mylocale = 0;
1680  bool result;
1681 
1683 
1684  if (lc_collate_is_c(collid))
1685  locale_is_c = true;
1686  else
1687  mylocale = pg_newlocale_from_collation(collid);
1688 
1689  if (locale_is_c || pg_locale_deterministic(mylocale))
1690  {
1691  Datum arg1 = PG_GETARG_DATUM(0);
1692  Datum arg2 = PG_GETARG_DATUM(1);
1693  Size len1,
1694  len2;
1695 
1696  /* See comment in texteq() */
1697  len1 = toast_raw_datum_size(arg1);
1698  len2 = toast_raw_datum_size(arg2);
1699  if (len1 != len2)
1700  result = true;
1701  else
1702  {
1703  text *targ1 = DatumGetTextPP(arg1);
1704  text *targ2 = DatumGetTextPP(arg2);
1705 
1706  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1707  len1 - VARHDRSZ) != 0);
1708 
1709  PG_FREE_IF_COPY(targ1, 0);
1710  PG_FREE_IF_COPY(targ2, 1);
1711  }
1712  }
1713  else
1714  {
1715  text *arg1 = PG_GETARG_TEXT_PP(0);
1716  text *arg2 = PG_GETARG_TEXT_PP(1);
1717 
1718  result = (text_cmp(arg1, arg2, collid) != 0);
1719 
1720  PG_FREE_IF_COPY(arg1, 0);
1721  PG_FREE_IF_COPY(arg2, 1);
1722  }
1723 
1724  PG_RETURN_BOOL(result);
1725 }
1726 
1727 Datum
1729 {
1730  text *arg1 = PG_GETARG_TEXT_PP(0);
1731  text *arg2 = PG_GETARG_TEXT_PP(1);
1732  bool result;
1733 
1734  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1735 
1736  PG_FREE_IF_COPY(arg1, 0);
1737  PG_FREE_IF_COPY(arg2, 1);
1738 
1739  PG_RETURN_BOOL(result);
1740 }
1741 
1742 Datum
1744 {
1745  text *arg1 = PG_GETARG_TEXT_PP(0);
1746  text *arg2 = PG_GETARG_TEXT_PP(1);
1747  bool result;
1748 
1749  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1750 
1751  PG_FREE_IF_COPY(arg1, 0);
1752  PG_FREE_IF_COPY(arg2, 1);
1753 
1754  PG_RETURN_BOOL(result);
1755 }
1756 
1757 Datum
1759 {
1760  text *arg1 = PG_GETARG_TEXT_PP(0);
1761  text *arg2 = PG_GETARG_TEXT_PP(1);
1762  bool result;
1763 
1764  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1765 
1766  PG_FREE_IF_COPY(arg1, 0);
1767  PG_FREE_IF_COPY(arg2, 1);
1768 
1769  PG_RETURN_BOOL(result);
1770 }
1771 
1772 Datum
1774 {
1775  text *arg1 = PG_GETARG_TEXT_PP(0);
1776  text *arg2 = PG_GETARG_TEXT_PP(1);
1777  bool result;
1778 
1779  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1780 
1781  PG_FREE_IF_COPY(arg1, 0);
1782  PG_FREE_IF_COPY(arg2, 1);
1783 
1784  PG_RETURN_BOOL(result);
1785 }
1786 
1787 Datum
1789 {
1790  Datum arg1 = PG_GETARG_DATUM(0);
1791  Datum arg2 = PG_GETARG_DATUM(1);
1793  pg_locale_t mylocale = 0;
1794  bool result;
1795  Size len1,
1796  len2;
1797 
1799 
1800  if (!lc_collate_is_c(collid))
1801  mylocale = pg_newlocale_from_collation(collid);
1802 
1803  if (!pg_locale_deterministic(mylocale))
1804  ereport(ERROR,
1805  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1806  errmsg("nondeterministic collations are not supported for substring searches")));
1807 
1808  len1 = toast_raw_datum_size(arg1);
1809  len2 = toast_raw_datum_size(arg2);
1810  if (len2 > len1)
1811  result = false;
1812  else
1813  {
1814  text *targ1 = text_substring(arg1, 1, len2, false);
1815  text *targ2 = DatumGetTextPP(arg2);
1816 
1817  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1818  VARSIZE_ANY_EXHDR(targ2)) == 0);
1819 
1820  PG_FREE_IF_COPY(targ1, 0);
1821  PG_FREE_IF_COPY(targ2, 1);
1822  }
1823 
1824  PG_RETURN_BOOL(result);
1825 }
1826 
1827 Datum
1829 {
1830  text *arg1 = PG_GETARG_TEXT_PP(0);
1831  text *arg2 = PG_GETARG_TEXT_PP(1);
1832  int32 result;
1833 
1834  result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1835 
1836  PG_FREE_IF_COPY(arg1, 0);
1837  PG_FREE_IF_COPY(arg2, 1);
1838 
1839  PG_RETURN_INT32(result);
1840 }
1841 
1842 Datum
1844 {
1846  Oid collid = ssup->ssup_collation;
1847  MemoryContext oldcontext;
1848 
1849  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1850 
1851  /* Use generic string SortSupport */
1852  varstr_sortsupport(ssup, TEXTOID, collid);
1853 
1854  MemoryContextSwitchTo(oldcontext);
1855 
1856  PG_RETURN_VOID();
1857 }
1858 
1859 /*
1860  * Generic sortsupport interface for character type's operator classes.
1861  * Includes locale support, and support for BpChar semantics (i.e. removing
1862  * trailing spaces before comparison).
1863  *
1864  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1865  * same representation. Callers that always use the C collation (e.g.
1866  * non-collatable type callers like bytea) may have NUL bytes in their strings;
1867  * this will not work with any other collation, though.
1868  */
1869 void
1871 {
1872  bool abbreviate = ssup->abbreviate;
1873  bool collate_c = false;
1874  VarStringSortSupport *sss;
1875  pg_locale_t locale = 0;
1876 
1878 
1879  /*
1880  * If possible, set ssup->comparator to a function which can be used to
1881  * directly compare two datums. If we can do this, we'll avoid the
1882  * overhead of a trip through the fmgr layer for every comparison, which
1883  * can be substantial.
1884  *
1885  * Most typically, we'll set the comparator to varlenafastcmp_locale,
1886  * which uses strcoll() to perform comparisons. We use that for the
1887  * BpChar case too, but type NAME uses namefastcmp_locale. However, if
1888  * LC_COLLATE = C, we can make things quite a bit faster with
1889  * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
1890  * memcmp() rather than strcoll().
1891  */
1892  if (lc_collate_is_c(collid))
1893  {
1894  if (typid == BPCHAROID)
1895  ssup->comparator = bpcharfastcmp_c;
1896  else if (typid == NAMEOID)
1897  {
1898  ssup->comparator = namefastcmp_c;
1899  /* Not supporting abbreviation with type NAME, for now */
1900  abbreviate = false;
1901  }
1902  else
1903  ssup->comparator = varstrfastcmp_c;
1904 
1905  collate_c = true;
1906  }
1907  else
1908  {
1909  /*
1910  * We need a collation-sensitive comparison. To make things faster,
1911  * we'll figure out the collation based on the locale id and cache the
1912  * result.
1913  */
1915 
1916  /*
1917  * We use varlenafastcmp_locale except for type NAME.
1918  */
1919  if (typid == NAMEOID)
1920  {
1922  /* Not supporting abbreviation with type NAME, for now */
1923  abbreviate = false;
1924  }
1925  else
1927  }
1928 
1929  /*
1930  * Unfortunately, it seems that abbreviation for non-C collations is
1931  * broken on many common platforms; see pg_strxfrm_enabled().
1932  *
1933  * Even apart from the risk of broken locales, it's possible that there
1934  * are platforms where the use of abbreviated keys should be disabled at
1935  * compile time. Having only 4 byte datums could make worst-case
1936  * performance drastically more likely, for example. Moreover, macOS's
1937  * strxfrm() implementation is known to not effectively concentrate a
1938  * significant amount of entropy from the original string in earlier
1939  * transformed blobs. It's possible that other supported platforms are
1940  * similarly encumbered. So, if we ever get past disabling this
1941  * categorically, we may still want or need to disable it for particular
1942  * platforms.
1943  */
1944  if (!collate_c && !pg_strxfrm_enabled(locale))
1945  abbreviate = false;
1946 
1947  /*
1948  * If we're using abbreviated keys, or if we're using a locale-aware
1949  * comparison, we need to initialize a VarStringSortSupport object. Both
1950  * cases will make use of the temporary buffers we initialize here for
1951  * scratch space (and to detect requirement for BpChar semantics from
1952  * caller), and the abbreviation case requires additional state.
1953  */
1954  if (abbreviate || !collate_c)
1955  {
1956  sss = palloc(sizeof(VarStringSortSupport));
1957  sss->buf1 = palloc(TEXTBUFLEN);
1958  sss->buflen1 = TEXTBUFLEN;
1959  sss->buf2 = palloc(TEXTBUFLEN);
1960  sss->buflen2 = TEXTBUFLEN;
1961  /* Start with invalid values */
1962  sss->last_len1 = -1;
1963  sss->last_len2 = -1;
1964  /* Initialize */
1965  sss->last_returned = 0;
1966  sss->locale = locale;
1967 
1968  /*
1969  * To avoid somehow confusing a strxfrm() blob and an original string,
1970  * constantly keep track of the variety of data that buf1 and buf2
1971  * currently contain.
1972  *
1973  * Comparisons may be interleaved with conversion calls. Frequently,
1974  * conversions and comparisons are batched into two distinct phases,
1975  * but the correctness of caching cannot hinge upon this. For
1976  * comparison caching, buffer state is only trusted if cache_blob is
1977  * found set to false, whereas strxfrm() caching only trusts the state
1978  * when cache_blob is found set to true.
1979  *
1980  * Arbitrarily initialize cache_blob to true.
1981  */
1982  sss->cache_blob = true;
1983  sss->collate_c = collate_c;
1984  sss->typid = typid;
1985  ssup->ssup_extra = sss;
1986 
1987  /*
1988  * If possible, plan to use the abbreviated keys optimization. The
1989  * core code may switch back to authoritative comparator should
1990  * abbreviation be aborted.
1991  */
1992  if (abbreviate)
1993  {
1994  sss->prop_card = 0.20;
1995  initHyperLogLog(&sss->abbr_card, 10);
1996  initHyperLogLog(&sss->full_card, 10);
1997  ssup->abbrev_full_comparator = ssup->comparator;
2001  }
2002  }
2003 }
2004 
2005 /*
2006  * sortsupport comparison func (for C locale case)
2007  */
2008 static int
2010 {
2011  VarString *arg1 = DatumGetVarStringPP(x);
2012  VarString *arg2 = DatumGetVarStringPP(y);
2013  char *a1p,
2014  *a2p;
2015  int len1,
2016  len2,
2017  result;
2018 
2019  a1p = VARDATA_ANY(arg1);
2020  a2p = VARDATA_ANY(arg2);
2021 
2022  len1 = VARSIZE_ANY_EXHDR(arg1);
2023  len2 = VARSIZE_ANY_EXHDR(arg2);
2024 
2025  result = memcmp(a1p, a2p, Min(len1, len2));
2026  if ((result == 0) && (len1 != len2))
2027  result = (len1 < len2) ? -1 : 1;
2028 
2029  /* We can't afford to leak memory here. */
2030  if (PointerGetDatum(arg1) != x)
2031  pfree(arg1);
2032  if (PointerGetDatum(arg2) != y)
2033  pfree(arg2);
2034 
2035  return result;
2036 }
2037 
2038 /*
2039  * sortsupport comparison func (for BpChar C locale case)
2040  *
2041  * BpChar outsources its sortsupport to this module. Specialization for the
2042  * varstr_sortsupport BpChar case, modeled on
2043  * internal_bpchar_pattern_compare().
2044  */
2045 static int
2047 {
2048  BpChar *arg1 = DatumGetBpCharPP(x);
2049  BpChar *arg2 = DatumGetBpCharPP(y);
2050  char *a1p,
2051  *a2p;
2052  int len1,
2053  len2,
2054  result;
2055 
2056  a1p = VARDATA_ANY(arg1);
2057  a2p = VARDATA_ANY(arg2);
2058 
2059  len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2060  len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2061 
2062  result = memcmp(a1p, a2p, Min(len1, len2));
2063  if ((result == 0) && (len1 != len2))
2064  result = (len1 < len2) ? -1 : 1;
2065 
2066  /* We can't afford to leak memory here. */
2067  if (PointerGetDatum(arg1) != x)
2068  pfree(arg1);
2069  if (PointerGetDatum(arg2) != y)
2070  pfree(arg2);
2071 
2072  return result;
2073 }
2074 
2075 /*
2076  * sortsupport comparison func (for NAME C locale case)
2077  */
2078 static int
2080 {
2081  Name arg1 = DatumGetName(x);
2082  Name arg2 = DatumGetName(y);
2083 
2084  return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2085 }
2086 
2087 /*
2088  * sortsupport comparison func (for locale case with all varlena types)
2089  */
2090 static int
2092 {
2093  VarString *arg1 = DatumGetVarStringPP(x);
2094  VarString *arg2 = DatumGetVarStringPP(y);
2095  char *a1p,
2096  *a2p;
2097  int len1,
2098  len2,
2099  result;
2100 
2101  a1p = VARDATA_ANY(arg1);
2102  a2p = VARDATA_ANY(arg2);
2103 
2104  len1 = VARSIZE_ANY_EXHDR(arg1);
2105  len2 = VARSIZE_ANY_EXHDR(arg2);
2106 
2107  result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2108 
2109  /* We can't afford to leak memory here. */
2110  if (PointerGetDatum(arg1) != x)
2111  pfree(arg1);
2112  if (PointerGetDatum(arg2) != y)
2113  pfree(arg2);
2114 
2115  return result;
2116 }
2117 
2118 /*
2119  * sortsupport comparison func (for locale case with NAME type)
2120  */
2121 static int
2123 {
2124  Name arg1 = DatumGetName(x);
2125  Name arg2 = DatumGetName(y);
2126 
2127  return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2128  NameStr(*arg2), strlen(NameStr(*arg2)),
2129  ssup);
2130 }
2131 
2132 /*
2133  * sortsupport comparison func for locale cases
2134  */
2135 static int
2136 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2137 {
2139  int result;
2140  bool arg1_match;
2141 
2142  /* Fast pre-check for equality, as discussed in varstr_cmp() */
2143  if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2144  {
2145  /*
2146  * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2147  * last_len2. Existing contents of buffers might still be used by
2148  * next call.
2149  *
2150  * It's fine to allow the comparison of BpChar padding bytes here,
2151  * even though that implies that the memcmp() will usually be
2152  * performed for BpChar callers (though multibyte characters could
2153  * still prevent that from occurring). The memcmp() is still very
2154  * cheap, and BpChar's funny semantics have us remove trailing spaces
2155  * (not limited to padding), so we need make no distinction between
2156  * padding space characters and "real" space characters.
2157  */
2158  return 0;
2159  }
2160 
2161  if (sss->typid == BPCHAROID)
2162  {
2163  /* Get true number of bytes, ignoring trailing spaces */
2164  len1 = bpchartruelen(a1p, len1);
2165  len2 = bpchartruelen(a2p, len2);
2166  }
2167 
2168  if (len1 >= sss->buflen1)
2169  {
2170  sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2171  sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2172  }
2173  if (len2 >= sss->buflen2)
2174  {
2175  sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2176  sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2177  }
2178 
2179  /*
2180  * We're likely to be asked to compare the same strings repeatedly, and
2181  * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2182  * comparisons, even though in general there is no reason to think that
2183  * that will work out (every string datum may be unique). Caching does
2184  * not slow things down measurably when it doesn't work out, and can speed
2185  * things up by rather a lot when it does. In part, this is because the
2186  * memcmp() compares data from cachelines that are needed in L1 cache even
2187  * when the last comparison's result cannot be reused.
2188  */
2189  arg1_match = true;
2190  if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2191  {
2192  arg1_match = false;
2193  memcpy(sss->buf1, a1p, len1);
2194  sss->buf1[len1] = '\0';
2195  sss->last_len1 = len1;
2196  }
2197 
2198  /*
2199  * If we're comparing the same two strings as last time, we can return the
2200  * same answer without calling strcoll() again. This is more likely than
2201  * it seems (at least with moderate to low cardinality sets), because
2202  * quicksort compares the same pivot against many values.
2203  */
2204  if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2205  {
2206  memcpy(sss->buf2, a2p, len2);
2207  sss->buf2[len2] = '\0';
2208  sss->last_len2 = len2;
2209  }
2210  else if (arg1_match && !sss->cache_blob)
2211  {
2212  /* Use result cached following last actual strcoll() call */
2213  return sss->last_returned;
2214  }
2215 
2216  result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
2217 
2218  /* Break tie if necessary. */
2219  if (result == 0 && pg_locale_deterministic(sss->locale))
2220  result = strcmp(sss->buf1, sss->buf2);
2221 
2222  /* Cache result, perhaps saving an expensive strcoll() call next time */
2223  sss->cache_blob = false;
2224  sss->last_returned = result;
2225  return result;
2226 }
2227 
2228 /*
2229  * Conversion routine for sortsupport. Converts original to abbreviated key
2230  * representation. Our encoding strategy is simple -- pack the first 8 bytes
2231  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2232  * stored in reverse order), and treat it as an unsigned integer. When the "C"
2233  * locale is used, or in case of bytea, just memcpy() from original instead.
2234  */
2235 static Datum
2237 {
2238  const size_t max_prefix_bytes = sizeof(Datum);
2240  VarString *authoritative = DatumGetVarStringPP(original);
2241  char *authoritative_data = VARDATA_ANY(authoritative);
2242 
2243  /* working state */
2244  Datum res;
2245  char *pres;
2246  int len;
2247  uint32 hash;
2248 
2249  pres = (char *) &res;
2250  /* memset(), so any non-overwritten bytes are NUL */
2251  memset(pres, 0, max_prefix_bytes);
2252  len = VARSIZE_ANY_EXHDR(authoritative);
2253 
2254  /* Get number of bytes, ignoring trailing spaces */
2255  if (sss->typid == BPCHAROID)
2256  len = bpchartruelen(authoritative_data, len);
2257 
2258  /*
2259  * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2260  * abbreviate keys. The full comparator for the C locale is always
2261  * memcmp(). It would be incorrect to allow bytea callers (callers that
2262  * always force the C collation -- bytea isn't a collatable type, but this
2263  * approach is convenient) to use strxfrm(). This is because bytea
2264  * strings may contain NUL bytes. Besides, this should be faster, too.
2265  *
2266  * More generally, it's okay that bytea callers can have NUL bytes in
2267  * strings because abbreviated cmp need not make a distinction between
2268  * terminating NUL bytes, and NUL bytes representing actual NULs in the
2269  * authoritative representation. Hopefully a comparison at or past one
2270  * abbreviated key's terminating NUL byte will resolve the comparison
2271  * without consulting the authoritative representation; specifically, some
2272  * later non-NUL byte in the longer string can resolve the comparison
2273  * against a subsequent terminating NUL in the shorter string. There will
2274  * usually be what is effectively a "length-wise" resolution there and
2275  * then.
2276  *
2277  * If that doesn't work out -- if all bytes in the longer string
2278  * positioned at or past the offset of the smaller string's (first)
2279  * terminating NUL are actually representative of NUL bytes in the
2280  * authoritative binary string (perhaps with some *terminating* NUL bytes
2281  * towards the end of the longer string iff it happens to still be small)
2282  * -- then an authoritative tie-breaker will happen, and do the right
2283  * thing: explicitly consider string length.
2284  */
2285  if (sss->collate_c)
2286  memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
2287  else
2288  {
2289  Size bsize;
2290 
2291  /*
2292  * We're not using the C collation, so fall back on strxfrm or ICU
2293  * analogs.
2294  */
2295 
2296  /* By convention, we use buffer 1 to store and NUL-terminate */
2297  if (len >= sss->buflen1)
2298  {
2299  sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2300  sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2301  }
2302 
2303  /* Might be able to reuse strxfrm() blob from last call */
2304  if (sss->last_len1 == len && sss->cache_blob &&
2305  memcmp(sss->buf1, authoritative_data, len) == 0)
2306  {
2307  memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
2308  /* No change affecting cardinality, so no hashing required */
2309  goto done;
2310  }
2311 
2312  memcpy(sss->buf1, authoritative_data, len);
2313 
2314  /*
2315  * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated strings.
2316  */
2317  sss->buf1[len] = '\0';
2318  sss->last_len1 = len;
2319 
2321  {
2322  if (sss->buflen2 < max_prefix_bytes)
2323  {
2324  sss->buflen2 = Max(max_prefix_bytes,
2325  Min(sss->buflen2 * 2, MaxAllocSize));
2326  sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2327  }
2328 
2329  bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
2330  max_prefix_bytes, sss->locale);
2331  sss->last_len2 = bsize;
2332  }
2333  else
2334  {
2335  /*
2336  * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
2337  * again. The pg_strxfrm() function leaves the result buffer
2338  * content undefined if the result did not fit, so we need to
2339  * retry until everything fits, even though we only need the first
2340  * few bytes in the end.
2341  */
2342  for (;;)
2343  {
2344  bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
2345  sss->locale);
2346 
2347  sss->last_len2 = bsize;
2348  if (bsize < sss->buflen2)
2349  break;
2350 
2351  /*
2352  * Grow buffer and retry.
2353  */
2354  sss->buflen2 = Max(bsize + 1,
2355  Min(sss->buflen2 * 2, MaxAllocSize));
2356  sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2357  }
2358  }
2359 
2360  /*
2361  * Every Datum byte is always compared. This is safe because the
2362  * strxfrm() blob is itself NUL terminated, leaving no danger of
2363  * misinterpreting any NUL bytes not intended to be interpreted as
2364  * logically representing termination.
2365  *
2366  * (Actually, even if there were NUL bytes in the blob it would be
2367  * okay. See remarks on bytea case above.)
2368  */
2369  memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
2370  }
2371 
2372  /*
2373  * Maintain approximate cardinality of both abbreviated keys and original,
2374  * authoritative keys using HyperLogLog. Used as cheap insurance against
2375  * the worst case, where we do many string transformations for no saving
2376  * in full strcoll()-based comparisons. These statistics are used by
2377  * varstr_abbrev_abort().
2378  *
2379  * First, Hash key proper, or a significant fraction of it. Mix in length
2380  * in order to compensate for cases where differences are past
2381  * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2382  */
2383  hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2385 
2386  if (len > PG_CACHE_LINE_SIZE)
2388 
2389  addHyperLogLog(&sss->full_card, hash);
2390 
2391  /* Hash abbreviated key */
2392 #if SIZEOF_DATUM == 8
2393  {
2394  uint32 lohalf,
2395  hihalf;
2396 
2397  lohalf = (uint32) res;
2398  hihalf = (uint32) (res >> 32);
2399  hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2400  }
2401 #else /* SIZEOF_DATUM != 8 */
2403 #endif
2404 
2405  addHyperLogLog(&sss->abbr_card, hash);
2406 
2407  /* Cache result, perhaps saving an expensive strxfrm() call next time */
2408  sss->cache_blob = true;
2409 done:
2410 
2411  /*
2412  * Byteswap on little-endian machines.
2413  *
2414  * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
2415  * 3-way comparator) works correctly on all platforms. If we didn't do
2416  * this, the comparator would have to call memcmp() with a pair of
2417  * pointers to the first byte of each abbreviated key, which is slower.
2418  */
2419  res = DatumBigEndianToNative(res);
2420 
2421  /* Don't leak memory here */
2422  if (PointerGetDatum(authoritative) != original)
2423  pfree(authoritative);
2424 
2425  return res;
2426 }
2427 
2428 /*
2429  * Callback for estimating effectiveness of abbreviated key optimization, using
2430  * heuristic rules. Returns value indicating if the abbreviation optimization
2431  * should be aborted, based on its projected effectiveness.
2432  */
2433 static bool
2434 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2435 {
2437  double abbrev_distinct,
2438  key_distinct;
2439 
2440  Assert(ssup->abbreviate);
2441 
2442  /* Have a little patience */
2443  if (memtupcount < 100)
2444  return false;
2445 
2446  abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2447  key_distinct = estimateHyperLogLog(&sss->full_card);
2448 
2449  /*
2450  * Clamp cardinality estimates to at least one distinct value. While
2451  * NULLs are generally disregarded, if only NULL values were seen so far,
2452  * that might misrepresent costs if we failed to clamp.
2453  */
2454  if (abbrev_distinct <= 1.0)
2455  abbrev_distinct = 1.0;
2456 
2457  if (key_distinct <= 1.0)
2458  key_distinct = 1.0;
2459 
2460  /*
2461  * In the worst case all abbreviated keys are identical, while at the same
2462  * time there are differences within full key strings not captured in
2463  * abbreviations.
2464  */
2465 #ifdef TRACE_SORT
2466  if (trace_sort)
2467  {
2468  double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2469 
2470  elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2471  "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2472  memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2473  sss->prop_card);
2474  }
2475 #endif
2476 
2477  /*
2478  * If the number of distinct abbreviated keys approximately matches the
2479  * number of distinct authoritative original keys, that's reason enough to
2480  * proceed. We can win even with a very low cardinality set if most
2481  * tie-breakers only memcmp(). This is by far the most important
2482  * consideration.
2483  *
2484  * While comparisons that are resolved at the abbreviated key level are
2485  * considerably cheaper than tie-breakers resolved with memcmp(), both of
2486  * those two outcomes are so much cheaper than a full strcoll() once
2487  * sorting is underway that it doesn't seem worth it to weigh abbreviated
2488  * cardinality against the overall size of the set in order to more
2489  * accurately model costs. Assume that an abbreviated comparison, and an
2490  * abbreviated comparison with a cheap memcmp()-based authoritative
2491  * resolution are equivalent.
2492  */
2493  if (abbrev_distinct > key_distinct * sss->prop_card)
2494  {
2495  /*
2496  * When we have exceeded 10,000 tuples, decay required cardinality
2497  * aggressively for next call.
2498  *
2499  * This is useful because the number of comparisons required on
2500  * average increases at a linearithmic rate, and at roughly 10,000
2501  * tuples that factor will start to dominate over the linear costs of
2502  * string transformation (this is a conservative estimate). The decay
2503  * rate is chosen to be a little less aggressive than halving -- which
2504  * (since we're called at points at which memtupcount has doubled)
2505  * would never see the cost model actually abort past the first call
2506  * following a decay. This decay rate is mostly a precaution against
2507  * a sudden, violent swing in how well abbreviated cardinality tracks
2508  * full key cardinality. The decay also serves to prevent a marginal
2509  * case from being aborted too late, when too much has already been
2510  * invested in string transformation.
2511  *
2512  * It's possible for sets of several million distinct strings with
2513  * mere tens of thousands of distinct abbreviated keys to still
2514  * benefit very significantly. This will generally occur provided
2515  * each abbreviated key is a proxy for a roughly uniform number of the
2516  * set's full keys. If it isn't so, we hope to catch that early and
2517  * abort. If it isn't caught early, by the time the problem is
2518  * apparent it's probably not worth aborting.
2519  */
2520  if (memtupcount > 10000)
2521  sss->prop_card *= 0.65;
2522 
2523  return false;
2524  }
2525 
2526  /*
2527  * Abort abbreviation strategy.
2528  *
2529  * The worst case, where all abbreviated keys are identical while all
2530  * original strings differ will typically only see a regression of about
2531  * 10% in execution time for small to medium sized lists of strings.
2532  * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2533  * often expect very large improvements, particularly with sets of strings
2534  * of moderately high to high abbreviated cardinality. There is little to
2535  * lose but much to gain, which our strategy reflects.
2536  */
2537 #ifdef TRACE_SORT
2538  if (trace_sort)
2539  elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2540  "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2541  memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2542 #endif
2543 
2544  return true;
2545 }
2546 
2547 /*
2548  * Generic equalimage support function for character type's operator classes.
2549  * Disables the use of deduplication with nondeterministic collations.
2550  */
2551 Datum
2553 {
2554  /* Oid opcintype = PG_GETARG_OID(0); */
2556 
2558 
2559  if (lc_collate_is_c(collid) ||
2560  collid == DEFAULT_COLLATION_OID ||
2562  PG_RETURN_BOOL(true);
2563  else
2564  PG_RETURN_BOOL(false);
2565 }
2566 
2567 Datum
2569 {
2570  text *arg1 = PG_GETARG_TEXT_PP(0);
2571  text *arg2 = PG_GETARG_TEXT_PP(1);
2572  text *result;
2573 
2574  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2575 
2576  PG_RETURN_TEXT_P(result);
2577 }
2578 
2579 Datum
2581 {
2582  text *arg1 = PG_GETARG_TEXT_PP(0);
2583  text *arg2 = PG_GETARG_TEXT_PP(1);
2584  text *result;
2585 
2586  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2587 
2588  PG_RETURN_TEXT_P(result);
2589 }
2590 
2591 
2592 /*
2593  * Cross-type comparison functions for types text and name.
2594  */
2595 
2596 Datum
2598 {
2599  Name arg1 = PG_GETARG_NAME(0);
2600  text *arg2 = PG_GETARG_TEXT_PP(1);
2601  size_t len1 = strlen(NameStr(*arg1));
2602  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2604  bool result;
2605 
2607 
2608  if (collid == C_COLLATION_OID)
2609  result = (len1 == len2 &&
2610  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2611  else
2612  result = (varstr_cmp(NameStr(*arg1), len1,
2613  VARDATA_ANY(arg2), len2,
2614  collid) == 0);
2615 
2616  PG_FREE_IF_COPY(arg2, 1);
2617 
2618  PG_RETURN_BOOL(result);
2619 }
2620 
2621 Datum
2623 {
2624  text *arg1 = PG_GETARG_TEXT_PP(0);
2625  Name arg2 = PG_GETARG_NAME(1);
2626  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2627  size_t len2 = strlen(NameStr(*arg2));
2629  bool result;
2630 
2632 
2633  if (collid == C_COLLATION_OID)
2634  result = (len1 == len2 &&
2635  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2636  else
2637  result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2638  NameStr(*arg2), len2,
2639  collid) == 0);
2640 
2641  PG_FREE_IF_COPY(arg1, 0);
2642 
2643  PG_RETURN_BOOL(result);
2644 }
2645 
2646 Datum
2648 {
2649  Name arg1 = PG_GETARG_NAME(0);
2650  text *arg2 = PG_GETARG_TEXT_PP(1);
2651  size_t len1 = strlen(NameStr(*arg1));
2652  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2654  bool result;
2655 
2657 
2658  if (collid == C_COLLATION_OID)
2659  result = !(len1 == len2 &&
2660  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2661  else
2662  result = !(varstr_cmp(NameStr(*arg1), len1,
2663  VARDATA_ANY(arg2), len2,
2664  collid) == 0);
2665 
2666  PG_FREE_IF_COPY(arg2, 1);
2667 
2668  PG_RETURN_BOOL(result);
2669 }
2670 
2671 Datum
2673 {
2674  text *arg1 = PG_GETARG_TEXT_PP(0);
2675  Name arg2 = PG_GETARG_NAME(1);
2676  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2677  size_t len2 = strlen(NameStr(*arg2));
2679  bool result;
2680 
2682 
2683  if (collid == C_COLLATION_OID)
2684  result = !(len1 == len2 &&
2685  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2686  else
2687  result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2688  NameStr(*arg2), len2,
2689  collid) == 0);
2690 
2691  PG_FREE_IF_COPY(arg1, 0);
2692 
2693  PG_RETURN_BOOL(result);
2694 }
2695 
2696 Datum
2698 {
2699  Name arg1 = PG_GETARG_NAME(0);
2700  text *arg2 = PG_GETARG_TEXT_PP(1);
2701  int32 result;
2702 
2703  result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2704  VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2705  PG_GET_COLLATION());
2706 
2707  PG_FREE_IF_COPY(arg2, 1);
2708 
2709  PG_RETURN_INT32(result);
2710 }
2711 
2712 Datum
2714 {
2715  text *arg1 = PG_GETARG_TEXT_PP(0);
2716  Name arg2 = PG_GETARG_NAME(1);
2717  int32 result;
2718 
2719  result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2720  NameStr(*arg2), strlen(NameStr(*arg2)),
2721  PG_GET_COLLATION());
2722 
2723  PG_FREE_IF_COPY(arg1, 0);
2724 
2725  PG_RETURN_INT32(result);
2726 }
2727 
/*
 * CmpCall(fn): invoke comparison function "fn" on this call's first two
 * argument datums under this call's collation, and yield its int32 result.
 */
#define CmpCall(fn) \
	DatumGetInt32(DirectFunctionCall2Coll((fn), \
										  PG_GET_COLLATION(), \
										  PG_GETARG_DATUM(0), \
										  PG_GETARG_DATUM(1)))
2733 
2734 Datum
2736 {
2738 }
2739 
2740 Datum
2742 {
2744 }
2745 
2746 Datum
2748 {
2750 }
2751 
2752 Datum
2754 {
2756 }
2757 
2758 Datum
2760 {
2762 }
2763 
2764 Datum
2766 {
2768 }
2769 
2770 Datum
2772 {
2774 }
2775 
2776 Datum
2778 {
2780 }
2781 
2782 #undef CmpCall
2783 
2784 
2785 /*
2786  * The following operators support character-by-character comparison
2787  * of text datums, to allow building indexes suitable for LIKE clauses.
2788  * Note that the regular texteq/textne comparison operators, and regular
2789  * support functions 1 and 2 with "C" collation are assumed to be
2790  * compatible with these!
2791  */
2792 
2793 static int
2795 {
2796  int result;
2797  int len1,
2798  len2;
2799 
2800  len1 = VARSIZE_ANY_EXHDR(arg1);
2801  len2 = VARSIZE_ANY_EXHDR(arg2);
2802 
2803  result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2804  if (result != 0)
2805  return result;
2806  else if (len1 < len2)
2807  return -1;
2808  else if (len1 > len2)
2809  return 1;
2810  else
2811  return 0;
2812 }
2813 
2814 
2815 Datum
2817 {
2818  text *arg1 = PG_GETARG_TEXT_PP(0);
2819  text *arg2 = PG_GETARG_TEXT_PP(1);
2820  int result;
2821 
2822  result = internal_text_pattern_compare(arg1, arg2);
2823 
2824  PG_FREE_IF_COPY(arg1, 0);
2825  PG_FREE_IF_COPY(arg2, 1);
2826 
2827  PG_RETURN_BOOL(result < 0);
2828 }
2829 
2830 
2831 Datum
2833 {
2834  text *arg1 = PG_GETARG_TEXT_PP(0);
2835  text *arg2 = PG_GETARG_TEXT_PP(1);
2836  int result;
2837 
2838  result = internal_text_pattern_compare(arg1, arg2);
2839 
2840  PG_FREE_IF_COPY(arg1, 0);
2841  PG_FREE_IF_COPY(arg2, 1);
2842 
2843  PG_RETURN_BOOL(result <= 0);
2844 }
2845 
2846 
2847 Datum
2849 {
2850  text *arg1 = PG_GETARG_TEXT_PP(0);
2851  text *arg2 = PG_GETARG_TEXT_PP(1);
2852  int result;
2853 
2854  result = internal_text_pattern_compare(arg1, arg2);
2855 
2856  PG_FREE_IF_COPY(arg1, 0);
2857  PG_FREE_IF_COPY(arg2, 1);
2858 
2859  PG_RETURN_BOOL(result >= 0);
2860 }
2861 
2862 
2863 Datum
2865 {
2866  text *arg1 = PG_GETARG_TEXT_PP(0);
2867  text *arg2 = PG_GETARG_TEXT_PP(1);
2868  int result;
2869 
2870  result = internal_text_pattern_compare(arg1, arg2);
2871 
2872  PG_FREE_IF_COPY(arg1, 0);
2873  PG_FREE_IF_COPY(arg2, 1);
2874 
2875  PG_RETURN_BOOL(result > 0);
2876 }
2877 
2878 
2879 Datum
2881 {
2882  text *arg1 = PG_GETARG_TEXT_PP(0);
2883  text *arg2 = PG_GETARG_TEXT_PP(1);
2884  int result;
2885 
2886  result = internal_text_pattern_compare(arg1, arg2);
2887 
2888  PG_FREE_IF_COPY(arg1, 0);
2889  PG_FREE_IF_COPY(arg2, 1);
2890 
2891  PG_RETURN_INT32(result);
2892 }
2893 
2894 
2895 Datum
2897 {
2899  MemoryContext oldcontext;
2900 
2901  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2902 
2903  /* Use generic string SortSupport, forcing "C" collation */
2904  varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
2905 
2906  MemoryContextSwitchTo(oldcontext);
2907 
2908  PG_RETURN_VOID();
2909 }
2910 
2911 
2912 /*-------------------------------------------------------------
2913  * byteaoctetlen
2914  *
2915  * get the number of bytes contained in an instance of type 'bytea'
2916  *-------------------------------------------------------------
2917  */
2918 Datum
2920 {
2921  Datum str = PG_GETARG_DATUM(0);
2922 
2923  /* We need not detoast the input at all */
2925 }
2926 
2927 /*
2928  * byteacat -
2929  * takes two bytea* and returns a bytea* that is the concatenation of
2930  * the two.
2931  *
2932  * Cloned from textcat and modified as required.
2933  */
2934 Datum
2936 {
2937  bytea *t1 = PG_GETARG_BYTEA_PP(0);
2938  bytea *t2 = PG_GETARG_BYTEA_PP(1);
2939 
2941 }
2942 
2943 /*
2944  * bytea_catenate
2945  * Guts of byteacat(), broken out so it can be used by other functions
2946  *
2947  * Arguments can be in short-header form, but not compressed or out-of-line
2948  */
2949 static bytea *
2951 {
2952  bytea *result;
2953  int len1,
2954  len2,
2955  len;
2956  char *ptr;
2957 
2958  len1 = VARSIZE_ANY_EXHDR(t1);
2959  len2 = VARSIZE_ANY_EXHDR(t2);
2960 
2961  /* paranoia ... probably should throw error instead? */
2962  if (len1 < 0)
2963  len1 = 0;
2964  if (len2 < 0)
2965  len2 = 0;
2966 
2967  len = len1 + len2 + VARHDRSZ;
2968  result = (bytea *) palloc(len);
2969 
2970  /* Set size of result string... */
2971  SET_VARSIZE(result, len);
2972 
2973  /* Fill data field of result string... */
2974  ptr = VARDATA(result);
2975  if (len1 > 0)
2976  memcpy(ptr, VARDATA_ANY(t1), len1);
2977  if (len2 > 0)
2978  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
2979 
2980  return result;
2981 }
2982 
/* Build a bytea value by running the C string "cstr_" through byteain(). */
#define PG_STR_GET_BYTEA(cstr_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(cstr_)))
2985 
2986 /*
2987  * bytea_substr()
2988  * Return a substring starting at the specified position.
2989  * Cloned from text_substr and modified as required.
2990  *
2991  * Input:
2992  * - string
2993  * - starting position (is one-based)
2994  * - string length (optional)
2995  *
2996  * If the starting position is zero or less, then return from the start of the string
2997  * adjusting the length to be consistent with the "negative start" per SQL.
2998  * If the length is less than zero, an ERROR is thrown. If no third argument
2999  * (length) is provided, the length to the end of the string is assumed.
3000  */
3001 Datum
3003 {
3005  PG_GETARG_INT32(1),
3006  PG_GETARG_INT32(2),
3007  false));
3008 }
3009 
3010 /*
3011  * bytea_substr_no_len -
3012  * Wrapper to avoid opr_sanity failure due to
3013  * one function accepting a different number of args.
3014  */
3015 Datum
3017 {
3019  PG_GETARG_INT32(1),
3020  -1,
3021  true));
3022 }
3023 
3024 static bytea *
3026  int S,
3027  int L,
3028  bool length_not_specified)
3029 {
3030  int32 S1; /* adjusted start position */
3031  int32 L1; /* adjusted substring length */
3032  int32 E; /* end position */
3033 
3034  /*
3035  * The logic here should generally match text_substring().
3036  */
3037  S1 = Max(S, 1);
3038 
3039  if (length_not_specified)
3040  {
3041  /*
3042  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3043  * end of the string if we pass it a negative value for length.
3044  */
3045  L1 = -1;
3046  }
3047  else if (L < 0)
3048  {
3049  /* SQL99 says to throw an error for E < S, i.e., negative length */
3050  ereport(ERROR,
3051  (errcode(ERRCODE_SUBSTRING_ERROR),
3052  errmsg("negative substring length not allowed")));
3053  L1 = -1; /* silence stupider compilers */
3054  }
3055  else if (pg_add_s32_overflow(S, L, &E))
3056  {
3057  /*
3058  * L could be large enough for S + L to overflow, in which case the
3059  * substring must run to end of string.
3060  */
3061  L1 = -1;
3062  }
3063  else
3064  {
3065  /*
3066  * A zero or negative value for the end position can happen if the
3067  * start was negative or one. SQL99 says to return a zero-length
3068  * string.
3069  */
3070  if (E < 1)
3071  return PG_STR_GET_BYTEA("");
3072 
3073  L1 = E - S1;
3074  }
3075 
3076  /*
3077  * If the start position is past the end of the string, SQL99 says to
3078  * return a zero-length string -- DatumGetByteaPSlice() will do that for
3079  * us. We need only convert S1 to zero-based starting position.
3080  */
3081  return DatumGetByteaPSlice(str, S1 - 1, L1);
3082 }
3083 
3084 /*
3085  * byteaoverlay
3086  * Replace specified substring of first string with second
3087  *
3088  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3089  * This code is a direct implementation of what the standard says.
3090  */
3091 Datum
3093 {
3094  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3095  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3096  int sp = PG_GETARG_INT32(2); /* substring start position */
3097  int sl = PG_GETARG_INT32(3); /* substring length */
3098 
3099  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3100 }
3101 
3102 Datum
3104 {
3105  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3106  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3107  int sp = PG_GETARG_INT32(2); /* substring start position */
3108  int sl;
3109 
3110  sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3111  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3112 }
3113 
3114 static bytea *
3115 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3116 {
3117  bytea *result;
3118  bytea *s1;
3119  bytea *s2;
3120  int sp_pl_sl;
3121 
3122  /*
3123  * Check for possible integer-overflow cases. For negative sp, throw a
3124  * "substring length" error because that's what should be expected
3125  * according to the spec's definition of OVERLAY().
3126  */
3127  if (sp <= 0)
3128  ereport(ERROR,
3129  (errcode(ERRCODE_SUBSTRING_ERROR),
3130  errmsg("negative substring length not allowed")));
3131  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3132  ereport(ERROR,
3133  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3134  errmsg("integer out of range")));
3135 
3136  s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3137  s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3138  result = bytea_catenate(s1, t2);
3139  result = bytea_catenate(result, s2);
3140 
3141  return result;
3142 }
3143 
3144 /*
3145  * bit_count
3146  */
3147 Datum
3149 {
3150  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3151 
3153 }
3154 
3155 /*
3156  * byteapos -
3157  * Return the position of the specified substring.
3158  * Implements the SQL POSITION() function.
3159  * Cloned from textpos and modified as required.
3160  */
3161 Datum
3163 {
3164  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3165  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3166  int pos;
3167  int px,
3168  p;
3169  int len1,
3170  len2;
3171  char *p1,
3172  *p2;
3173 
3174  len1 = VARSIZE_ANY_EXHDR(t1);
3175  len2 = VARSIZE_ANY_EXHDR(t2);
3176 
3177  if (len2 <= 0)
3178  PG_RETURN_INT32(1); /* result for empty pattern */
3179 
3180  p1 = VARDATA_ANY(t1);
3181  p2 = VARDATA_ANY(t2);
3182 
3183  pos = 0;
3184  px = (len1 - len2);
3185  for (p = 0; p <= px; p++)
3186  {
3187  if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3188  {
3189  pos = p + 1;
3190  break;
3191  };
3192  p1++;
3193  };
3194 
3195  PG_RETURN_INT32(pos);
3196 }
3197 
3198 /*-------------------------------------------------------------
3199  * byteaGetByte
3200  *
3201  * this routine treats "bytea" as an array of bytes.
3202  * It returns the Nth byte (a number between 0 and 255).
3203  *-------------------------------------------------------------
3204  */
3205 Datum
3207 {
3208  bytea *v = PG_GETARG_BYTEA_PP(0);
3209  int32 n = PG_GETARG_INT32(1);
3210  int len;
3211  int byte;
3212 
3213  len = VARSIZE_ANY_EXHDR(v);
3214 
3215  if (n < 0 || n >= len)
3216  ereport(ERROR,
3217  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3218  errmsg("index %d out of valid range, 0..%d",
3219  n, len - 1)));
3220 
3221  byte = ((unsigned char *) VARDATA_ANY(v))[n];
3222 
3223  PG_RETURN_INT32(byte);
3224 }
3225 
3226 /*-------------------------------------------------------------
3227  * byteaGetBit
3228  *
3229  * This routine treats a "bytea" type like an array of bits.
3230  * It returns the value of the Nth bit (0 or 1).
3231  *
3232  *-------------------------------------------------------------
3233  */
3234 Datum
3236 {
3237  bytea *v = PG_GETARG_BYTEA_PP(0);
3238  int64 n = PG_GETARG_INT64(1);
3239  int byteNo,
3240  bitNo;
3241  int len;
3242  int byte;
3243 
3244  len = VARSIZE_ANY_EXHDR(v);
3245 
3246  if (n < 0 || n >= (int64) len * 8)
3247  ereport(ERROR,
3248  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3249  errmsg("index %lld out of valid range, 0..%lld",
3250  (long long) n, (long long) len * 8 - 1)));
3251 
3252  /* n/8 is now known < len, so safe to cast to int */
3253  byteNo = (int) (n / 8);
3254  bitNo = (int) (n % 8);
3255 
3256  byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3257 
3258  if (byte & (1 << bitNo))
3259  PG_RETURN_INT32(1);
3260  else
3261  PG_RETURN_INT32(0);
3262 }
3263 
3264 /*-------------------------------------------------------------
3265  * byteaSetByte
3266  *
3267  * Given an instance of type 'bytea' creates a new one with
3268  * the Nth byte set to the given value.
3269  *
3270  *-------------------------------------------------------------
3271  */
3272 Datum
3274 {
3276  int32 n = PG_GETARG_INT32(1);
3277  int32 newByte = PG_GETARG_INT32(2);
3278  int len;
3279 
3280  len = VARSIZE(res) - VARHDRSZ;
3281 
3282  if (n < 0 || n >= len)
3283  ereport(ERROR,
3284  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3285  errmsg("index %d out of valid range, 0..%d",
3286  n, len - 1)));
3287 
3288  /*
3289  * Now set the byte.
3290  */
3291  ((unsigned char *) VARDATA(res))[n] = newByte;
3292 
3294 }
3295 
3296 /*-------------------------------------------------------------
3297  * byteaSetBit
3298  *
3299  * Given an instance of type 'bytea' creates a new one with
3300  * the Nth bit set to the given value.
3301  *
3302  *-------------------------------------------------------------
3303  */
3304 Datum
3306 {
3308  int64 n = PG_GETARG_INT64(1);
3309  int32 newBit = PG_GETARG_INT32(2);
3310  int len;
3311  int oldByte,
3312  newByte;
3313  int byteNo,
3314  bitNo;
3315 
3316  len = VARSIZE(res) - VARHDRSZ;
3317 
3318  if (n < 0 || n >= (int64) len * 8)
3319  ereport(ERROR,
3320  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3321  errmsg("index %lld out of valid range, 0..%lld",
3322  (long long) n, (long long) len * 8 - 1)));
3323 
3324  /* n/8 is now known < len, so safe to cast to int */
3325  byteNo = (int) (n / 8);
3326  bitNo = (int) (n % 8);
3327 
3328  /*
3329  * sanity check!
3330  */
3331  if (newBit != 0 && newBit != 1)
3332  ereport(ERROR,
3333  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3334  errmsg("new bit must be 0 or 1")));
3335 
3336  /*
3337  * Update the byte.
3338  */
3339  oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3340 
3341  if (newBit == 0)
3342  newByte = oldByte & (~(1 << bitNo));
3343  else
3344  newByte = oldByte | (1 << bitNo);
3345 
3346  ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3347 
3349 }
3350 
3351 
3352 /* text_name()
3353  * Converts a text type to a Name type.
3354  */
3355 Datum
3357 {
3358  text *s = PG_GETARG_TEXT_PP(0);
3359  Name result;
3360  int len;
3361 
3362  len = VARSIZE_ANY_EXHDR(s);
3363 
3364  /* Truncate oversize input */
3365  if (len >= NAMEDATALEN)
3367 
3368  /* We use palloc0 here to ensure result is zero-padded */
3369  result = (Name) palloc0(NAMEDATALEN);
3370  memcpy(NameStr(*result), VARDATA_ANY(s), len);
3371 
3372  PG_RETURN_NAME(result);
3373 }
3374 
3375 /* name_text()
3376  * Converts a Name type to a text type.
3377  */
3378 Datum
3380 {
3381  Name s = PG_GETARG_NAME(0);
3382 
3384 }
3385 
3386 
3387 /*
3388  * textToQualifiedNameList - convert a text object to list of names
3389  *
3390  * This implements the input parsing needed by nextval() and other
3391  * functions that take a text parameter representing a qualified name.
3392  * We split the name at dots, downcase if not double-quoted, and
3393  * truncate names if they're too long.
3394  */
3395 List *
3397 {
3398  char *rawname;
3399  List *result = NIL;
3400  List *namelist;
3401  ListCell *l;
3402 
3403  /* Convert to C string (handles possible detoasting). */
3404  /* Note we rely on being able to modify rawname below. */
3405  rawname = text_to_cstring(textval);
3406 
3407  if (!SplitIdentifierString(rawname, '.', &namelist))
3408  ereport(ERROR,
3409  (errcode(ERRCODE_INVALID_NAME),
3410  errmsg("invalid name syntax")));
3411 
3412  if (namelist == NIL)
3413  ereport(ERROR,
3414  (errcode(ERRCODE_INVALID_NAME),
3415  errmsg("invalid name syntax")));
3416 
3417  foreach(l, namelist)
3418  {
3419  char *curname = (char *) lfirst(l);
3420 
3421  result = lappend(result, makeString(pstrdup(curname)));
3422  }
3423 
3424  pfree(rawname);
3425  list_free(namelist);
3426 
3427  return result;
3428 }
3429 
3430 /*
3431  * SplitIdentifierString --- parse a string containing identifiers
3432  *
3433  * This is the guts of textToQualifiedNameList, and is exported for use in
3434  * other situations such as parsing GUC variables. In the GUC case, it's
3435  * important to avoid memory leaks, so the API is designed to minimize the
3436  * amount of stuff that needs to be allocated and freed.
3437  *
3438  * Inputs:
3439  * rawstring: the input string; must be overwritable! On return, it's
3440  * been modified to contain the separated identifiers.
3441  * separator: the separator punctuation expected between identifiers
3442  * (typically '.' or ','). Whitespace may also appear around
3443  * identifiers.
3444  * Outputs:
3445  * namelist: filled with a palloc'd list of pointers to identifiers within
3446  * rawstring. Caller should list_free() this even on error return.
3447  *
3448  * Returns true if okay, false if there is a syntax error in the string.
3449  *
3450  * Note that an empty string is considered okay here, though not in
3451  * textToQualifiedNameList.
3452  */
3453 bool
3454 SplitIdentifierString(char *rawstring, char separator,
3455  List **namelist)
3456 {
3457  char *nextp = rawstring;
3458  bool done = false;
3459 
3460  *namelist = NIL;
3461 
3462  while (scanner_isspace(*nextp))
3463  nextp++; /* skip leading whitespace */
3464 
3465  if (*nextp == '\0')
3466  return true; /* allow empty string */
3467 
3468  /* At the top of the loop, we are at start of a new identifier. */
3469  do
3470  {
3471  char *curname;
3472  char *endp;
3473 
3474  if (*nextp == '"')
3475  {
3476  /* Quoted name --- collapse quote-quote pairs, no downcasing */
3477  curname = nextp + 1;
3478  for (;;)
3479  {
3480  endp = strchr(nextp + 1, '"');
3481  if (endp == NULL)
3482  return false; /* mismatched quotes */
3483  if (endp[1] != '"')
3484  break; /* found end of quoted name */
3485  /* Collapse adjacent quotes into one quote, and look again */
3486  memmove(endp, endp + 1, strlen(endp));
3487  nextp = endp;
3488  }
3489  /* endp now points at the terminating quote */
3490  nextp = endp + 1;
3491  }
3492  else
3493  {
3494  /* Unquoted name --- extends to separator or whitespace */
3495  char *downname;
3496  int len;
3497 
3498  curname = nextp;
3499  while (*nextp && *nextp != separator &&
3500  !scanner_isspace(*nextp))
3501  nextp++;
3502  endp = nextp;
3503  if (curname == nextp)
3504  return false; /* empty unquoted name not allowed */
3505 
3506  /*
3507  * Downcase the identifier, using same code as main lexer does.
3508  *
3509  * XXX because we want to overwrite the input in-place, we cannot
3510  * support a downcasing transformation that increases the string
3511  * length. This is not a problem given the current implementation
3512  * of downcase_truncate_identifier, but we'll probably have to do
3513  * something about this someday.
3514  */
3515  len = endp - curname;
3516  downname = downcase_truncate_identifier(curname, len, false);
3517  Assert(strlen(downname) <= len);
3518  strncpy(curname, downname, len); /* strncpy is required here */
3519  pfree(downname);
3520  }
3521 
3522  while (scanner_isspace(*nextp))
3523  nextp++; /* skip trailing whitespace */
3524 
3525  if (*nextp == separator)
3526  {
3527  nextp++;
3528  while (scanner_isspace(*nextp))
3529  nextp++; /* skip leading whitespace for next */
3530  /* we expect another name, so done remains false */
3531  }
3532  else if (*nextp == '\0')
3533  done = true;
3534  else
3535  return false; /* invalid syntax */
3536 
3537  /* Now safe to overwrite separator with a null */
3538  *endp = '\0';
3539 
3540  /* Truncate name if it's overlength */
3541  truncate_identifier(curname, strlen(curname), false);
3542 
3543  /*
3544  * Finished isolating current name --- add it to list
3545  */
3546  *namelist = lappend(*namelist, curname);
3547 
3548  /* Loop back if we didn't reach end of string */
3549  } while (!done);
3550 
3551  return true;
3552 }
3553 
3554 
3555 /*
3556  * SplitDirectoriesString --- parse a string containing file/directory names
3557  *
3558  * This works fine on file names too; the function name is historical.
3559  *
3560  * This is similar to SplitIdentifierString, except that the parsing
3561  * rules are meant to handle pathnames instead of identifiers: there is
3562  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3563  * and we apply canonicalize_path() to each extracted string. Because of the
3564  * last, the returned strings are separately palloc'd rather than being
3565  * pointers into rawstring --- but we still scribble on rawstring.
3566  *
3567  * Inputs:
3568  * rawstring: the input string; must be modifiable!
3569  * separator: the separator punctuation expected between directories
3570  * (typically ',' or ';'). Whitespace may also appear around
3571  * directories.
3572  * Outputs:
3573  * namelist: filled with a palloc'd list of directory names.
3574  * Caller should list_free_deep() this even on error return.
3575  *
3576  * Returns true if okay, false if there is a syntax error in the string.
3577  *
3578  * Note that an empty string is considered okay here.
3579  */
3580 bool
3581 SplitDirectoriesString(char *rawstring, char separator,
3582  List **namelist)
3583 {
3584  char *nextp = rawstring;
3585  bool done = false;
3586 
3587  *namelist = NIL;
3588 
3589  while (scanner_isspace(*nextp))
3590  nextp++; /* skip leading whitespace */
3591 
3592  if (*nextp == '\0')
3593  return true; /* allow empty string */
3594 
3595  /* At the top of the loop, we are at start of a new directory. */
3596  do
3597  {
3598  char *curname;
3599  char *endp;
3600 
3601  if (*nextp == '"')
3602  {
3603  /* Quoted name --- collapse quote-quote pairs */
3604  curname = nextp + 1;
3605  for (;;)
3606  {
3607  endp = strchr(nextp + 1, '"');
3608  if (endp == NULL)
3609  return false; /* mismatched quotes */
3610  if (endp[1] != '"')
3611  break; /* found end of quoted name */
3612  /* Collapse adjacent quotes into one quote, and look again */
3613  memmove(endp, endp + 1, strlen(endp));
3614  nextp = endp;
3615  }
3616  /* endp now points at the terminating quote */
3617  nextp = endp + 1;
3618  }
3619  else
3620  {
3621  /* Unquoted name --- extends to separator or end of string */
3622  curname = endp = nextp;
3623  while (*nextp && *nextp != separator)
3624  {
3625  /* trailing whitespace should not be included in name */
3626  if (!scanner_isspace(*nextp))
3627  endp = nextp + 1;
3628  nextp++;
3629  }
3630  if (curname == endp)
3631  return false; /* empty unquoted name not allowed */
3632  }
3633 
3634  while (scanner_isspace(*nextp))
3635  nextp++; /* skip trailing whitespace */
3636 
3637  if (*nextp == separator)
3638  {
3639  nextp++;
3640  while (scanner_isspace(*nextp))
3641  nextp++; /* skip leading whitespace for next */
3642  /* we expect another name, so done remains false */
3643  }
3644  else if (*nextp == '\0')
3645  done = true;
3646  else
3647  return false; /* invalid syntax */
3648 
3649  /* Now safe to overwrite separator with a null */
3650  *endp = '\0';
3651 
3652  /* Truncate path if it's overlength */
3653  if (strlen(curname) >= MAXPGPATH)
3654  curname[MAXPGPATH - 1] = '\0';
3655 
3656  /*
3657  * Finished isolating current name --- add it to list
3658  */
3659  curname = pstrdup(curname);
3660  canonicalize_path(curname);
3661  *namelist = lappend(*namelist, curname);
3662 
3663  /* Loop back if we didn't reach end of string */
3664  } while (!done);
3665 
3666  return true;
3667 }
3668 
3669 
3670 /*
3671  * SplitGUCList --- parse a string containing identifiers or file names
3672  *
3673  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3674  * presuming whether the elements will be taken as identifiers or file names.
3675  * We assume the input has already been through flatten_set_variable_args(),
3676  * so that we need never downcase (if appropriate, that was done already).
3677  * Nor do we ever truncate, since we don't know the correct max length.
3678  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3679  * because any embedded whitespace should have led to double-quoting).
3680  * Otherwise the API is identical to SplitIdentifierString.
3681  *
3682  * XXX it's annoying to have so many copies of this string-splitting logic.
3683  * However, it's not clear that having one function with a bunch of option
3684  * flags would be much better.
3685  *
3686  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3687  * Be sure to update that if you have to change this.
3688  *
3689  * Inputs:
3690  * rawstring: the input string; must be overwritable! On return, it's
3691  * been modified to contain the separated identifiers.
3692  * separator: the separator punctuation expected between identifiers
3693  * (typically '.' or ','). Whitespace may also appear around
3694  * identifiers.
3695  * Outputs:
3696  * namelist: filled with a palloc'd list of pointers to identifiers within
3697  * rawstring. Caller should list_free() this even on error return.
3698  *
3699  * Returns true if okay, false if there is a syntax error in the string.
3700  */
3701 bool
3702 SplitGUCList(char *rawstring, char separator,
3703  List **namelist)
3704 {
3705  char *nextp = rawstring;
3706  bool done = false;
3707 
3708  *namelist = NIL;
3709 
3710  while (scanner_isspace(*nextp))
3711  nextp++; /* skip leading whitespace */
3712 
3713  if (*nextp == '\0')
3714  return true; /* allow empty string */
3715 
3716  /* At the top of the loop, we are at start of a new identifier. */
3717  do
3718  {
3719  char *curname;
3720  char *endp;
3721 
3722  if (*nextp == '"')
3723  {
3724  /* Quoted name --- collapse quote-quote pairs */
3725  curname = nextp + 1;
3726  for (;;)
3727  {
3728  endp = strchr(nextp + 1, '"');
3729  if (endp == NULL)
3730  return false; /* mismatched quotes */
3731  if (endp[1] != '"')
3732  break; /* found end of quoted name */
3733  /* Collapse adjacent quotes into one quote, and look again */
3734  memmove(endp, endp + 1, strlen(endp));
3735  nextp = endp;
3736  }
3737  /* endp now points at the terminating quote */
3738  nextp = endp + 1;
3739  }
3740  else
3741  {
3742  /* Unquoted name --- extends to separator or whitespace */
3743  curname = nextp;
3744  while (*nextp && *nextp != separator &&
3745  !scanner_isspace(*nextp))
3746  nextp++;
3747  endp = nextp;
3748  if (curname == nextp)
3749  return false; /* empty unquoted name not allowed */
3750  }
3751 
3752  while (scanner_isspace(*nextp))
3753  nextp++; /* skip trailing whitespace */
3754 
3755  if (*nextp == separator)
3756  {
3757  nextp++;
3758  while (scanner_isspace(*nextp))
3759  nextp++; /* skip leading whitespace for next */
3760  /* we expect another name, so done remains false */
3761  }
3762  else if (*nextp == '\0')
3763  done = true;
3764  else
3765  return false; /* invalid syntax */
3766 
3767  /* Now safe to overwrite separator with a null */
3768  *endp = '\0';
3769 
3770  /*
3771  * Finished isolating current name --- add it to list
3772  */
3773  *namelist = lappend(*namelist, curname);
3774 
3775  /* Loop back if we didn't reach end of string */
3776  } while (!done);
3777 
3778  return true;
3779 }
3780 
3781 
3782 /*****************************************************************************
3783  * Comparison Functions used for bytea
3784  *
3785  * Note: btree indexes need these routines not to leak memory; therefore,
3786  * be careful to free working copies of toasted datums. Most places don't
3787  * need to be so careful.
3788  *****************************************************************************/
3789 
3790 Datum
3792 {
3793  Datum arg1 = PG_GETARG_DATUM(0);
3794  Datum arg2 = PG_GETARG_DATUM(1);
3795  bool result;
3796  Size len1,
3797  len2;
3798 
3799  /*
3800  * We can use a fast path for unequal lengths, which might save us from
3801  * having to detoast one or both values.
3802  */
3803  len1 = toast_raw_datum_size(arg1);
3804  len2 = toast_raw_datum_size(arg2);
3805  if (len1 != len2)
3806  result = false;
3807  else
3808  {
3809  bytea *barg1 = DatumGetByteaPP(arg1);
3810  bytea *barg2 = DatumGetByteaPP(arg2);
3811 
3812  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3813  len1 - VARHDRSZ) == 0);
3814 
3815  PG_FREE_IF_COPY(barg1, 0);
3816  PG_FREE_IF_COPY(barg2, 1);
3817  }
3818 
3819  PG_RETURN_BOOL(result);
3820 }
3821 
3822 Datum
3824 {
3825  Datum arg1 = PG_GETARG_DATUM(0);
3826  Datum arg2 = PG_GETARG_DATUM(1);
3827  bool result;
3828  Size len1,
3829  len2;
3830 
3831  /*
3832  * We can use a fast path for unequal lengths, which might save us from
3833  * having to detoast one or both values.
3834  */
3835  len1 = toast_raw_datum_size(arg1);
3836  len2 = toast_raw_datum_size(arg2);
3837  if (len1 != len2)
3838  result = true;
3839  else
3840  {
3841  bytea *barg1 = DatumGetByteaPP(arg1);
3842  bytea *barg2 = DatumGetByteaPP(arg2);
3843 
3844  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3845  len1 - VARHDRSZ) != 0);
3846 
3847  PG_FREE_IF_COPY(barg1, 0);
3848  PG_FREE_IF_COPY(barg2, 1);
3849  }
3850 
3851  PG_RETURN_BOOL(result);
3852 }
3853 
3854 Datum
3856 {
3857  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3858  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3859  int len1,
3860  len2;
3861  int cmp;
3862 
3863  len1 = VARSIZE_ANY_EXHDR(arg1);
3864  len2 = VARSIZE_ANY_EXHDR(arg2);
3865 
3866  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3867 
3868  PG_FREE_IF_COPY(arg1, 0);
3869  PG_FREE_IF_COPY(arg2, 1);
3870 
3871  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3872 }
3873 
3874 Datum
3876 {
3877  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3878  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3879  int len1,
3880  len2;
3881  int cmp;
3882 
3883  len1 = VARSIZE_ANY_EXHDR(arg1);
3884  len2 = VARSIZE_ANY_EXHDR(arg2);
3885 
3886  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3887 
3888  PG_FREE_IF_COPY(arg1, 0);
3889  PG_FREE_IF_COPY(arg2, 1);
3890 
3891  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3892 }
3893 
3894 Datum
3896 {
3897  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3898  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3899  int len1,
3900  len2;
3901  int cmp;
3902 
3903  len1 = VARSIZE_ANY_EXHDR(arg1);
3904  len2 = VARSIZE_ANY_EXHDR(arg2);
3905 
3906  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3907 
3908  PG_FREE_IF_COPY(arg1, 0);
3909  PG_FREE_IF_COPY(arg2, 1);
3910 
3911  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3912 }
3913 
3914 Datum
3916 {
3917  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3918  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3919  int len1,
3920  len2;
3921  int cmp;
3922 
3923  len1 = VARSIZE_ANY_EXHDR(arg1);
3924  len2 = VARSIZE_ANY_EXHDR(arg2);
3925 
3926  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3927 
3928  PG_FREE_IF_COPY(arg1, 0);
3929  PG_FREE_IF_COPY(arg2, 1);
3930 
3931  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3932 }
3933 
3934 Datum
3936 {
3937  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3938  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3939  int len1,
3940  len2;
3941  int cmp;
3942 
3943  len1 = VARSIZE_ANY_EXHDR(arg1);
3944  len2 = VARSIZE_ANY_EXHDR(arg2);
3945 
3946  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3947  if ((cmp == 0) && (len1 != len2))
3948  cmp = (len1 < len2) ? -1 : 1;
3949 
3950  PG_FREE_IF_COPY(arg1, 0);
3951  PG_FREE_IF_COPY(arg2, 1);
3952 
3954 }
3955 
3956 Datum
3958 {
3960  MemoryContext oldcontext;
3961 
3962  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3963 
3964  /* Use generic string SortSupport, forcing "C" collation */
3965  varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
3966 
3967  MemoryContextSwitchTo(oldcontext);
3968 
3969  PG_RETURN_VOID();
3970 }
3971 
3972 /*
3973  * appendStringInfoText
3974  *
3975  * Append a text to str.
3976  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3977  */
3978 static void
3980 {
3982 }
3983 
3984 /*
3985  * replace_text
3986  * replace all occurrences of 'old_sub_str' in 'orig_str'
3987  * with 'new_sub_str' to form 'new_str'
3988  *
3989  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
3990  * otherwise returns 'new_str'
3991  */
3992 Datum
3994 {
3995  text *src_text = PG_GETARG_TEXT_PP(0);
3996  text *from_sub_text = PG_GETARG_TEXT_PP(1);
3997  text *to_sub_text = PG_GETARG_TEXT_PP(2);
3998  int src_text_len;
3999  int from_sub_text_len;
4001  text *ret_text;
4002  int chunk_len;
4003  char *curr_ptr;
4004  char *start_ptr;
4006  bool found;
4007 
4008  src_text_len = VARSIZE_ANY_EXHDR(src_text);
4009  from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4010 
4011  /* Return unmodified source string if empty source or pattern */
4012  if (src_text_len < 1 || from_sub_text_len < 1)
4013  {
4014  PG_RETURN_TEXT_P(src_text);
4015  }
4016 
4017  text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4018 
4019  found = text_position_next(&state);
4020 
4021  /* When the from_sub_text is not found, there is nothing to do. */
4022  if (!found)
4023  {
4025  PG_RETURN_TEXT_P(src_text);
4026  }
4027  curr_ptr = text_position_get_match_ptr(&state);
4028  start_ptr = VARDATA_ANY(src_text);
4029 
4030  initStringInfo(&str);
4031 
4032  do
4033  {
4035 
4036  /* copy the data skipped over by last text_position_next() */
4037  chunk_len = curr_ptr - start_ptr;
4038  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4039 
4040  appendStringInfoText(&str, to_sub_text);
4041 
4042  start_ptr = curr_ptr + from_sub_text_len;
4043 
4044  found = text_position_next(&state);
4045  if (found)
4046  curr_ptr = text_position_get_match_ptr(&state);
4047  }
4048  while (found);
4049 
4050  /* copy trailing data */
4051  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4052  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4053 
4055 
4056  ret_text = cstring_to_text_with_len(str.data, str.len);
4057  pfree(str.data);
4058 
4059  PG_RETURN_TEXT_P(ret_text);
4060 }
4061 
4062 /*
4063  * check_replace_text_has_escape
4064  *
4065  * Returns 0 if text contains no backslashes that need processing.
4066  * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4067  * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4068  */
4069 static int
4071 {
4072  int result = 0;
4073  const char *p = VARDATA_ANY(replace_text);
4074  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4075 
4076  while (p < p_end)
4077  {
4078  /* Find next escape char, if any. */
4079  p = memchr(p, '\\', p_end - p);
4080  if (p == NULL)
4081  break;
4082  p++;
4083  /* Note: a backslash at the end doesn't require extra processing. */
4084  if (p < p_end)
4085  {
4086  if (*p >= '1' && *p <= '9')
4087  return 2; /* Found a submatch specifier, so done */
4088  result = 1; /* Found some other sequence, keep looking */
4089  p++;
4090  }
4091  }
4092  return result;
4093 }
4094 
4095 /*
4096  * appendStringInfoRegexpSubstr
4097  *
4098  * Append replace_text to str, substituting regexp back references for
4099  * \n escapes. start_ptr is the start of the match in the source string,
4100  * at logical character position data_pos.
4101  */
4102 static void
4104  regmatch_t *pmatch,
4105  char *start_ptr, int data_pos)
4106 {
4107  const char *p = VARDATA_ANY(replace_text);
4108  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4109 
4110  while (p < p_end)
4111  {
4112  const char *chunk_start = p;
4113  int so;
4114  int eo;
4115 
4116  /* Find next escape char, if any. */
4117  p = memchr(p, '\\', p_end - p);
4118  if (p == NULL)
4119  p = p_end;
4120 
4121  /* Copy the text we just scanned over, if any. */
4122  if (p > chunk_start)
4123  appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4124 
4125  /* Done if at end of string, else advance over escape char. */
4126  if (p >= p_end)
4127  break;
4128  p++;
4129 
4130  if (p >= p_end)
4131  {
4132  /* Escape at very end of input. Treat same as unexpected char */
4133  appendStringInfoChar(str, '\\');
4134  break;
4135  }
4136 
4137  if (*p >= '1' && *p <= '9')
4138  {
4139  /* Use the back reference of regexp. */
4140  int idx = *p - '0';
4141 
4142  so = pmatch[idx].rm_so;
4143  eo = pmatch[idx].rm_eo;
4144  p++;
4145  }
4146  else if (*p == '&')
4147  {
4148  /* Use the entire matched string. */
4149  so = pmatch[0].rm_so;
4150  eo = pmatch[0].rm_eo;
4151  p++;
4152  }
4153  else if (*p == '\\')
4154  {
4155  /* \\ means transfer one \ to output. */
4156  appendStringInfoChar(str, '\\');
4157  p++;
4158  continue;
4159  }
4160  else
4161  {
4162  /*
4163  * If escape char is not followed by any expected char, just treat
4164  * it as ordinary data to copy. (XXX would it be better to throw
4165  * an error?)
4166  */
4167  appendStringInfoChar(str, '\\');
4168  continue;
4169  }
4170 
4171  if (so >= 0 && eo >= 0)
4172  {
4173  /*
4174  * Copy the text that is back reference of regexp. Note so and eo
4175  * are counted in characters not bytes.
4176  */
4177  char *chunk_start;
4178  int chunk_len;
4179 
4180  Assert(so >= data_pos);
4181  chunk_start = start_ptr;
4182  chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4183  chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4184  appendBinaryStringInfo(str, chunk_start, chunk_len);
4185  }
4186  }
4187 }
4188 
4189 /*
4190  * replace_text_regexp
4191  *
4192  * replace substring(s) in src_text that match pattern with replace_text.
4193  * The replace_text can contain backslash markers to substitute
4194  * (parts of) the matched text.
4195  *
4196  * cflags: regexp compile flags.
4197  * collation: collation to use.
4198  * search_start: the character (not byte) offset in src_text at which to
4199  * begin searching.
4200  * n: if 0, replace all matches; if > 0, replace only the N'th match.
4201  */
4202 text *
4203 replace_text_regexp(text *src_text, text *pattern_text,
4204  text *replace_text,
4205  int cflags, Oid collation,
4206  int search_start, int n)
4207 {
4208  text *ret_text;
4209  regex_t *re;
4210  int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4211  int nmatches = 0;
4213  regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
4214  int nmatch = lengthof(pmatch);
4215  pg_wchar *data;
4216  size_t data_len;
4217  int data_pos;
4218  char *start_ptr;
4219  int escape_status;
4220 
4221  initStringInfo(&buf);
4222 
4223  /* Convert data string to wide characters. */
4224  data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4225  data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4226 
4227  /* Check whether replace_text has escapes, especially regexp submatches. */
4229 
4230  /* If no regexp submatches, we can use REG_NOSUB. */
4231  if (escape_status < 2)
4232  {
4233  cflags |= REG_NOSUB;
4234  /* Also tell pg_regexec we only want the whole-match location. */
4235  nmatch = 1;
4236  }
4237 
4238  /* Prepare the regexp. */
4239  re = RE_compile_and_cache(pattern_text, cflags, collation);
4240 
4241  /* start_ptr points to the data_pos'th character of src_text */
4242  start_ptr = (char *) VARDATA_ANY(src_text);
4243  data_pos = 0;
4244 
4245  while (search_start <= data_len)
4246  {
4247  int regexec_result;
4248 
4250 
4251  regexec_result = pg_regexec(re,
4252  data,
4253  data_len,
4254  search_start,
4255  NULL, /* no details */
4256  nmatch,
4257  pmatch,
4258  0);
4259 
4260  if (regexec_result == REG_NOMATCH)
4261  break;
4262 
4263  if (regexec_result != REG_OKAY)
4264  {
4265  char errMsg[100];
4266 
4267  pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4268  ereport(ERROR,
4269  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4270  errmsg("regular expression failed: %s", errMsg)));
4271  }
4272 
4273  /*
4274  * Count matches, and decide whether to replace this match.
4275  */
4276  nmatches++;
4277  if (n > 0 && nmatches != n)
4278  {
4279  /*
4280  * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4281  * we treat the matched text as if it weren't matched, and copy it
4282  * to the output later.)
4283  */
4284  search_start = pmatch[0].rm_eo;
4285  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4286  search_start++;
4287  continue;
4288  }
4289 
4290  /*
4291  * Copy the text to the left of the match position. Note we are given
4292  * character not byte indexes.
4293  */
4294  if (pmatch[0].rm_so - data_pos > 0)
4295  {
4296  int chunk_len;
4297 
4298  chunk_len = charlen_to_bytelen(start_ptr,
4299  pmatch[0].rm_so - data_pos);
4300  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4301 
4302  /*
4303  * Advance start_ptr over that text, to avoid multiple rescans of
4304  * it if the replace_text contains multiple back-references.
4305  */
4306  start_ptr += chunk_len;
4307  data_pos = pmatch[0].rm_so;
4308  }
4309 
4310  /*
4311  * Copy the replace_text, processing escapes if any are present.
4312  */
4313  if (escape_status > 0)
4315  start_ptr, data_pos);
4316  else
4318 
4319  /* Advance start_ptr and data_pos over the matched text. */
4320  start_ptr += charlen_to_bytelen(start_ptr,
4321  pmatch[0].rm_eo - data_pos);
4322  data_pos = pmatch[0].rm_eo;
4323 
4324  /*
4325  * If we only want to replace one occurrence, we're done.
4326  */
4327  if (n > 0)
4328  break;
4329 
4330  /*
4331  * Advance search position. Normally we start the next search at the
4332  * end of the previous match; but if the match was of zero length, we
4333  * have to advance by one character, or we'd just find the same match
4334  * again.
4335  */
4336  search_start = data_pos;
4337  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4338  search_start++;
4339  }
4340 
4341  /*
4342  * Copy the text to the right of the last match.
4343  */
4344  if (data_pos < data_len)
4345  {
4346  int chunk_len;
4347 
4348  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4349  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4350  }
4351 
4352  ret_text = cstring_to_text_with_len(buf.data, buf.len);
4353  pfree(buf.data);
4354  pfree(data);
4355 
4356  return ret_text;
4357 }
4358 
4359 /*
4360  * split_part
4361  * parse input string based on provided field separator
4362  * return N'th item (1 based, negative counts from end)
4363  */
4364 Datum
4366 {
4367  text *inputstring = PG_GETARG_TEXT_PP(0);
4368  text *fldsep = PG_GETARG_TEXT_PP(1);
4369  int fldnum = PG_GETARG_INT32(2);
4370  int inputstring_len;
4371  int fldsep_len;
4373  char *start_ptr;
4374  char *end_ptr;
4375  text *result_text;
4376  bool found;
4377 
4378  /* field number is 1 based */
4379  if (fldnum == 0)
4380  ereport(ERROR,
4381  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4382  errmsg("field position must not be zero")));
4383 
4384  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4385  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4386 
4387  /* return empty string for empty input string */
4388  if (inputstring_len < 1)
4390 
4391  /* handle empty field separator */
4392  if (fldsep_len < 1)
4393  {
4394  /* if first or last field, return input string, else empty string */
4395  if (fldnum == 1 || fldnum == -1)
4396  PG_RETURN_TEXT_P(inputstring);
4397  else
4399  }
4400 
4401  /* find the first field separator */
4402  text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4403 
4404  found = text_position_next(&state);
4405 
4406  /* special case if fldsep not found at all */
4407  if (!found)
4408  {
4410  /* if first or last field, return input string, else empty string */
4411  if (fldnum == 1 || fldnum == -1)
4412  PG_RETURN_TEXT_P(inputstring);
4413  else
4415  }
4416 
4417  /*
4418  * take care of a negative field number (i.e. count from the right) by
4419  * converting to a positive field number; we need total number of fields
4420  */
4421  if (fldnum < 0)
4422  {
4423  /* we found a fldsep, so there are at least two fields */
4424  int numfields = 2;
4425 
4426  while (text_position_next(&state))
4427  numfields++;
4428 
4429  /* special case of last field does not require an extra pass */
4430  if (fldnum == -1)
4431  {
4432  start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4433  end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4436  end_ptr - start_ptr));
4437  }
4438 
4439  /* else, convert fldnum to positive notation */
4440  fldnum += numfields + 1;
4441 
4442  /* if nonexistent field, return empty string */
4443  if (fldnum <= 0)
4444  {
4447  }
4448 
4449  /* reset to pointing at first match, but now with positive fldnum */
4451  found = text_position_next(&state);
4452  Assert(found);
4453  }
4454 
4455  /* identify bounds of first field */
4456  start_ptr = VARDATA_ANY(inputstring);
4457  end_ptr = text_position_get_match_ptr(&state);
4458 
4459  while (found && --fldnum > 0)
4460  {
4461  /* identify bounds of next field */
4462  start_ptr = end_ptr + fldsep_len;
4463  found = text_position_next(&state);
4464  if (found)
4465  end_ptr = text_position_get_match_ptr(&state);
4466  }
4467 
4469 
4470  if (fldnum > 0)
4471  {
4472  /* N'th field separator not found */
4473  /* if last field requested, return it, else empty string */
4474  if (fldnum == 1)
4475  {
4476  int last_len = start_ptr - VARDATA_ANY(inputstring);
4477 
4478  result_text = cstring_to_text_with_len(start_ptr,
4479  inputstring_len - last_len);
4480  }
4481  else
4482  result_text = cstring_to_text("");
4483  }
4484  else
4485  {
4486  /* non-last field requested */
4487  result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4488  }
4489 
4490  PG_RETURN_TEXT_P(result_text);
4491 }
4492 
4493 /*
4494  * Convenience function to return true when two text params are equal.
4495  */
4496 static bool
4498 {
4500  collid,
4501  PointerGetDatum(txt1),
4502  PointerGetDatum(txt2)));
4503 }
4504 
4505 /*
4506  * text_to_array
4507  * parse input string and return text array of elements,
4508  * based on provided field separator
4509  */
4510 Datum
4512 {
4513  SplitTextOutputData tstate;
4514 
4515  /* For array output, tstate should start as all zeroes */
4516  memset(&tstate, 0, sizeof(tstate));
4517 
4518  if (!split_text(fcinfo, &tstate))
4519  PG_RETURN_NULL();
4520 
4521  if (tstate.astate == NULL)
4523 
4526 }
4527 
4528 /*
4529  * text_to_array_null
4530  * parse input string and return text array of elements,
4531  * based on provided field separator and null string
4532  *
4533  * This is a separate entry point only to prevent the regression tests from
4534  * complaining about different argument sets for the same internal function.
4535  */
4536 Datum
4538 {
4539  return text_to_array(fcinfo);
4540 }
4541 
4542 /*
4543  * text_to_table
4544  * parse input string and return table of elements,
4545  * based on provided field separator
4546  */
4547 Datum
4549 {
4550  ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4551  SplitTextOutputData tstate;
4552 
4553  tstate.astate = NULL;
4555  tstate.tupstore = rsi->setResult;
4556  tstate.tupdesc = rsi->setDesc;
4557 
4558  (void) split_text(fcinfo, &tstate);
4559 
4560  return (Datum) 0;
4561 }
4562 
4563 /*
4564  * text_to_table_null
4565  * parse input string and return table of elements,
4566  * based on provided field separator and null string
4567  *
4568  * This is a separate entry point only to prevent the regression tests from
4569  * complaining about different argument sets for the same internal function.
4570  */
4571 Datum
4573 {
4574  return text_to_table(fcinfo);
4575 }
4576 
4577 /*
4578  * Common code for text_to_array, text_to_array_null, text_to_table
4579  * and text_to_table_null functions.
4580  *
4581  * These are not strict so we have to test for null inputs explicitly.
4582  * Returns false if result is to be null, else returns true.
4583  *
4584  * Note that if the result is valid but empty (zero elements), we return
4585  * without changing *tstate --- caller must handle that case, too.
4586  */
/*
 * NOTE(review): embedded line 4588, which should carry this function's name
 * and parameter list, is absent from this listing.  The body refers to
 * tstate without declaring it, so it is presumably a parameter -- restore
 * the line from the original file.
 */
4587 static bool
4589 {
4590  text *inputstring;
4591  text *fldsep;
4592  text *null_string;
4593  Oid collation = PG_GET_COLLATION();
4594  int inputstring_len;
4595  int fldsep_len;
4596  char *start_ptr;
4597  text *result_text;
4598 
4599  /* when input string is NULL, then result is NULL too */
4600  if (PG_ARGISNULL(0))
4601  return false;
4602 
4603  inputstring = PG_GETARG_TEXT_PP(0);
4604 
4605  /* fldsep can be NULL */
4606  if (!PG_ARGISNULL(1))
4607  fldsep = PG_GETARG_TEXT_PP(1);
4608  else
4609  fldsep = NULL;
4610 
4611  /* null_string can be NULL or omitted */
4612  if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4613  null_string = PG_GETARG_TEXT_PP(2);
4614  else
4615  null_string = NULL;
4616 
4617  if (fldsep != NULL)
4618  {
4619  /*
4620  * Normal case with non-null fldsep. Use the text_position machinery
4621  * to search for occurrences of fldsep.
4622  */
 /*
  * NOTE(review): embedded line 4623 is absent; "state", used below with
  * the text_position_* calls, has no visible declaration.
  */
4624 
4625  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4626  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4627 
4628  /* return empty set for empty input string */
4629  if (inputstring_len < 1)
4630  return true;
4631 
4632  /* empty field separator: return input string as a one-element set */
4633  if (fldsep_len < 1)
4634  {
4635  split_text_accum_result(tstate, inputstring,
4636  null_string, collation);
4637  return true;
4638  }
4639 
4640  text_position_setup(inputstring, fldsep, collation, &state);
4641 
 /* start_ptr always marks the first byte of the field being collected */
4642  start_ptr = VARDATA_ANY(inputstring);
4643 
4644  for (;;)
4645  {
4646  bool found;
4647  char *end_ptr;
4648  int chunk_len;
4649 
 /* NOTE(review): embedded line 4650 is absent from this listing. */
4651 
4652  found = text_position_next(&state);
4653  if (!found)
4654  {
4655  /* fetch last field */
 /* i.e. everything from start_ptr up to the end of the input datum */
4656  chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4657  end_ptr = NULL; /* not used, but some compilers complain */
4658  }
4659  else
4660  {
4661  /* fetch non-last field */
4662  end_ptr = text_position_get_match_ptr(&state);
4663  chunk_len = end_ptr - start_ptr;
4664  }
4665 
4666  /* build a temp text datum to pass to split_text_accum_result */
4667  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4668 
4669  /* stash away this field */
4670  split_text_accum_result(tstate, result_text,
4671  null_string, collation);
4672 
4673  pfree(result_text);
4674 
4675  if (!found)
4676  break;
4677 
 /* the next field begins right after the separator just matched */
4678  start_ptr = end_ptr + fldsep_len;
4679  }
4680 
 /* NOTE(review): embedded line 4681 is absent from this listing. */
4682  }
4683  else
4684  {
4685  /*
4686  * When fldsep is NULL, each character in the input string becomes a
4687  * separate element in the result set. The separator is effectively
4688  * the space between characters.
4689  */
4690  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4691 
4692  start_ptr = VARDATA_ANY(inputstring);
4693 
 /* inputstring_len counts the bytes not yet emitted */
4694  while (inputstring_len > 0)
4695  {
4696  int chunk_len = pg_mblen(start_ptr);
4697 
 /* NOTE(review): embedded line 4698 is absent from this listing. */
4699 
4700  /* build a temp text datum to pass to split_text_accum_result */
4701  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4702 
4703  /* stash away this field */
4704  split_text_accum_result(tstate, result_text,
4705  null_string, collation);
4706 
4707  pfree(result_text);
4708 
4709  start_ptr += chunk_len;
4710  inputstring_len -= chunk_len;
4711  }
4712  }
4713 
4714  return true;
4715 }
4716 
4717 /*
4718  * Add text item to result set (table or array).
4719  *
4720  * This is also responsible for checking to see if the item matches
4721  * the null_string, in which case we should emit NULL instead.
 *
 * The item goes to the tuplestore when tstate->tupstore is set; otherwise
 * it is accumulated into tstate->astate.  When null_string is NULL no
 * comparison is made, so the item is never turned into a NULL.
 *
 * NOTE(review): embedded lines 4724, 4742 and 4753 are absent from this
 * listing (the function-name line, the start of the call that receives
 * tstate->tupdesc/values/nulls, and the last accumArrayResult argument).
 * Restore them from the original file.
4722  */
4723 static void
4725  text *field_value,
4726  text *null_string,
4727  Oid collation)
4728 {
4729  bool is_null = false;
4730 
4731  if (null_string && text_isequal(field_value, null_string, collation))
4732  is_null = true;
4733 
4734  if (tstate->tupstore)
4735  {
 /* single-column row: the field value plus its null flag */
4736  Datum values[1];
4737  bool nulls[1];
4738 
4739  values[0] = PointerGetDatum(field_value);
4740  nulls[0] = is_null;
4741 
4743  tstate->tupdesc,
4744  values,
4745  nulls);
4746  }
4747  else
4748  {
4749  tstate->astate = accumArrayResult(tstate->astate,
4750  PointerGetDatum(field_value),
4751  is_null,
4752  TEXTOID,
4754  }
4755 }
4756 
4757 /*
4758  * array_to_text
4759  * concatenate Cstring representation of input array elements
4760  * using provided field separator
 *
 * Passes NULL as the null string to array_to_text_internal, which makes
 * that routine skip null elements.
 *
 * NOTE(review): embedded lines 4763 (function-name line) and 4765 (the
 * declaration of "v") are absent from this listing.
4761  */
4762 Datum
4764 {
4766  char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4767 
4768  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4769 }
4770 
4771 /*
4772  * array_to_text_null
4773  * concatenate Cstring representation of input array elements
4774  * using provided field separator and null string
4775  *
4776  * This version is not strict so we have to test for null inputs explicitly.
 *
 * The result is NULL when the array or the separator is NULL.  A NULL third
 * argument is not an error: it is forwarded as a null pointer, which makes
 * array_to_text_internal skip null elements.
 *
 * NOTE(review): embedded line 4779 (the function-name line) is absent from
 * this listing.
4777  */
4778 Datum
4780 {
4781  ArrayType *v;
4782  char *fldsep;
4783  char *null_string;
4784 
4785  /* returns NULL when first or second parameter is NULL */
4786  if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4787  PG_RETURN_NULL();
4788 
4789  v = PG_GETARG_ARRAYTYPE_P(0);
4790  fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4791 
4792  /* NULL null string is passed through as a null pointer */
4793  if (!PG_ARGISNULL(2))
4794  null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4795  else
4796  null_string = NULL;
4797 
4798  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4799 }
4800 
4801 /*
4802  * common code for array_to_text and array_to_text_null functions
 *
 * Walks the elements of "v" in storage order, converts each non-null one to
 * a C string with the element type's output function, and joins the pieces
 * with "fldsep".  Null elements are rendered as "null_string", or skipped
 * entirely when null_string is NULL.  An array with no elements yields an
 * empty text value.
 *
 * The element type's I/O data is cached in fcinfo->flinfo->fn_extra and is
 * looked up again only when the element type changes.
 *
 * NOTE(review): embedded lines 4805 (function-name line), 4816 (the
 * declaration of "buf") and 4897 (the statement belonging to the "else" at
 * 4896) are absent from this listing.
4803  */
4804 static text *
4806  const char *fldsep, const char *null_string)
4807 {
4808  text *result;
4809  int nitems,
4810  *dims,
4811  ndims;
4812  Oid element_type;
4813  int typlen;
4814  bool typbyval;
4815  char typalign;
4817  bool printed = false;
4818  char *p;
4819  bits8 *bitmap;
4820  int bitmask;
4821  int i;
4822  ArrayMetaState *my_extra;
4823 
4824  ndims = ARR_NDIM(v);
4825  dims = ARR_DIMS(v);
4826  nitems = ArrayGetNItems(ndims, dims);
4827 
4828  /* if there are no elements, return an empty string */
4829  if (nitems == 0)
4830  return cstring_to_text_with_len("", 0);
4831 
4832  element_type = ARR_ELEMTYPE(v);
4833  initStringInfo(&buf);
4834 
4835  /*
4836  * We arrange to look up info about element type, including its output
4837  * conversion proc, only once per series of calls, assuming the element
4838  * type doesn't change underneath us.
4839  */
4840  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4841  if (my_extra == NULL)
4842  {
4843  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4844  sizeof(ArrayMetaState));
4845  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
 /* store a value that cannot equal element_type, forcing the lookup below */
4846  my_extra->element_type = ~element_type;
4847  }
4848 
4849  if (my_extra->element_type != element_type)
4850  {
4851  /*
4852  * Get info about element type, including its output conversion proc
4853  */
4854  get_type_io_data(element_type, IOFunc_output,
4855  &my_extra->typlen, &my_extra->typbyval,
4856  &my_extra->typalign, &my_extra->typdelim,
4857  &my_extra->typioparam, &my_extra->typiofunc);
4858  fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4859  fcinfo->flinfo->fn_mcxt);
4860  my_extra->element_type = element_type;
4861  }
4862  typlen = my_extra->typlen;
4863  typbyval = my_extra->typbyval;
4864  typalign = my_extra->typalign;
4865 
 /* p walks the element data; bitmap/bitmask walk the null bitmap, if any */
4866  p = ARR_DATA_PTR(v);
4867  bitmap = ARR_NULLBITMAP(v);
4868  bitmask = 1;
4869 
4870  for (i = 0; i < nitems; i++)
4871  {
4872  Datum itemvalue;
4873  char *value;
4874 
4875  /* Get source element, checking for NULL */
4876  if (bitmap && (*bitmap & bitmask) == 0)
4877  {
4878  /* if null_string is NULL, we just ignore null elements */
4879  if (null_string != NULL)
4880  {
 /* "printed" tells whether a separator is needed before this item */
4881  if (printed)
4882  appendStringInfo(&buf, "%s%s", fldsep, null_string);
4883  else
4884  appendStringInfoString(&buf, null_string);
4885  printed = true;
4886  }
 /* a null element occupies no data space, so p is not advanced here */
4887  }
4888  else
4889  {
4890  itemvalue = fetch_att(p, typbyval, typlen);
4891 
4892  value = OutputFunctionCall(&my_extra->proc, itemvalue);
4893 
4894  if (printed)
4895  appendStringInfo(&buf, "%s%s", fldsep, value);
4896  else
4898  printed = true;
4899 
 /* step over this element, then realign for the next one */
4900  p = att_addlength_pointer(p, typlen, p);
4901  p = (char *) att_align_nominal(p, typalign);
4902  }
4903 
4904  /* advance bitmap pointer if any */
4905  if (bitmap)
4906  {
4907  bitmask <<= 1;
 /* all eight bits of this bitmap byte consumed: move to the next byte */
4908  if (bitmask == 0x100)
4909  {
4910  bitmap++;
4911  bitmask = 1;
4912  }
4913  }
4914  }
4915 
4916  result = cstring_to_text_with_len(buf.data, buf.len);
4917  pfree(buf.data);
4918 
4919  return result;
4920 }
4921 
4922 /*
4923  * Workhorse for to_bin, to_oct, and to_hex. Note that base must be > 1 and <=
4924  * 16.
4925  */
4926 static inline text *
4927 convert_to_base(uint64 value, int base)
4928 {
4929  const char *digits = "0123456789abcdef";
4930 
4931  /* We size the buffer for to_bin's longest possible return value. */
4932  char buf[sizeof(uint64) * BITS_PER_BYTE];
4933  char *const end = buf + sizeof(buf);
4934  char *ptr = end;
4935 
4936  Assert(base > 1);
4937  Assert(base <= 16);
4938 
4939  do
4940  {
4941  *--ptr = digits[value % base];
4942  value /= base;
4943  } while (ptr > buf && value);
4944 
4945  return cstring_to_text_with_len(ptr, end - ptr);
4946 }
4947 
4948 /*
4949  * Convert an integer to a string containing a base-2 (binary) representation
4950  * of the number.
 *
 * The int32 argument is cast to uint32 before being widened to uint64, so a
 * negative input is not sign-extended into the upper 32 bits.
 *
 * NOTE(review): embedded lines 4953 (function-name line) and 4957 (the
 * statement producing the result, presumably a convert_to_base call) are
 * absent from this listing.
4951  */
4952 Datum
4954 {
4955  uint64 value = (uint32) PG_GETARG_INT32(0);
4956 
4958 }
/*
 * 64-bit counterpart of the function above: the int64 argument is
 * reinterpreted as uint64.
 *
 * NOTE(review): embedded lines 4960 (function-name line) and 4964 (the
 * statement producing the result) are absent from this listing.
 */
4959 Datum
4961 {
4962  uint64 value = (uint64) PG_GETARG_INT64(0);
4963 
4965 }
4966 
4967 /*
4968  * Convert an integer to a string containing a base-8 (oct) representation of
4969  * the number.
 *
 * The int32 argument is cast to uint32 before being widened to uint64, so a
 * negative input is not sign-extended into the upper 32 bits.
 *
 * NOTE(review): embedded lines 4972 (function-name line) and 4976 (the
 * statement producing the result, presumably a convert_to_base call) are
 * absent from this listing.
4970  */
4971 Datum
4973 {
4974  uint64 value = (uint32) PG_GETARG_INT32(0);
4975 
4977 }
/*
 * 64-bit counterpart of the function above: the int64 argument is
 * reinterpreted as uint64.
 *
 * NOTE(review): embedded lines 4979 (function-name line) and 4983 (the
 * statement producing the result) are absent from this listing.
 */
4978 Datum
4980 {
4981  uint64 value = (uint64) PG_GETARG_INT64(0);
4982 
4984 }
4985 
4986 /*
4987  * Convert an integer to a string containing a base-16 (hex) representation of
4988  * the number.
 *
 * The int32 argument is cast to uint32 before being widened to uint64, so a
 * negative input is not sign-extended into the upper 32 bits.
 *
 * NOTE(review): embedded lines 4991 (function-name line) and 4995 (the
 * statement producing the result, presumably a convert_to_base call) are
 * absent from this listing.
4989  */
4990 Datum
4992 {
4993  uint64 value = (uint32) PG_GETARG_INT32(0);
4994 
4996 }
/*
 * 64-bit counterpart of the function above: the int64 argument is
 * reinterpreted as uint64.
 *
 * NOTE(review): embedded lines 4998 (function-name line) and 5002 (the
 * statement producing the result) are absent from this listing.
 */
4997 Datum
4999 {
5000  uint64 value = (uint64) PG_GETARG_INT64(0);
5001 
5003 }
5004 
5005 /*
5006  * Return the size of a datum, possibly compressed
5007  *
5008  * Works on any data type
 *
 * The argument type's typlen is looked up once and cached in fn_extra.  The
 * result is then toast_datum_size() for typlen -1, strlen()+1 for typlen -2
 * (cstring), and typlen itself for any other (fixed-width) type.
 *
 * NOTE(review): embedded lines 5011 (function-name line) and 5013 (the
 * declaration of "value") are absent from this listing.
5009  */
5010 Datum
5012 {
5014  int32 result;
5015  int typlen;
5016 
5017  /* On first call, get the input type's typlen, and save at *fn_extra */
5018  if (fcinfo->flinfo->fn_extra == NULL)
5019  {
5020  /* Lookup the datatype of the supplied argument */
5021  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5022 
5023  typlen = get_typlen(argtypeid);
5024  if (typlen == 0) /* should not happen */
5025  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5026 
 /* allocate in fn_mcxt so the cached value outlives this call */
5027  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5028  sizeof(int));
5029  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5030  }
5031  else
5032  typlen = *((int *) fcinfo->flinfo->fn_extra);
5033 
5034  if (typlen == -1)
5035  {
5036  /* varlena type, possibly toasted */
5037  result = toast_datum_size(value);
5038  }
5039  else if (typlen == -2)
5040  {
5041  /* cstring */
 /* the +1 counts the terminating NUL byte */
5042  result = strlen(DatumGetCString(value)) + 1;
5043  }
5044  else
5045  {
5046  /* ordinary fixed-width type */
5047  result = typlen;
5048  }
5049 
5050  PG_RETURN_INT32(result);
5051 }
5052 
5053 /*
5054  * Return the compression method stored in the compressed attribute. Return
5055  * NULL for non varlena type or uncompressed data.
 *
 * The argument type's typlen is looked up once and cached in fn_extra; any
 * typlen other than -1 produces NULL without inspecting the datum.
 *
 * NOTE(review): embedded lines 5058 (function-name line), 5086 (the rest of
 * the toast_get_compression_id call), 5093 and 5096 (the two case labels of
 * the switch) and 5103 (the statement returning "result") are absent from
 * this listing.
5056  */
5057 Datum
5059 {
5060  int typlen;
5061  char *result;
5062  ToastCompressionId cmid;
5063 
5064  /* On first call, get the input type's typlen, and save at *fn_extra */
5065  if (fcinfo->flinfo->fn_extra == NULL)
5066  {
5067  /* Lookup the datatype of the supplied argument */
5068  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5069 
5070  typlen = get_typlen(argtypeid);
5071  if (typlen == 0) /* should not happen */
5072  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5073 
5074  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5075  sizeof(int));
5076  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5077  }
5078  else
5079  typlen = *((int *) fcinfo->flinfo->fn_extra);
5080 
 /* only typlen -1 (varlena) values are examined further */
5081  if (typlen != -1)
5082  PG_RETURN_NULL();
5083 
5084  /* get the compression method id stored in the compressed varlena */
5085  cmid = toast_get_compression_id((struct varlena *)
5087  if (cmid == TOAST_INVALID_COMPRESSION_ID)
5088  PG_RETURN_NULL();
5089 
5090  /* convert compression method id to compression method name */
5091  switch (cmid)
5092  {
5094  result = "pglz";
5095  break;
5097  result = "lz4";
5098  break;
5099  default:
5100  elog(ERROR, "invalid compression method id %d", cmid);
5101  }
5102 
5104 }
5105 
5106 /*
5107  * string_agg - Concatenates values and returns string.
5108  *
5109  * Syntax: string_agg(value text, delimiter text) RETURNS text
5110  *
5111  * Note: Any NULL values are ignored. The first-call delimiter isn't
5112  * actually used at all, and on subsequent calls the delimiter precedes
5113  * the associated value.
5114  */
5115 
5116 /* subroutine to initialize state */
/*
 * Returns a new, empty StringInfo created by makeStringInfo() while the
 * aggregate's memory context is current.  Raises an ERROR when
 * AggCheckCallContext() reports that the call is not made in an aggregate
 * context.
 *
 * NOTE(review): embedded line 5118 (the function-name line) is absent from
 * this listing; the body uses "fcinfo", presumably its parameter.
 */
5117 static StringInfo
5119 {
5120  StringInfo state;
5121  MemoryContext aggcontext;
5122  MemoryContext oldcontext;
5123 
5124  if (!AggCheckCallContext(fcinfo, &aggcontext))
5125  {
5126  /* cannot be called directly because of internal-type argument */
5127  elog(ERROR, "string_agg_transfn called in non-aggregate context");
5128  }
5129 
5130  /*
5131  * Create state in aggregate context. It'll stay there across subsequent
5132  * calls.
5133  */
5134  oldcontext = MemoryContextSwitchTo(aggcontext);
5135  state = makeStringInfo();
5136  MemoryContextSwitchTo(oldcontext);
5137 
5138  return state;
5139 }
5140 
/*
 * Transition step: argument 0 is the running state (NULL until the first
 * non-null value is seen), argument 1 the value and argument 2 the
 * delimiter.  A NULL value leaves the state untouched; a NULL delimiter
 * appends nothing before the value.
 *
 * NOTE(review): embedded lines 5142 (function-name line), 5151 (first
 * declaration inside the "value is not null" block), 5180 (the statement
 * after the delimiter handling, presumably the one appending the value) and
 * 5188 (the statement controlled by "if (state)") are absent from this
 * listing.
 */
5141 Datum
5143 {
5144  StringInfo state;
5145 
5146  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5147 
5148  /* Append the value unless null, preceding it with the delimiter. */
5149  if (!PG_ARGISNULL(1))
5150  {
5152  bool isfirst = false;
5153 
5154  /*
5155  * You might think we can just throw away the first delimiter, however
5156  * we must keep it as we may be a parallel worker doing partial
5157  * aggregation building a state to send to the main process. We need
5158  * to keep the delimiter of every aggregation so that the combine
5159  * function can properly join up the strings of two separately
5160  * partially aggregated results. The first delimiter is only stripped
5161  * off in the final function. To know how much to strip off the front
5162  * of the string, we store the length of the first delimiter in the
5163  * StringInfo's cursor field, which we don't otherwise need here.
5164  */
5165  if (state == NULL)
5166  {
5167  state = makeStringAggState(fcinfo);
5168  isfirst = true;
5169  }
5170 
5171  if (!PG_ARGISNULL(2))
5172  {
5173  text *delim = PG_GETARG_TEXT_PP(2);
5174 
5175  appendStringInfoText(state, delim);
 /* remember how many leading bytes belong to the first delimiter */
5176  if (isfirst)
5177  state->cursor = VARSIZE_ANY_EXHDR(delim);
5178  }
5179 
5181  }
5182 
5183  /*
5184  * The transition type for string_agg() is declared to be "internal",
5185  * which is a pass-by-value type the same size as a pointer.
5186  */
5187  if (state)
5189  PG_RETURN_NULL();
5190 }
5191 
5192 /*
5193  * string_agg_combine
5194  * Aggregate combine function for string_agg(text) and string_agg(bytea)
 *
 * Merges state2 into state1 and returns state1:
 *  - state2 NULL: state1 is returned as is (NULL if it is NULL too);
 *  - state1 NULL: a fresh state is built holding a copy of state2's data
 *    and cursor;
 *  - otherwise: state2's bytes are appended to state1 and state1's cursor
 *    is left alone.
 *
 * NOTE(review): embedded line 5197 (the function-name line) is absent from
 * this listing.
5195  */
5196 Datum
5198 {
5199  StringInfo state1;
5200  StringInfo state2;
5201  MemoryContext agg_context;
5202 
5203  if (!AggCheckCallContext(fcinfo, &agg_context))
5204  elog(ERROR, "aggregate function called in non-aggregate context");
5205 
5206  state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5207  state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1);
5208 
5209  if (state2 == NULL)
5210  {
5211  /*
5212  * NULL state2 is easy, just return state1, which we know is already
5213  * in the agg_context
5214  */
5215  if (state1 == NULL)
5216  PG_RETURN_NULL();
5217  PG_RETURN_POINTER(state1);
5218  }
5219 
5220  if (state1 == NULL)
5221  {
5222  /* We must copy state2's data into the agg_context */
5223  MemoryContext old_context;
5224 
5225  old_context = MemoryContextSwitchTo(agg_context);
5226  state1 = makeStringAggState(fcinfo);
5227  appendBinaryStringInfo(state1, state2->data, state2->len);
 /* carry over the length of the leading delimiter recorded in state2 */
5228  state1->cursor = state2->cursor;
5229  MemoryContextSwitchTo(old_context);
5230  }
5231  else if (state2->len > 0)
5232  {
5233  /* Combine ... state1->cursor does not change in this case */
5234  appendBinaryStringInfo(state1, state2->data, state2->len);
5235  }
5236 
5237  PG_RETURN_POINTER(state1);
5238 }
5239 
5240 /*
5241  * string_agg_serialize
5242  * Aggregate serialize function for string_agg(text) and string_agg(bytea)
5243  *
5244  * This is strict, so we need not handle NULL input
 *
 * Serialized layout, as written below: the cursor as a 4-byte integer,
 * followed by the state's data bytes.
 *
 * NOTE(review): embedded lines 5247 (function-name line), 5250 (the
 * declaration of "buf") and 5256 (the statement that assigns "state") are
 * absent from this listing.
5245  */
5246 Datum
5248 {
5249  StringInfo state;
5251  bytea *result;
5252 
5253  /* cannot be called directly because of internal-type argument */
5254  Assert(AggCheckCallContext(fcinfo, NULL));
5255 
5257 
5258  pq_begintypsend(&buf);
5259 
5260  /* cursor */
5261  pq_sendint(&buf, state->cursor, 4);
5262 
5263  /* data */
5264  pq_sendbytes(&buf, state->data, state->len);
5265 
5266  result = pq_endtypsend(&buf);
5267 
5268  PG_RETURN_BYTEA_P(result);
5269 }
5270 
5271 /*
5272  * string_agg_deserialize
5273  * Aggregate deserial function for string_agg(text) and string_agg(bytea)
5274  *
5275  * This is strict, so we need not handle NULL input
 *
 * Reads a 4-byte cursor followed by the data bytes, and rebuilds a state
 * with makeStringAggState().
 *
 * NOTE(review): embedded lines 5278 (function-name line), 5282 (the
 * declaration of "buf") and 5296 (the start of the call whose arguments
 * appear on 5297) are absent from this listing.
5276  */
5277 Datum
5279 {
5280  bytea *sstate;
5281  StringInfo result;
5283  char *data;
5284  int datalen;
5285 
5286  /* cannot be called directly because of internal-type argument */
5287  Assert(AggCheckCallContext(fcinfo, NULL));
5288 
5289  sstate = PG_GETARG_BYTEA_PP(0);
5290 
5291  /*
5292  * Copy the bytea into a StringInfo so that we can "receive" it using the
5293  * standard recv-function infrastructure.
5294  */
5295  initStringInfo(&buf);
5297  VARDATA_ANY(sstate), VARSIZE_ANY_EXHDR(sstate));
5298 
5299  result = makeStringAggState(fcinfo);
5300 
5301  /* cursor */
5302  result->cursor = pq_getmsgint(&buf, 4);
5303 
5304  /* data */
 /* whatever follows the 4-byte cursor is the payload */
5305  datalen = VARSIZE_ANY_EXHDR(sstate) - 4;
5306  data = (char *) pq_getmsgbytes(&buf, datalen);
5307  appendBinaryStringInfo(result, data, datalen);
5308 
5309  pq_getmsgend(&buf);
5310  pfree(buf.data);
5311 
5312  PG_RETURN_POINTER(result);
5313 }
5314 
/*
 * Final step: returns NULL when there is no state; otherwise returns the
 * accumulated data with the first state->cursor bytes stripped off.
 *
 * NOTE(review): embedded lines 5316 (function-name line) and 5328 (the start
 * of the return expression whose tail appears on 5329) are absent from this
 * listing.
 */
5315 Datum
5317 {
5318  StringInfo state;
5319 
5320  /* cannot be called directly because of internal-type argument */
5321  Assert(AggCheckCallContext(fcinfo, NULL));
5322 
5323  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5324 
5325  if (state != NULL)
5326  {
5327  /* As per comment in transfn, strip data before the cursor position */
5329  state->len - state->cursor));
5330  }
5331  else
5332  PG_RETURN_NULL();
5333 }
5334 
5335 /*
5336  * Prepare cache with fmgr info for the output functions of the datatypes of
5337  * the arguments of a concat-like function, beginning with argument "argidx".
5338  * (Arguments before that will have corresponding slots in the resulting
5339  * FmgrInfo array, but we don't fill those slots.)
 *
 * The array has PG_NARGS() entries, is allocated in fn_mcxt, and is stored
 * in fcinfo->flinfo->fn_extra as well as returned.  Raises an ERROR if the
 * type of any argument from argidx onward cannot be determined.
 *
 * NOTE(review): embedded line 5342 (the function-name line) is absent from
 * this listing; the body uses "fcinfo" and "argidx", presumably its
 * parameters.
5340  */
5341 static FmgrInfo *
5343 {
5344  FmgrInfo *foutcache;
5345  int i;
5346 
5347  /* We keep the info in fn_mcxt so it survives across calls */
5348  foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5349  PG_NARGS() * sizeof(FmgrInfo));
5350 
5351  for (i = argidx; i < PG_NARGS(); i++)
5352  {
5353  Oid valtype;
5354  Oid typOutput;
5355  bool typIsVarlena;
5356 
5357  valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5358  if (!OidIsValid(valtype))
5359  elog(ERROR, "could not determine data type of concat() input");
5360 
 /* typIsVarlena is filled in by the call but not used afterwards */
5361  getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5362  fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5363  }
5364 
5365  fcinfo->flinfo->fn_extra = foutcache;
5366 
5367  return foutcache;
5368 }
5369 
5370 /*
5371  * Implementation of both concat() and concat_ws().
5372  *
5373  * sepstr is the separator string to place between values.
5374  * argidx identifies the first argument to concatenate (counting from zero);
5375  * note that this must be constant across any one series of calls.
5376  *
5377  * Returns NULL if result should be NULL, else text value.
 *
 * NOTE(review): embedded lines 5384 (the declaration of "str"), 5412 (the
 * Assert announced by the comment ending on 5411), 5436 (first line inside
 * the "argument is not null" block, where "value" is presumably declared)
 * and 5445 (the start of the call whose last argument appears on 5446) are
 * absent from this listing.
5378  */
5379 static text *
5380 concat_internal(const char *sepstr, int argidx,
5381  FunctionCallInfo fcinfo)
5382 {
5383  text *result;
5385  FmgrInfo *foutcache;
5386  bool first_arg = true;
5387  int i;
5388 
5389  /*
5390  * concat(VARIADIC some-array) is essentially equivalent to
5391  * array_to_text(), ie concat the array elements with the given separator.
5392  * So we just pass the case off to that code.
5393  */
5394  if (get_fn_expr_variadic(fcinfo->flinfo))
5395  {
5396  ArrayType *arr;
5397 
5398  /* Should have just the one argument */
5399  Assert(argidx == PG_NARGS() - 1);
5400 
5401  /* concat(VARIADIC NULL) is defined as NULL */
5402  if (PG_ARGISNULL(argidx))
5403  return NULL;
5404 
5405  /*
5406  * Non-null argument had better be an array. We assume that any call
5407  * context that could let get_fn_expr_variadic return true will have
5408  * checked that a VARIADIC-labeled parameter actually is an array. So
5409  * it should be okay to just Assert that it's an array rather than
5410  * doing a full-fledged error check.
5411  */
5413 
5414  /* OK, safe to fetch the array value */
5415  arr = PG_GETARG_ARRAYTYPE_P(argidx);
5416 
5417  /*
5418  * And serialize the array. We tell array_to_text to ignore null
5419  * elements, which matches the behavior of the loop below.
5420  */
5421  return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5422  }
5423 
5424  /* Normal case without explicit VARIADIC marker */
5425  initStringInfo(&str);
5426 
5427  /* Get output function info, building it if first time through */
5428  foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5429  if (foutcache == NULL)
5430  foutcache = build_concat_foutcache(fcinfo, argidx);
5431 
5432  for (i = argidx; i < PG_NARGS(); i++)
5433  {
5434  if (!PG_ARGISNULL(i))
5435  {
5437 
5438  /* add separator if appropriate */
 /* i.e. before every emitted value except the first one */
5439  if (first_arg)
5440  first_arg = false;
5441  else
5442  appendStringInfoString(&str, sepstr);
5443 
5444  /* call the appropriate type output function, append the result */
5446  OutputFunctionCall(&foutcache[i], value));
5447  }
5448  }
5449 
5450  result = cstring_to_text_with_len(str.data, str.len);
5451  pfree(str.data);
5452 
5453  return result;
5454 }
5455 
5456 /*
5457  * Concatenate all arguments. NULL arguments are ignored.
 *
 * Delegates to concat_internal() with an empty separator, starting at
 * argument 0; a NULL from concat_internal() becomes a SQL NULL result.
 *
 * NOTE(review): embedded line 5460 (the function-name line) is absent from
 * this listing.
5458  */
5459 Datum
5461 {
5462  text *result;
5463 
5464  result = concat_internal("", 0, fcinfo);
5465  if (result == NULL)
5466  PG_RETURN_NULL();
5467  PG_RETURN_TEXT_P(result);
5468 }
5469 
5470 /*
5471  * Concatenate all but first argument value with separators. The first
5472  * parameter is used as the separator. NULL arguments are ignored.
 *
 * A NULL separator yields a NULL result.  Concatenation starts at
 * argument 1 via concat_internal().
 *
 * NOTE(review): embedded lines 5475 (function-name line) and 5483 (the
 * statement that assigns "sep") are absent from this listing.
5473  */
5474 Datum
5476 {
5477  char *sep;
5478  text *result;
5479 
5480  /* return NULL when separator is NULL */
5481  if (PG_ARGISNULL(0))
5482  PG_RETURN_NULL();
5484 
5485  result = concat_internal(sep, 1, fcinfo);
5486  if (result == NULL)
5487  PG_RETURN_NULL();
5488  PG_RETURN_TEXT_P(result);
5489 }
5490 
5491 /*
5492  * Return first n characters in the string. When n is negative,
5493  * return all but last |n| characters.
 *
 * NOTE(review): embedded lines 5496 (function-name line), 5509 (the return
 * statement of the negative-n branch) and 5512 (the statement of the "else"
 * branch) are absent from this listing.
5494  */
5495 Datum
5497 {
5498  int n = PG_GETARG_INT32(1);
5499 
5500  if (n < 0)
5501  {
5502  text *str = PG_GETARG_TEXT_PP(0);
5503  const char *p = VARDATA_ANY(str);
5504  int len = VARSIZE_ANY_EXHDR(str);
5505  int rlen;
5506 
 /* turn "all but the last |n|" into a count of leading characters to keep */
5507  n = pg_mbstrlen_with_len(p, len) + n;
 /* presumably the byte length of those n characters -- verify pg_mbcharcliplen */
5508  rlen = pg_mbcharcliplen(p, len, n);
5510  }
5511  else
5513 }
5514 
5515 /*
5516  * Return last n characters in the string. When n is negative,
5517  * return all but first |n| characters.
 *
 * In both cases n is converted into the number of leading characters to
 * skip, and "off" is then derived from it with pg_mbcharcliplen().
 *
 * NOTE(review): embedded lines 5520 (function-name line) and 5534 (the
 * return statement) are absent from this listing.
5518  */
5519 Datum
5521 {
5522  text *str = PG_GETARG_TEXT_PP(0);
5523  const char *p = VARDATA_ANY(str);
5524  int len = VARSIZE_ANY_EXHDR(str);
5525  int n = PG_GETARG_INT32(1);
5526  int off;
5527 
 /*
  * NOTE(review): "-n" overflows when n is INT_MIN, which is undefined
  * behavior for a signed int in C -- confirm how the build handles it.
  */
5528  if (n < 0)
5529  n = -n;
5530  else
5531  n = pg_mbstrlen_with_len(p, len) - n;
5532  off = pg_mbcharcliplen(p, len, n);
5533 
5535 }
5536 
5537 /*
5538  * Return reversed string
 *
 * The result has the same byte length as the input.  It is filled from the
 * end towards the start while the input is read front to back.
 *
 * NOTE(review): embedded lines 5541 (function-name line) and 5554 (the
 * condition selecting the multibyte branch) are absent from this listing.
5539  */
5540 Datum
5542 {
5543  text *str = PG_GETARG_TEXT_PP(0);
5544  const char *p = VARDATA_ANY(str);
5545  int len = VARSIZE_ANY_EXHDR(str);
5546  const char *endp = p + len;
5547  text *result;
5548  char *dst;
5549 
5550  result = palloc(len + VARHDRSZ);
 /* dst starts one past the last payload byte and moves backwards */
5551  dst = (char *) VARDATA(result) + len;
5552  SET_VARSIZE(result, len + VARHDRSZ);
5553 
5555  {
5556  /* multibyte version */
5557  while (p < endp)
5558  {
5559  int sz;
5560 
 /* copy each character as a unit so its bytes keep their order */
5561  sz = pg_mblen(p);
5562  dst -= sz;
5563  memcpy(dst, p, sz);
5564  p += sz;
5565  }
5566  }
5567  else
5568  {
5569  /* single byte version */
5570  while (p < endp)
5571  *(--dst) = *p++;
5572  }
5573 
5574  PG_RETURN_TEXT_P(result);
5575 }
5576 
5577 
/*
 * Helper definitions used by text_format()
 */

/* Flag bit: was the minus flag present in the format specifier? */
#define TEXT_FORMAT_FLAG_MINUS	0x0001

/*
 * Step "ptr" forward by one character and raise an ERROR if that lands on
 * or past "end_ptr", i.e. the format string ended in mid-specifier.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if ((end_ptr) <= ++(ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5591 
5592 /*
5593  * Returns a formatted string
 *
 * Argument 0 is the format string; a NULL format yields NULL.  The values to
 * be formatted are either the remaining call arguments or, when the call is
 * marked VARIADIC, the elements of the array passed as argument 1 (a NULL
 * array is treated as having no elements).  Only the conversion characters
 * 's', 'I' and 'L' are accepted; "%%" emits a single '%'.
 *
 * NOTE(review): embedded lines 5596 (function-name line), 5599 (the
 * declaration of "str"), 5643 (the Assert announced by the comment ending
 * on 5642), 5692 and 5701 (the statements that copy a character to the
 * output), and 5739 and 5796 (the assignments to "value" in the two
 * non-variadic branches) are absent from this listing.
5594  */
5595 Datum
5597 {
5598  text *fmt;
5600  const char *cp;
5601  const char *start_ptr;
5602  const char *end_ptr;
5603  text *result;
5604  int arg;
5605  bool funcvariadic;
5606  int nargs;
5607  Datum *elements = NULL;
5608  bool *nulls = NULL;
5609  Oid element_type = InvalidOid;
5610  Oid prev_type = InvalidOid;
5611  Oid prev_width_type = InvalidOid;
5612  FmgrInfo typoutputfinfo;
5613  FmgrInfo typoutputinfo_width;
5614 
5615  /* When format string is null, immediately return null */
5616  if (PG_ARGISNULL(0))
5617  PG_RETURN_NULL();
5618 
5619  /* If argument is marked VARIADIC, expand array into elements */
5620  if (get_fn_expr_variadic(fcinfo->flinfo))
5621  {
5622  ArrayType *arr;
5623  int16 elmlen;
5624  bool elmbyval;
5625  char elmalign;
5626  int nitems;
5627 
5628  /* Should have just the one argument */
5629  Assert(PG_NARGS() == 2);
5630 
5631  /* If argument is NULL, we treat it as zero-length array */
5632  if (PG_ARGISNULL(1))
5633  nitems = 0;
5634  else
5635  {
5636  /*
5637  * Non-null argument had better be an array. We assume that any
5638  * call context that could let get_fn_expr_variadic return true
5639  * will have checked that a VARIADIC-labeled parameter actually is
5640  * an array. So it should be okay to just Assert that it's an
5641  * array rather than doing a full-fledged error check.
5642  */
5644 
5645  /* OK, safe to fetch the array value */
5646  arr = PG_GETARG_ARRAYTYPE_P(1);
5647 
5648  /* Get info about array element type */
5649  element_type = ARR_ELEMTYPE(arr);
5650  get_typlenbyvalalign(element_type,
5651  &elmlen, &elmbyval, &elmalign);
5652 
5653  /* Extract all array elements */
5654  deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5655  &elements, &nulls, &nitems);
5656  }
5657 
 /*
  * Count the format string as position 0, so that "arg" has the same
  * meaning in both cases; element k of the array is argument k + 1.
  */
5658  nargs = nitems + 1;
5659  funcvariadic = true;
5660  }
5661  else
5662  {
5663  /* Non-variadic case, we'll process the arguments individually */
5664  nargs = PG_NARGS();
5665  funcvariadic = false;
5666  }
5667 
5668  /* Setup for main loop. */
5669  fmt = PG_GETARG_TEXT_PP(0);
5670  start_ptr = VARDATA_ANY(fmt);
5671  end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5672  initStringInfo(&str);
5673  arg = 1; /* next argument position to print */
5674 
5675  /* Scan format string, looking for conversion specifiers. */
5676  for (cp = start_ptr; cp < end_ptr; cp++)
5677  {
5678  int argpos;
5679  int widthpos;
5680  int flags;
5681  int width;
5682  Datum value;
5683  bool isNull;
5684  Oid typid;
5685 
5686  /*
5687  * If it's not the start of a conversion specifier, just copy it to
5688  * the output buffer.
5689  */
5690  if (*cp != '%')
5691  {
5693  continue;
5694  }
5695 
5696  ADVANCE_PARSE_POINTER(cp, end_ptr);
5697 
5698  /* Easy case: %% outputs a single % */
5699  if (*cp == '%')
5700  {
5702  continue;
5703  }
5704 
5705  /* Parse the optional portions of the format specifier */
5706  cp = text_format_parse_format(cp, end_ptr,
5707  &argpos, &widthpos,
5708  &flags, &width);
5709 
5710  /*
5711  * Next we should see the main conversion specifier. Whether or not
5712  * an argument position was present, it's known that at least one
5713  * character remains in the string at this point. Experience suggests
5714  * that it's worth checking that that character is one of the expected
5715  * ones before we try to fetch arguments, so as to produce the least
5716  * confusing response to a mis-formatted specifier.
5717  */
5718  if (strchr("sIL", *cp) == NULL)
5719  ereport(ERROR,
5720  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5721  errmsg("unrecognized format() type specifier \"%.*s\"",
5722  pg_mblen(cp), cp),
5723  errhint("For a single \"%%\" use \"%%%%\".")));
5724 
5725  /* If indirect width was specified, get its value */
5726  if (widthpos >= 0)
5727  {
5728  /* Collect the specified or next argument position */
5729  if (widthpos > 0)
5730  arg = widthpos;
5731  if (arg >= nargs)
5732  ereport(ERROR,
5733  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5734  errmsg("too few arguments for format()")));
5735 
5736  /* Get the value and type of the selected argument */
5737  if (!funcvariadic)
5738  {
5740  isNull = PG_ARGISNULL(arg);
5741  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5742  }
5743  else
5744  {
 /* arrays are zero-based while "arg" counts from 1 */
5745  value = elements[arg - 1];
5746  isNull = nulls[arg - 1];
5747  typid = element_type;
5748  }
5749  if (!OidIsValid(typid))
5750  elog(ERROR, "could not determine data type of format() input");
5751 
5752  arg++;
5753 
5754  /* We can treat NULL width the same as zero */
5755  if (isNull)
5756  width = 0;
5757  else if (typid == INT4OID)
5758  width = DatumGetInt32(value);
5759  else if (typid == INT2OID)
5760  width = DatumGetInt16(value);
5761  else
5762  {
5763  /* For less-usual datatypes, convert to text then to int */
 /*
  * NOTE(review): this local "str" shadows the function-level "str"
  * output buffer for the rest of this block.
  */
5764  char *str;
5765 
5766  if (typid != prev_width_type)
5767  {
5768  Oid typoutputfunc;
5769  bool typIsVarlena;
5770 
5771  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5772  fmgr_info(typoutputfunc, &typoutputinfo_width);
5773  prev_width_type = typid;
5774  }
5775 
5776  str = OutputFunctionCall(&typoutputinfo_width, value);
5777 
5778  /* pg_strtoint32 will complain about bad data or overflow */
5779  width = pg_strtoint32(str);
5780 
5781  pfree(str);
5782  }
5783  }
5784 
5785  /* Collect the specified or next argument position */
5786  if (argpos > 0)
5787  arg = argpos;
5788  if (arg >= nargs)
5789  ereport(ERROR,
5790  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5791  errmsg("too few arguments for format()")));
5792 
5793  /* Get the value and type of the selected argument */
5794  if (!funcvariadic)
5795  {
5797  isNull = PG_ARGISNULL(arg);
5798  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5799  }
5800  else
5801  {
5802  value = elements[arg - 1];
5803  isNull = nulls[arg - 1];
5804  typid = element_type;
5805  }
5806  if (!OidIsValid(typid))
5807  elog(ERROR, "could not determine data type of format() input");
5808 
5809  arg++;
5810 
5811  /*
5812  * Get the appropriate typOutput function, reusing previous one if
5813  * same type as previous argument. That's particularly useful in the
5814  * variadic-array case, but often saves work even for ordinary calls.
5815  */
5816  if (typid != prev_type)
5817  {
5818  Oid typoutputfunc;
5819  bool typIsVarlena;
5820 
5821  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5822  fmgr_info(typoutputfunc, &typoutputfinfo);
5823  prev_type = typid;
5824  }
5825 
5826  /*
5827  * And now we can format the value.
5828  */
5829  switch (*cp)
5830  {
5831  case 's':
5832  case 'I':
5833  case 'L':
5834  text_format_string_conversion(&str, *cp, &typoutputfinfo,
5835  value, isNull,
5836  flags, width);
5837  break;
5838  default:
5839  /* should not get here, because of previous check */
5840  ereport(ERROR,
5841  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5842  errmsg("unrecognized format() type specifier \"%.*s\"",
5843  pg_mblen(cp), cp),
5844  errhint("For a single \"%%\" use \"%%%%\".")));
5845  break;
5846  }
5847  }
5848 
5849  /* Don't need deconstruct_array results anymore. */
5850  if (elements != NULL)
5851  pfree(elements);
5852  if (nulls != NULL)
5853  pfree(nulls);
5854 
5855  /* Generate results. */
5856  result = cstring_to_text_with_len(str.data, str.len);
5857  pfree(str.data);
5858 
5859  PG_RETURN_TEXT_P(result);
5860 }
5861 
5862 /*
5863  * Parse contiguous digits as a decimal number.
5864  *
5865  * Returns true if some digits could be parsed.
5866  * The value is returned into *value, and *ptr is advanced to the next
5867  * character to be parsed.
5868  *
5869  * Note parsing invariant: at least one character is known available before
5870  * string end (end_ptr) at entry, and this is still true at exit.
5871  */
5872 static bool
5873 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5874 {
5875  bool found = false;
5876  const char *cp = *ptr;
5877  int val = 0;
5878 
5879  while (*cp >= '0' && *cp <= '9')
5880  {
5881  int8 digit = (*cp - '0');
5882 
5883  if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5884  unlikely(pg_add_s32_overflow(val, digit, &val)))
5885  ereport(ERROR,
5886  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5887  errmsg("number is out of range")));
5888  ADVANCE_PARSE_POINTER(cp, end_ptr);
5889  found = true;
5890  }
5891 
5892  *ptr = cp;
5893  *value = val;
5894 
5895  return found;
5896 }
5897 
5898 /*
5899  * Parse a format specifier (generally following the SUS printf spec).
5900  *
5901  * We have already advanced over the initial '%', and we are looking for
5902  * [argpos][flags][width]type (but the type character is not consumed here).
5903  *
5904  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5905  * Output parameters:
5906  * argpos: argument position for value to be printed. -1 means unspecified.
5907  * widthpos: argument position for width. Zero means the argument position
5908  * was unspecified (ie, take the next arg) and -1 means no width
5909  * argument (width was omitted or specified as a constant).
5910  * flags: bitmask of flags.
5911  * width: directly-specified width value. Zero means the width was omitted
5912  * (note it's not necessary to distinguish this case from an explicit
5913  * zero width value).
5914  *
5915  * The function result is the next character position to be parsed, ie, the
5916  * location where the type character is/should be.
5917  *
5918  * Note parsing invariant: at least one character is known available before
5919  * string end (end_ptr) at entry, and this is still true at exit.
5920  */
5921 static const char *
5922 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5923  int *argpos, int *widthpos,
5924  int *flags, int *width)
5925 {
5926  const char *cp = start_ptr;
5927  int n;
5928 
5929  /* set defaults for output parameters */
5930  *argpos = -1;
5931  *widthpos = -1;
5932  *flags = 0;
5933  *width = 0;
5934 
5935  /* try to identify first number */
5936  if (text_format_parse_digits(&cp, end_ptr, &n))
5937  {
5938  if (*cp != '$')
5939  {
5940  /* Must be just a width and a type, so we're done */
5941  *width = n;
5942  return cp;
5943  }
5944  /* The number was argument position */
5945  *argpos = n;
5946  /* Explicit 0 for argument index is immediately refused */
5947  if (n == 0)
5948  ereport(ERROR,
5949  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5950  errmsg("format specifies argument 0, but arguments are numbered from 1")));
5951  ADVANCE_PARSE_POINTER(cp, end_ptr);
5952  }
5953 
5954  /* Handle flags (only minus is supported now) */
5955  while (*cp == '-')
5956  {
5957  *flags |= TEXT_FORMAT_FLAG_MINUS;
5958  ADVANCE_PARSE_POINTER(cp, end_ptr);
5959  }
5960 
5961  if (*cp == '*')
5962  {
5963  /* Handle indirect width */
5964  ADVANCE_PARSE_POINTER(cp, end_ptr);
5965  if (text_format_parse_digits(&cp, end_ptr, &n))
5966  {
5967  /* number in this position must be closed by $ */
5968  if (*cp != '$')
5969  ereport(ERROR,
5970  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5971  errmsg("width argument position must be ended by \"$\"")));
5972  /* The number was width argument position */
5973  *widthpos = n;
5974  /* Explicit 0 for argument index is immediately refused */
5975  if (n == 0)
5976  ereport(ERROR,
5977  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5978  errmsg("format specifies argument 0, but arguments are numbered from 1")));
5979  ADVANCE_PARSE_POINTER(cp, end_ptr);
5980  }
5981  else
5982  *widthpos = 0; /* width's argument position is unspecified */
5983  }
5984  else
5985  {
5986  /* Check for direct width specification */
5987  if (text_format_parse_digits(&cp, end_ptr, &n))
5988  *width = n;
5989  }
5990 
5991  /* cp should now be pointing at type character */
5992  return cp;
5993 }
5994 
5995 /*
5996  * Format a %s, %I, or %L conversion
5997  */
5998 static void
6000  FmgrInfo *typOutputInfo,
6001  Datum value, bool isNull,
6002  int flags, int width)
6003 {
6004  char *str;
6005