PostgreSQL Source Code  git master
varlena.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  * Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/detoast.h"
21 #include "catalog/pg_collation.h"
22 #include "catalog/pg_type.h"
23 #include "common/hashfn.h"
24 #include "common/int.h"
25 #include "common/unicode_norm.h"
26 #include "lib/hyperloglog.h"
27 #include "libpq/pqformat.h"
28 #include "miscadmin.h"
29 #include "parser/scansup.h"
30 #include "port/pg_bswap.h"
31 #include "regex/regex.h"
32 #include "utils/builtins.h"
33 #include "utils/bytea.h"
34 #include "utils/lsyscache.h"
35 #include "utils/memutils.h"
36 #include "utils/pg_locale.h"
37 #include "utils/sortsupport.h"
38 #include "utils/varlena.h"
39 
40 
41 /* GUC variable */
43 
44 typedef struct varlena unknown;
45 typedef struct varlena VarString;
46 
/*
 * State for text_position_* functions.
 *
 * Filled in by text_position_setup() and then passed to the other
 * text_position_* routines while searching one haystack for one needle.
 */
typedef struct
{
	bool		is_multibyte;	/* T if multibyte encoding */
	bool		is_multibyte_char_in_char;	/* T if a byte-level match must be
											 * verified to start on a
											 * character boundary (set for
											 * multibyte encodings other than
											 * UTF-8) */

	char	   *str1;			/* haystack string */
	char	   *str2;			/* needle string */
	int			len1;			/* string lengths in bytes */
	int			len2;

	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */

	char	   *last_match;		/* pointer to last match in 'str1' */

	/*
	 * Sometimes we need to convert the byte position of a match to a
	 * character position.  These store the last position that was converted,
	 * so that on the next call, we can continue from that point, rather than
	 * count characters from the very beginning.
	 */
	char	   *refpoint;		/* pointer within original haystack string */
	int			refpos;			/* 0-based character offset of the same point */
} TextPositionState;
75 
76 typedef struct
77 {
78  char *buf1; /* 1st string, or abbreviation original string
79  * buf */
80  char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
81  int buflen1;
82  int buflen2;
83  int last_len1; /* Length of last buf1 string/strxfrm() input */
84  int last_len2; /* Length of last buf2 string/strxfrm() blob */
85  int last_returned; /* Last comparison result (cache) */
86  bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
87  bool collate_c;
88  Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
89  hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
90  hyperLogLogState full_card; /* Full key cardinality state */
91  double prop_card; /* Required cardinality proportion */
94 
95 /*
96  * This should be large enough that most strings will fit, but small enough
97  * that we feel comfortable putting it on the stack
98  */
99 #define TEXTBUFLEN 1024
100 
101 #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X))
102 #define DatumGetUnknownPCopy(X) ((unknown *) PG_DETOAST_DATUM_COPY(X))
103 #define PG_GETARG_UNKNOWN_P(n) DatumGetUnknownP(PG_GETARG_DATUM(n))
104 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
105 #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x)
106 
107 #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
108 #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
109 
110 static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
111 static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
112 static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
113 static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
114 static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
115 static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
116 static int varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
117 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
118 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
119 static int32 text_length(Datum str);
120 static text *text_catenate(text *t1, text *t2);
121 static text *text_substring(Datum str,
122  int32 start,
123  int32 length,
124  bool length_not_specified);
125 static text *text_overlay(text *t1, text *t2, int sp, int sl);
126 static int text_position(text *t1, text *t2, Oid collid);
127 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
129 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
133 static void check_collation_set(Oid collid);
134 static int text_cmp(text *arg1, text *arg2, Oid collid);
135 static bytea *bytea_catenate(bytea *t1, bytea *t2);
137  int S,
138  int L,
139  bool length_not_specified);
140 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
141 static void appendStringInfoText(StringInfo str, const text *t);
144  const char *fldsep, const char *null_string);
146 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
147  int *value);
148 static const char *text_format_parse_format(const char *start_ptr,
149  const char *end_ptr,
150  int *argpos, int *widthpos,
151  int *flags, int *width);
152 static void text_format_string_conversion(StringInfo buf, char conversion,
153  FmgrInfo *typOutputInfo,
154  Datum value, bool isNull,
155  int flags, int width);
156 static void text_format_append_string(StringInfo buf, const char *str,
157  int flags, int width);
158 
159 
160 /*****************************************************************************
161  * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
162  *****************************************************************************/
163 
164 /*
165  * cstring_to_text
166  *
167  * Create a text value from a null-terminated C string.
168  *
169  * The new text value is freshly palloc'd with a full-size VARHDR.
170  */
171 text *
172 cstring_to_text(const char *s)
173 {
174  return cstring_to_text_with_len(s, strlen(s));
175 }
176 
177 /*
178  * cstring_to_text_with_len
179  *
180  * Same as cstring_to_text except the caller specifies the string length;
181  * the string need not be null_terminated.
182  */
183 text *
184 cstring_to_text_with_len(const char *s, int len)
185 {
186  text *result = (text *) palloc(len + VARHDRSZ);
187 
188  SET_VARSIZE(result, len + VARHDRSZ);
189  memcpy(VARDATA(result), s, len);
190 
191  return result;
192 }
193 
194 /*
195  * text_to_cstring
196  *
197  * Create a palloc'd, null-terminated C string from a text value.
198  *
199  * We support being passed a compressed or toasted text value.
200  * This is a bit bogus since such values shouldn't really be referred to as
201  * "text *", but it seems useful for robustness. If we didn't handle that
202  * case here, we'd need another routine that did, anyway.
203  */
204 char *
206 {
207  /* must cast away the const, unfortunately */
208  text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
209  int len = VARSIZE_ANY_EXHDR(tunpacked);
210  char *result;
211 
212  result = (char *) palloc(len + 1);
213  memcpy(result, VARDATA_ANY(tunpacked), len);
214  result[len] = '\0';
215 
216  if (tunpacked != t)
217  pfree(tunpacked);
218 
219  return result;
220 }
221 
222 /*
223  * text_to_cstring_buffer
224  *
225  * Copy a text value into a caller-supplied buffer of size dst_len.
226  *
227  * The text string is truncated if necessary to fit. The result is
228  * guaranteed null-terminated (unless dst_len == 0).
229  *
230  * We support being passed a compressed or toasted text value.
231  * This is a bit bogus since such values shouldn't really be referred to as
232  * "text *", but it seems useful for robustness. If we didn't handle that
233  * case here, we'd need another routine that did, anyway.
234  */
235 void
236 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
237 {
238  /* must cast away the const, unfortunately */
239  text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
240  size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
241 
242  if (dst_len > 0)
243  {
244  dst_len--;
245  if (dst_len >= src_len)
246  dst_len = src_len;
247  else /* ensure truncation is encoding-safe */
248  dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
249  memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
250  dst[dst_len] = '\0';
251  }
252 
253  if (srcunpacked != src)
254  pfree(srcunpacked);
255 }
256 
257 
258 /*****************************************************************************
259  * USER I/O ROUTINES *
260  *****************************************************************************/
261 
262 
263 #define VAL(CH) ((CH) - '0')
264 #define DIG(VAL) ((VAL) + '0')
265 
266 /*
267  * byteain - converts from printable representation of byte array
268  *
269  * Non-printable characters must be passed as '\nnn' (octal) and are
270  * converted to internal form. '\' must be passed as '\\'.
271  * ereport(ERROR, ...) if bad form.
272  *
273  * BUGS:
274  * The input is scanned twice.
275  * The error checking of input is minimal.
276  */
277 Datum
279 {
280  char *inputText = PG_GETARG_CSTRING(0);
281  char *tp;
282  char *rp;
283  int bc;
284  bytea *result;
285 
286  /* Recognize hex input */
287  if (inputText[0] == '\\' && inputText[1] == 'x')
288  {
289  size_t len = strlen(inputText);
290 
291  bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
292  result = palloc(bc);
293  bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
294  SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
295 
296  PG_RETURN_BYTEA_P(result);
297  }
298 
299  /* Else, it's the traditional escaped style */
300  for (bc = 0, tp = inputText; *tp != '\0'; bc++)
301  {
302  if (tp[0] != '\\')
303  tp++;
304  else if ((tp[0] == '\\') &&
305  (tp[1] >= '0' && tp[1] <= '3') &&
306  (tp[2] >= '0' && tp[2] <= '7') &&
307  (tp[3] >= '0' && tp[3] <= '7'))
308  tp += 4;
309  else if ((tp[0] == '\\') &&
310  (tp[1] == '\\'))
311  tp += 2;
312  else
313  {
314  /*
315  * one backslash, not followed by another or ### valid octal
316  */
317  ereport(ERROR,
318  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
319  errmsg("invalid input syntax for type %s", "bytea")));
320  }
321  }
322 
323  bc += VARHDRSZ;
324 
325  result = (bytea *) palloc(bc);
326  SET_VARSIZE(result, bc);
327 
328  tp = inputText;
329  rp = VARDATA(result);
330  while (*tp != '\0')
331  {
332  if (tp[0] != '\\')
333  *rp++ = *tp++;
334  else if ((tp[0] == '\\') &&
335  (tp[1] >= '0' && tp[1] <= '3') &&
336  (tp[2] >= '0' && tp[2] <= '7') &&
337  (tp[3] >= '0' && tp[3] <= '7'))
338  {
339  bc = VAL(tp[1]);
340  bc <<= 3;
341  bc += VAL(tp[2]);
342  bc <<= 3;
343  *rp++ = bc + VAL(tp[3]);
344 
345  tp += 4;
346  }
347  else if ((tp[0] == '\\') &&
348  (tp[1] == '\\'))
349  {
350  *rp++ = '\\';
351  tp += 2;
352  }
353  else
354  {
355  /*
356  * We should never get here. The first pass should not allow it.
357  */
358  ereport(ERROR,
359  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
360  errmsg("invalid input syntax for type %s", "bytea")));
361  }
362  }
363 
364  PG_RETURN_BYTEA_P(result);
365 }
366 
367 /*
368  * byteaout - converts to printable representation of byte array
369  *
370  * In the traditional escaped format, non-printable characters are
371  * printed as '\nnn' (octal) and '\' as '\\'.
372  */
373 Datum
375 {
376  bytea *vlena = PG_GETARG_BYTEA_PP(0);
377  char *result;
378  char *rp;
379 
381  {
382  /* Print hex format */
383  rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
384  *rp++ = '\\';
385  *rp++ = 'x';
386  rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
387  }
388  else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
389  {
390  /* Print traditional escaped format */
391  char *vp;
392  uint64 len;
393  int i;
394 
395  len = 1; /* empty string has 1 char */
396  vp = VARDATA_ANY(vlena);
397  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
398  {
399  if (*vp == '\\')
400  len += 2;
401  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
402  len += 4;
403  else
404  len++;
405  }
406 
407  /*
408  * In principle len can't overflow uint32 if the input fit in 1GB, but
409  * for safety let's check rather than relying on palloc's internal
410  * check.
411  */
412  if (len > MaxAllocSize)
413  ereport(ERROR,
414  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
415  errmsg_internal("result of bytea output conversion is too large")));
416  rp = result = (char *) palloc(len);
417 
418  vp = VARDATA_ANY(vlena);
419  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
420  {
421  if (*vp == '\\')
422  {
423  *rp++ = '\\';
424  *rp++ = '\\';
425  }
426  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
427  {
428  int val; /* holds unprintable chars */
429 
430  val = *vp;
431  rp[0] = '\\';
432  rp[3] = DIG(val & 07);
433  val >>= 3;
434  rp[2] = DIG(val & 07);
435  val >>= 3;
436  rp[1] = DIG(val & 03);
437  rp += 4;
438  }
439  else
440  *rp++ = *vp;
441  }
442  }
443  else
444  {
445  elog(ERROR, "unrecognized bytea_output setting: %d",
446  bytea_output);
447  rp = result = NULL; /* keep compiler quiet */
448  }
449  *rp = '\0';
450  PG_RETURN_CSTRING(result);
451 }
452 
453 /*
454  * bytearecv - converts external binary format to bytea
455  */
456 Datum
458 {
460  bytea *result;
461  int nbytes;
462 
463  nbytes = buf->len - buf->cursor;
464  result = (bytea *) palloc(nbytes + VARHDRSZ);
465  SET_VARSIZE(result, nbytes + VARHDRSZ);
466  pq_copymsgbytes(buf, VARDATA(result), nbytes);
467  PG_RETURN_BYTEA_P(result);
468 }
469 
470 /*
471  * byteasend - converts bytea to binary format
472  *
473  * This is a special case: just copy the input...
474  */
475 Datum
477 {
478  bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
479 
480  PG_RETURN_BYTEA_P(vlena);
481 }
482 
483 Datum
485 {
487 
488  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
489 
490  /* Append the value unless null. */
491  if (!PG_ARGISNULL(1))
492  {
494 
495  /* On the first time through, we ignore the delimiter. */
496  if (state == NULL)
497  state = makeStringAggState(fcinfo);
498  else if (!PG_ARGISNULL(2))
499  {
500  bytea *delim = PG_GETARG_BYTEA_PP(2);
501 
503  }
504 
506  }
507 
508  /*
509  * The transition type for string_agg() is declared to be "internal",
510  * which is a pass-by-value type the same size as a pointer.
511  */
512  PG_RETURN_POINTER(state);
513 }
514 
515 Datum
517 {
519 
520  /* cannot be called directly because of internal-type argument */
521  Assert(AggCheckCallContext(fcinfo, NULL));
522 
523  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
524 
525  if (state != NULL)
526  {
527  bytea *result;
528 
529  result = (bytea *) palloc(state->len + VARHDRSZ);
530  SET_VARSIZE(result, state->len + VARHDRSZ);
531  memcpy(VARDATA(result), state->data, state->len);
532  PG_RETURN_BYTEA_P(result);
533  }
534  else
535  PG_RETURN_NULL();
536 }
537 
538 /*
539  * textin - converts "..." to internal representation
540  */
541 Datum
543 {
544  char *inputText = PG_GETARG_CSTRING(0);
545 
546  PG_RETURN_TEXT_P(cstring_to_text(inputText));
547 }
548 
549 /*
550  * textout - converts internal representation to "..."
551  */
552 Datum
554 {
555  Datum txt = PG_GETARG_DATUM(0);
556 
558 }
559 
560 /*
561  * textrecv - converts external binary format to text
562  */
563 Datum
565 {
567  text *result;
568  char *str;
569  int nbytes;
570 
571  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
572 
573  result = cstring_to_text_with_len(str, nbytes);
574  pfree(str);
575  PG_RETURN_TEXT_P(result);
576 }
577 
578 /*
579  * textsend - converts text to binary format
580  */
581 Datum
583 {
584  text *t = PG_GETARG_TEXT_PP(0);
586 
587  pq_begintypsend(&buf);
590 }
591 
592 
593 /*
594  * unknownin - converts "..." to internal representation
595  */
596 Datum
598 {
599  char *str = PG_GETARG_CSTRING(0);
600 
601  /* representation is same as cstring */
603 }
604 
605 /*
606  * unknownout - converts internal representation to "..."
607  */
608 Datum
610 {
611  /* representation is same as cstring */
612  char *str = PG_GETARG_CSTRING(0);
613 
615 }
616 
617 /*
618  * unknownrecv - converts external binary format to unknown
619  */
620 Datum
622 {
624  char *str;
625  int nbytes;
626 
627  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
628  /* representation is same as cstring */
629  PG_RETURN_CSTRING(str);
630 }
631 
632 /*
633  * unknownsend - converts unknown to binary format
634  */
635 Datum
637 {
638  /* representation is same as cstring */
639  char *str = PG_GETARG_CSTRING(0);
641 
642  pq_begintypsend(&buf);
643  pq_sendtext(&buf, str, strlen(str));
645 }
646 
647 
648 /* ========== PUBLIC ROUTINES ========== */
649 
650 /*
651  * textlen -
652  * returns the logical length of a text*
653  * (which is less than the VARSIZE of the text*)
654  */
655 Datum
657 {
659 
660  /* try to avoid decompressing argument */
662 }
663 
664 /*
665  * text_length -
666  * Does the real work for textlen()
667  *
668  * This is broken out so it can be called directly by other string processing
669  * functions. Note that the argument is passed as a Datum, to indicate that
670  * it may still be in compressed form. We can avoid decompressing it at all
671  * in some cases.
672  */
673 static int32
675 {
676  /* fastpath when max encoding length is one */
679  else
680  {
681  text *t = DatumGetTextPP(str);
682 
684  VARSIZE_ANY_EXHDR(t)));
685  }
686 }
687 
688 /*
689  * textoctetlen -
690  * returns the physical length of a text*
691  * (which is less than the VARSIZE of the text*)
692  */
693 Datum
695 {
697 
698  /* We need not detoast the input at all */
700 }
701 
702 /*
703  * textcat -
704  * takes two text* and returns a text* that is the concatenation of
705  * the two.
706  *
707  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
708  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
709  * Allocate space for output in all cases.
710  * XXX - thomas 1997-07-10
711  */
712 Datum
714 {
715  text *t1 = PG_GETARG_TEXT_PP(0);
716  text *t2 = PG_GETARG_TEXT_PP(1);
717 
719 }
720 
721 /*
722  * text_catenate
723  * Guts of textcat(), broken out so it can be used by other functions
724  *
725  * Arguments can be in short-header form, but not compressed or out-of-line
726  */
727 static text *
729 {
730  text *result;
731  int len1,
732  len2,
733  len;
734  char *ptr;
735 
736  len1 = VARSIZE_ANY_EXHDR(t1);
737  len2 = VARSIZE_ANY_EXHDR(t2);
738 
739  /* paranoia ... probably should throw error instead? */
740  if (len1 < 0)
741  len1 = 0;
742  if (len2 < 0)
743  len2 = 0;
744 
745  len = len1 + len2 + VARHDRSZ;
746  result = (text *) palloc(len);
747 
748  /* Set size of result string... */
749  SET_VARSIZE(result, len);
750 
751  /* Fill data field of result string... */
752  ptr = VARDATA(result);
753  if (len1 > 0)
754  memcpy(ptr, VARDATA_ANY(t1), len1);
755  if (len2 > 0)
756  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
757 
758  return result;
759 }
760 
761 /*
762  * charlen_to_bytelen()
763  * Compute the number of bytes occupied by n characters starting at *p
764  *
765  * It is caller's responsibility that there actually are n characters;
766  * the string need not be null-terminated.
767  */
768 static int
769 charlen_to_bytelen(const char *p, int n)
770 {
772  {
773  /* Optimization for single-byte encodings */
774  return n;
775  }
776  else
777  {
778  const char *s;
779 
780  for (s = p; n > 0; n--)
781  s += pg_mblen(s);
782 
783  return s - p;
784  }
785 }
786 
787 /*
788  * text_substr()
789  * Return a substring starting at the specified position.
790  * - thomas 1997-12-31
791  *
792  * Input:
793  * - string
794  * - starting position (is one-based)
795  * - string length
796  *
797  * If the starting position is zero or less, then return from the start of the string
798  * adjusting the length to be consistent with the "negative start" per SQL.
799  * If the length is less than zero, return the remaining string.
800  *
801  * Added multibyte support.
802  * - Tatsuo Ishii 1998-4-21
803  * Changed behavior if starting position is less than one to conform to SQL behavior.
804  * Formerly returned the entire string; now returns a portion.
805  * - Thomas Lockhart 1998-12-10
806  * Now uses faster TOAST-slicing interface
807  * - John Gray 2002-02-22
808  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
809  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
810  * error; if E < 1, return '', not entire string). Fixed MB related bug when
811  * S > LC and < LC + 4 sometimes garbage characters are returned.
812  * - Joe Conway 2002-08-10
813  */
814 Datum
816 {
818  PG_GETARG_INT32(1),
819  PG_GETARG_INT32(2),
820  false));
821 }
822 
823 /*
824  * text_substr_no_len -
825  * Wrapper to avoid opr_sanity failure due to
826  * one function accepting a different number of args.
827  */
828 Datum
830 {
832  PG_GETARG_INT32(1),
833  -1, true));
834 }
835 
836 /*
837  * text_substring -
838  * Does the real work for text_substr() and text_substr_no_len()
839  *
840  * This is broken out so it can be called directly by other string processing
841  * functions. Note that the argument is passed as a Datum, to indicate that
842  * it may still be in compressed/toasted form. We can avoid detoasting all
843  * of it in some cases.
844  *
845  * The result is always a freshly palloc'd datum.
846  */
847 static text *
848 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
849 {
851  int32 S = start; /* start position */
852  int32 S1; /* adjusted start position */
853  int32 L1; /* adjusted substring length */
854 
855  /* life is easy if the encoding max length is 1 */
856  if (eml == 1)
857  {
858  S1 = Max(S, 1);
859 
860  if (length_not_specified) /* special case - get length to end of
861  * string */
862  L1 = -1;
863  else
864  {
865  /* end position */
866  int E = S + length;
867 
868  /*
869  * A negative value for L is the only way for the end position to
870  * be before the start. SQL99 says to throw an error.
871  */
872  if (E < S)
873  ereport(ERROR,
874  (errcode(ERRCODE_SUBSTRING_ERROR),
875  errmsg("negative substring length not allowed")));
876 
877  /*
878  * A zero or negative value for the end position can happen if the
879  * start was negative or one. SQL99 says to return a zero-length
880  * string.
881  */
882  if (E < 1)
883  return cstring_to_text("");
884 
885  L1 = E - S1;
886  }
887 
888  /*
889  * If the start position is past the end of the string, SQL99 says to
890  * return a zero-length string -- DatumGetTextPSlice() will do
891  * that for us. Convert to a zero-based starting position.
892  */
893  return DatumGetTextPSlice(str, S1 - 1, L1);
894  }
895  else if (eml > 1)
896  {
897  /*
898  * When encoding max length is > 1, we can't get LC without
899  * detoasting, so we'll grab a conservatively large slice now and go
900  * back later to do the right thing
901  */
902  int32 slice_start;
903  int32 slice_size;
904  int32 slice_strlen;
905  text *slice;
906  int32 E1;
907  int32 i;
908  char *p;
909  char *s;
910  text *ret;
911 
912  /*
913  * if S is past the end of the string, the tuple toaster will return a
914  * zero-length string to us
915  */
916  S1 = Max(S, 1);
917 
918  /*
919  * We need to start at position zero because there is no way to know
920  * in advance which byte offset corresponds to the supplied start
921  * position.
922  */
923  slice_start = 0;
924 
925  if (length_not_specified) /* special case - get length to end of
926  * string */
927  slice_size = L1 = -1;
928  else
929  {
930  int E = S + length;
931 
932  /*
933  * A negative value for L is the only way for the end position to
934  * be before the start. SQL99 says to throw an error.
935  */
936  if (E < S)
937  ereport(ERROR,
938  (errcode(ERRCODE_SUBSTRING_ERROR),
939  errmsg("negative substring length not allowed")));
940 
941  /*
942  * A zero or negative value for the end position can happen if the
943  * start was negative or one. SQL99 says to return a zero-length
944  * string.
945  */
946  if (E < 1)
947  return cstring_to_text("");
948 
949  /*
950  * if E is past the end of the string, the tuple toaster will
951  * truncate the length for us
952  */
953  L1 = E - S1;
954 
955  /*
956  * Total slice size in bytes can't be any longer than the start
957  * position plus substring length times the encoding max length.
958  */
959  slice_size = (S1 + L1) * eml;
960  }
961 
962  /*
963  * If we're working with an untoasted source, no need to do an extra
964  * copying step.
965  */
968  slice = DatumGetTextPSlice(str, slice_start, slice_size);
969  else
970  slice = (text *) DatumGetPointer(str);
971 
972  /* see if we got back an empty string */
973  if (VARSIZE_ANY_EXHDR(slice) == 0)
974  {
975  if (slice != (text *) DatumGetPointer(str))
976  pfree(slice);
977  return cstring_to_text("");
978  }
979 
980  /* Now we can get the actual length of the slice in MB characters */
981  slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
982  VARSIZE_ANY_EXHDR(slice));
983 
984  /*
985  * Check that the start position wasn't > slice_strlen. If so, SQL99
986  * says to return a zero-length string.
987  */
988  if (S1 > slice_strlen)
989  {
990  if (slice != (text *) DatumGetPointer(str))
991  pfree(slice);
992  return cstring_to_text("");
993  }
994 
995  /*
996  * Adjust L1 and E1 now that we know the slice string length. Again
997  * remember that S1 is one based, and slice_start is zero based.
998  */
999  if (L1 > -1)
1000  E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1001  else
1002  E1 = slice_start + 1 + slice_strlen;
1003 
1004  /*
1005  * Find the start position in the slice; remember S1 is not zero based
1006  */
1007  p = VARDATA_ANY(slice);
1008  for (i = 0; i < S1 - 1; i++)
1009  p += pg_mblen(p);
1010 
1011  /* hang onto a pointer to our start position */
1012  s = p;
1013 
1014  /*
1015  * Count the actual bytes used by the substring of the requested
1016  * length.
1017  */
1018  for (i = S1; i < E1; i++)
1019  p += pg_mblen(p);
1020 
1021  ret = (text *) palloc(VARHDRSZ + (p - s));
1022  SET_VARSIZE(ret, VARHDRSZ + (p - s));
1023  memcpy(VARDATA(ret), s, (p - s));
1024 
1025  if (slice != (text *) DatumGetPointer(str))
1026  pfree(slice);
1027 
1028  return ret;
1029  }
1030  else
1031  elog(ERROR, "invalid backend encoding: encoding max length < 1");
1032 
1033  /* not reached: suppress compiler warning */
1034  return NULL;
1035 }
1036 
1037 /*
1038  * textoverlay
1039  * Replace specified substring of first string with second
1040  *
1041  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1042  * This code is a direct implementation of what the standard says.
1043  */
1044 Datum
1046 {
1047  text *t1 = PG_GETARG_TEXT_PP(0);
1048  text *t2 = PG_GETARG_TEXT_PP(1);
1049  int sp = PG_GETARG_INT32(2); /* substring start position */
1050  int sl = PG_GETARG_INT32(3); /* substring length */
1051 
1052  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1053 }
1054 
1055 Datum
1057 {
1058  text *t1 = PG_GETARG_TEXT_PP(0);
1059  text *t2 = PG_GETARG_TEXT_PP(1);
1060  int sp = PG_GETARG_INT32(2); /* substring start position */
1061  int sl;
1062 
1063  sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1064  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1065 }
1066 
1067 static text *
1068 text_overlay(text *t1, text *t2, int sp, int sl)
1069 {
1070  text *result;
1071  text *s1;
1072  text *s2;
1073  int sp_pl_sl;
1074 
1075  /*
1076  * Check for possible integer-overflow cases. For negative sp, throw a
1077  * "substring length" error because that's what should be expected
1078  * according to the spec's definition of OVERLAY().
1079  */
1080  if (sp <= 0)
1081  ereport(ERROR,
1082  (errcode(ERRCODE_SUBSTRING_ERROR),
1083  errmsg("negative substring length not allowed")));
1084  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1085  ereport(ERROR,
1086  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1087  errmsg("integer out of range")));
1088 
1089  s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1090  s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1091  result = text_catenate(s1, t2);
1092  result = text_catenate(result, s2);
1093 
1094  return result;
1095 }
1096 
1097 /*
1098  * textpos -
1099  * Return the position of the specified substring.
1100  * Implements the SQL POSITION() function.
1101  * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1102  * - thomas 1997-07-27
1103  */
1104 Datum
1106 {
1107  text *str = PG_GETARG_TEXT_PP(0);
1108  text *search_str = PG_GETARG_TEXT_PP(1);
1109 
1110  PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1111 }
1112 
1113 /*
1114  * text_position -
1115  * Does the real work for textpos()
1116  *
1117  * Inputs:
1118  * t1 - string to be searched
1119  * t2 - pattern to match within t1
1120  * Result:
1121  * Character index of the first matched char, starting from 1,
1122  * or 0 if no match.
1123  *
1124  * This is broken out so it can be called directly by other string processing
1125  * functions.
1126  */
1127 static int
1128 text_position(text *t1, text *t2, Oid collid)
1129 {
1131  int result;
1132 
1133  /* Empty needle always matches at position 1 */
1134  if (VARSIZE_ANY_EXHDR(t2) < 1)
1135  return 1;
1136 
1137  /* Otherwise, can't match if haystack is shorter than needle */
1138  if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1139  return 0;
1140 
1141  text_position_setup(t1, t2, collid, &state);
1142  if (!text_position_next(&state))
1143  result = 0;
1144  else
1145  result = text_position_get_match_pos(&state);
1146  text_position_cleanup(&state);
1147  return result;
1148 }
1149 
1150 
1151 /*
1152  * text_position_setup, text_position_next, text_position_cleanup -
1153  * Component steps of text_position()
1154  *
1155  * These are broken out so that a string can be efficiently searched for
1156  * multiple occurrences of the same pattern. text_position_next may be
1157  * called multiple times, and it advances to the next match on each call.
1158  * text_position_get_match_ptr() and text_position_get_match_pos() return
1159  * a pointer or 1-based character position of the last match, respectively.
1160  *
1161  * The "state" variable is normally just a local variable in the caller.
1162  *
1163  * NOTE: text_position_next skips over the matched portion. For example,
1164  * searching for "xx" in "xxx" returns only one match, not two.
1165  */
1166 
1167 static void
1169 {
1170  int len1 = VARSIZE_ANY_EXHDR(t1);
1171  int len2 = VARSIZE_ANY_EXHDR(t2);
1172  pg_locale_t mylocale = 0;
1173 
1174  check_collation_set(collid);
1175 
1176  if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1177  mylocale = pg_newlocale_from_collation(collid);
1178 
1179  if (mylocale && !mylocale->deterministic)
1180  ereport(ERROR,
1181  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1182  errmsg("nondeterministic collations are not supported for substring searches")));
1183 
1184  Assert(len1 > 0);
1185  Assert(len2 > 0);
1186 
1187  /*
1188  * Even with a multi-byte encoding, we perform the search using the raw
1189  * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1190  * because in UTF-8 the byte sequence of one character cannot contain
1191  * another character. For other multi-byte encodings, we do the search
1192  * initially as a simple byte search, ignoring multibyte issues, but
1193  * verify afterwards that the match we found is at a character boundary,
1194  * and continue the search if it was a false match.
1195  */
1197  {
1198  state->is_multibyte = false;
1199  state->is_multibyte_char_in_char = false;
1200  }
1201  else if (GetDatabaseEncoding() == PG_UTF8)
1202  {
1203  state->is_multibyte = true;
1204  state->is_multibyte_char_in_char = false;
1205  }
1206  else
1207  {
1208  state->is_multibyte = true;
1209  state->is_multibyte_char_in_char = true;
1210  }
1211 
1212  state->str1 = VARDATA_ANY(t1);
1213  state->str2 = VARDATA_ANY(t2);
1214  state->len1 = len1;
1215  state->len2 = len2;
1216  state->last_match = NULL;
1217  state->refpoint = state->str1;
1218  state->refpos = 0;
1219 
1220  /*
1221  * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1222  * notes we use the terminology that the "haystack" is the string to be
1223  * searched (t1) and the "needle" is the pattern being sought (t2).
1224  *
1225  * If the needle is empty or bigger than the haystack then there is no
1226  * point in wasting cycles initializing the table. We also choose not to
1227  * use B-M-H for needles of length 1, since the skip table can't possibly
1228  * save anything in that case.
1229  */
1230  if (len1 >= len2 && len2 > 1)
1231  {
1232  int searchlength = len1 - len2;
1233  int skiptablemask;
1234  int last;
1235  int i;
1236  const char *str2 = state->str2;
1237 
1238  /*
1239  * First we must determine how much of the skip table to use. The
1240  * declaration of TextPositionState allows up to 256 elements, but for
1241  * short search problems we don't really want to have to initialize so
1242  * many elements --- it would take too long in comparison to the
1243  * actual search time. So we choose a useful skip table size based on
1244  * the haystack length minus the needle length. The closer the needle
1245  * length is to the haystack length the less useful skipping becomes.
1246  *
1247  * Note: since we use bit-masking to select table elements, the skip
1248  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1249  */
1250  if (searchlength < 16)
1251  skiptablemask = 3;
1252  else if (searchlength < 64)
1253  skiptablemask = 7;
1254  else if (searchlength < 128)
1255  skiptablemask = 15;
1256  else if (searchlength < 512)
1257  skiptablemask = 31;
1258  else if (searchlength < 2048)
1259  skiptablemask = 63;
1260  else if (searchlength < 4096)
1261  skiptablemask = 127;
1262  else
1263  skiptablemask = 255;
1264  state->skiptablemask = skiptablemask;
1265 
1266  /*
1267  * Initialize the skip table. We set all elements to the needle
1268  * length, since this is the correct skip distance for any character
1269  * not found in the needle.
1270  */
1271  for (i = 0; i <= skiptablemask; i++)
1272  state->skiptable[i] = len2;
1273 
1274  /*
1275  * Now examine the needle. For each character except the last one,
1276  * set the corresponding table element to the appropriate skip
1277  * distance. Note that when two characters share the same skip table
1278  * entry, the one later in the needle must determine the skip
1279  * distance.
1280  */
1281  last = len2 - 1;
1282 
1283  for (i = 0; i < last; i++)
1284  state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1285  }
1286 }
1287 
1288 /*
1289  * Advance to the next match, starting from the end of the previous match
1290  * (or the beginning of the string, on first call). Returns true if a match
1291  * is found.
1292  *
1293  * Note that this refuses to match an empty-string needle. Most callers
1294  * will have handled that case specially and we'll never see it here.
1295  */
1296 static bool
1298 {
1299  int needle_len = state->len2;
1300  char *start_ptr;
1301  char *matchptr;
1302 
1303  if (needle_len <= 0)
1304  return false; /* result for empty pattern */
1305 
1306  /* Start from the point right after the previous match. */
1307  if (state->last_match)
1308  start_ptr = state->last_match + needle_len;
1309  else
1310  start_ptr = state->str1;
1311 
1312 retry:
1313  matchptr = text_position_next_internal(start_ptr, state);
1314 
1315  if (!matchptr)
1316  return false;
1317 
1318  /*
1319  * Found a match for the byte sequence. If this is a multibyte encoding,
1320  * where one character's byte sequence can appear inside a longer
1321  * multi-byte character, we need to verify that the match was at a
1322  * character boundary, not in the middle of a multi-byte character.
1323  */
1324  if (state->is_multibyte_char_in_char)
1325  {
1326  /* Walk one character at a time, until we reach the match. */
1327 
1328  /* the search should never move backwards. */
1329  Assert(state->refpoint <= matchptr);
1330 
1331  while (state->refpoint < matchptr)
1332  {
1333  /* step to next character. */
1334  state->refpoint += pg_mblen(state->refpoint);
1335  state->refpos++;
1336 
1337  /*
1338  * If we stepped over the match's start position, then it was a
1339  * false positive, where the byte sequence appeared in the middle
1340  * of a multi-byte character. Skip it, and continue the search at
1341  * the next character boundary.
1342  */
1343  if (state->refpoint > matchptr)
1344  {
1345  start_ptr = state->refpoint;
1346  goto retry;
1347  }
1348  }
1349  }
1350 
1351  state->last_match = matchptr;
1352  return true;
1353 }
1354 
1355 /*
1356  * Subroutine of text_position_next(). This searches for the raw byte
1357  * sequence, ignoring any multi-byte encoding issues. Returns the first
1358  * match starting at 'start_ptr', or NULL if no match is found.
1359  */
1360 static char *
1362 {
1363  int haystack_len = state->len1;
1364  int needle_len = state->len2;
1365  int skiptablemask = state->skiptablemask;
1366  const char *haystack = state->str1;
1367  const char *needle = state->str2;
1368  const char *haystack_end = &haystack[haystack_len];
1369  const char *hptr;
1370 
1371  Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1372 
1373  if (needle_len == 1)
1374  {
1375  /* No point in using B-M-H for a one-character needle */
1376  char nchar = *needle;
1377 
1378  hptr = start_ptr;
1379  while (hptr < haystack_end)
1380  {
1381  if (*hptr == nchar)
1382  return (char *) hptr;
1383  hptr++;
1384  }
1385  }
1386  else
1387  {
1388  const char *needle_last = &needle[needle_len - 1];
1389 
1390  /* Start at startpos plus the length of the needle */
1391  hptr = start_ptr + needle_len - 1;
1392  while (hptr < haystack_end)
1393  {
1394  /* Match the needle scanning *backward* */
1395  const char *nptr;
1396  const char *p;
1397 
1398  nptr = needle_last;
1399  p = hptr;
1400  while (*nptr == *p)
1401  {
1402  /* Matched it all? If so, return 1-based position */
1403  if (nptr == needle)
1404  return (char *) p;
1405  nptr--, p--;
1406  }
1407 
1408  /*
1409  * No match, so use the haystack char at hptr to decide how far to
1410  * advance. If the needle had any occurrence of that character
1411  * (or more precisely, one sharing the same skiptable entry)
1412  * before its last character, then we advance far enough to align
1413  * the last such needle character with that haystack position.
1414  * Otherwise we can advance by the whole needle length.
1415  */
1416  hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1417  }
1418  }
1419 
1420  return 0; /* not found */
1421 }
1422 
1423 /*
1424  * Return a pointer to the current match.
1425  *
1426  * The returned pointer points into correct position in the original
1427  * the haystack string.
1428  */
1429 static char *
1431 {
1432  return state->last_match;
1433 }
1434 
1435 /*
1436  * Return the offset of the current match.
1437  *
1438  * The offset is in characters, 1-based.
1439  */
1440 static int
1442 {
1443  if (!state->is_multibyte)
1444  return state->last_match - state->str1 + 1;
1445  else
1446  {
1447  /* Convert the byte position to char position. */
1448  while (state->refpoint < state->last_match)
1449  {
1450  state->refpoint += pg_mblen(state->refpoint);
1451  state->refpos++;
1452  }
1453  Assert(state->refpoint == state->last_match);
1454  return state->refpos + 1;
1455  }
1456 }
1457 
1458 static void
1460 {
1461  /* no cleanup needed */
1462 }
1463 
1464 static void
1466 {
1467  if (!OidIsValid(collid))
1468  {
1469  /*
1470  * This typically means that the parser could not resolve a conflict
1471  * of implicit collations, so report it that way.
1472  */
1473  ereport(ERROR,
1474  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1475  errmsg("could not determine which collation to use for string comparison"),
1476  errhint("Use the COLLATE clause to set the collation explicitly.")));
1477  }
1478 }
1479 
1480 /* varstr_cmp()
1481  * Comparison function for text strings with given lengths.
1482  * Includes locale support, but must copy strings to temporary memory
1483  * to allow null-termination for inputs to strcoll().
1484  * Returns an integer less than, equal to, or greater than zero, indicating
1485  * whether arg1 is less than, equal to, or greater than arg2.
1486  *
1487  * Note: many functions that depend on this are marked leakproof; therefore,
1488  * avoid reporting the actual contents of the input when throwing errors.
1489  * All errors herein should be things that can't happen except on corrupt
1490  * data, anyway; otherwise we will have trouble with indexing strings that
1491  * would cause them.
1492  */
1493 int
1494 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1495 {
1496  int result;
1497 
1498  check_collation_set(collid);
1499 
1500  /*
1501  * Unfortunately, there is no strncoll(), so in the non-C locale case we
1502  * have to do some memory copying. This turns out to be significantly
1503  * slower, so we optimize the case where LC_COLLATE is C. We also try to
1504  * optimize relatively-short strings by avoiding palloc/pfree overhead.
1505  */
1506  if (lc_collate_is_c(collid))
1507  {
1508  result = memcmp(arg1, arg2, Min(len1, len2));
1509  if ((result == 0) && (len1 != len2))
1510  result = (len1 < len2) ? -1 : 1;
1511  }
1512  else
1513  {
1514  char a1buf[TEXTBUFLEN];
1515  char a2buf[TEXTBUFLEN];
1516  char *a1p,
1517  *a2p;
1518  pg_locale_t mylocale = 0;
1519 
1520  if (collid != DEFAULT_COLLATION_OID)
1521  mylocale = pg_newlocale_from_collation(collid);
1522 
1523  /*
1524  * memcmp() can't tell us which of two unequal strings sorts first,
1525  * but it's a cheap way to tell if they're equal. Testing shows that
1526  * memcmp() followed by strcoll() is only trivially slower than
1527  * strcoll() by itself, so we don't lose much if this doesn't work out
1528  * very often, and if it does - for example, because there are many
1529  * equal strings in the input - then we win big by avoiding expensive
1530  * collation-aware comparisons.
1531  */
1532  if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1533  return 0;
1534 
1535 #ifdef WIN32
1536  /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1537  if (GetDatabaseEncoding() == PG_UTF8
1538  && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
1539  {
1540  int a1len;
1541  int a2len;
1542  int r;
1543 
1544  if (len1 >= TEXTBUFLEN / 2)
1545  {
1546  a1len = len1 * 2 + 2;
1547  a1p = palloc(a1len);
1548  }
1549  else
1550  {
1551  a1len = TEXTBUFLEN;
1552  a1p = a1buf;
1553  }
1554  if (len2 >= TEXTBUFLEN / 2)
1555  {
1556  a2len = len2 * 2 + 2;
1557  a2p = palloc(a2len);
1558  }
1559  else
1560  {
1561  a2len = TEXTBUFLEN;
1562  a2p = a2buf;
1563  }
1564 
1565  /* stupid Microsloth API does not work for zero-length input */
1566  if (len1 == 0)
1567  r = 0;
1568  else
1569  {
1570  r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1571  (LPWSTR) a1p, a1len / 2);
1572  if (!r)
1573  ereport(ERROR,
1574  (errmsg("could not convert string to UTF-16: error code %lu",
1575  GetLastError())));
1576  }
1577  ((LPWSTR) a1p)[r] = 0;
1578 
1579  if (len2 == 0)
1580  r = 0;
1581  else
1582  {
1583  r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1584  (LPWSTR) a2p, a2len / 2);
1585  if (!r)
1586  ereport(ERROR,
1587  (errmsg("could not convert string to UTF-16: error code %lu",
1588  GetLastError())));
1589  }
1590  ((LPWSTR) a2p)[r] = 0;
1591 
1592  errno = 0;
1593 #ifdef HAVE_LOCALE_T
1594  if (mylocale)
1595  result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
1596  else
1597 #endif
1598  result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1599  if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1600  * headers */
1601  ereport(ERROR,
1602  (errmsg("could not compare Unicode strings: %m")));
1603 
1604  /* Break tie if necessary. */
1605  if (result == 0 &&
1606  (!mylocale || mylocale->deterministic))
1607  {
1608  result = memcmp(arg1, arg2, Min(len1, len2));
1609  if ((result == 0) && (len1 != len2))
1610  result = (len1 < len2) ? -1 : 1;
1611  }
1612 
1613  if (a1p != a1buf)
1614  pfree(a1p);
1615  if (a2p != a2buf)
1616  pfree(a2p);
1617 
1618  return result;
1619  }
1620 #endif /* WIN32 */
1621 
1622  if (len1 >= TEXTBUFLEN)
1623  a1p = (char *) palloc(len1 + 1);
1624  else
1625  a1p = a1buf;
1626  if (len2 >= TEXTBUFLEN)
1627  a2p = (char *) palloc(len2 + 1);
1628  else
1629  a2p = a2buf;
1630 
1631  memcpy(a1p, arg1, len1);
1632  a1p[len1] = '\0';
1633  memcpy(a2p, arg2, len2);
1634  a2p[len2] = '\0';
1635 
1636  if (mylocale)
1637  {
1638  if (mylocale->provider == COLLPROVIDER_ICU)
1639  {
1640 #ifdef USE_ICU
1641 #ifdef HAVE_UCOL_STRCOLLUTF8
1642  if (GetDatabaseEncoding() == PG_UTF8)
1643  {
1644  UErrorCode status;
1645 
1646  status = U_ZERO_ERROR;
1647  result = ucol_strcollUTF8(mylocale->info.icu.ucol,
1648  arg1, len1,
1649  arg2, len2,
1650  &status);
1651  if (U_FAILURE(status))
1652  ereport(ERROR,
1653  (errmsg("collation failed: %s", u_errorName(status))));
1654  }
1655  else
1656 #endif
1657  {
1658  int32_t ulen1,
1659  ulen2;
1660  UChar *uchar1,
1661  *uchar2;
1662 
1663  ulen1 = icu_to_uchar(&uchar1, arg1, len1);
1664  ulen2 = icu_to_uchar(&uchar2, arg2, len2);
1665 
1666  result = ucol_strcoll(mylocale->info.icu.ucol,
1667  uchar1, ulen1,
1668  uchar2, ulen2);
1669 
1670  pfree(uchar1);
1671  pfree(uchar2);
1672  }
1673 #else /* not USE_ICU */
1674  /* shouldn't happen */
1675  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1676 #endif /* not USE_ICU */
1677  }
1678  else
1679  {
1680 #ifdef HAVE_LOCALE_T
1681  result = strcoll_l(a1p, a2p, mylocale->info.lt);
1682 #else
1683  /* shouldn't happen */
1684  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1685 #endif
1686  }
1687  }
1688  else
1689  result = strcoll(a1p, a2p);
1690 
1691  /* Break tie if necessary. */
1692  if (result == 0 &&
1693  (!mylocale || mylocale->deterministic))
1694  result = strcmp(a1p, a2p);
1695 
1696  if (a1p != a1buf)
1697  pfree(a1p);
1698  if (a2p != a2buf)
1699  pfree(a2p);
1700  }
1701 
1702  return result;
1703 }
1704 
1705 /* text_cmp()
1706  * Internal comparison function for text strings.
1707  * Returns -1, 0 or 1
1708  */
1709 static int
1710 text_cmp(text *arg1, text *arg2, Oid collid)
1711 {
1712  char *a1p,
1713  *a2p;
1714  int len1,
1715  len2;
1716 
1717  a1p = VARDATA_ANY(arg1);
1718  a2p = VARDATA_ANY(arg2);
1719 
1720  len1 = VARSIZE_ANY_EXHDR(arg1);
1721  len2 = VARSIZE_ANY_EXHDR(arg2);
1722 
1723  return varstr_cmp(a1p, len1, a2p, len2, collid);
1724 }
1725 
1726 /*
1727  * Comparison functions for text strings.
1728  *
1729  * Note: btree indexes need these routines not to leak memory; therefore,
1730  * be careful to free working copies of toasted datums. Most places don't
1731  * need to be so careful.
1732  */
1733 
1734 Datum
1736 {
1737  Oid collid = PG_GET_COLLATION();
1738  bool result;
1739 
1740  check_collation_set(collid);
1741 
1742  if (lc_collate_is_c(collid) ||
1743  collid == DEFAULT_COLLATION_OID ||
1744  pg_newlocale_from_collation(collid)->deterministic)
1745  {
1746  Datum arg1 = PG_GETARG_DATUM(0);
1747  Datum arg2 = PG_GETARG_DATUM(1);
1748  Size len1,
1749  len2;
1750 
1751  /*
1752  * Since we only care about equality or not-equality, we can avoid all
1753  * the expense of strcoll() here, and just do bitwise comparison. In
1754  * fact, we don't even have to do a bitwise comparison if we can show
1755  * the lengths of the strings are unequal; which might save us from
1756  * having to detoast one or both values.
1757  */
1758  len1 = toast_raw_datum_size(arg1);
1759  len2 = toast_raw_datum_size(arg2);
1760  if (len1 != len2)
1761  result = false;
1762  else
1763  {
1764  text *targ1 = DatumGetTextPP(arg1);
1765  text *targ2 = DatumGetTextPP(arg2);
1766 
1767  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1768  len1 - VARHDRSZ) == 0);
1769 
1770  PG_FREE_IF_COPY(targ1, 0);
1771  PG_FREE_IF_COPY(targ2, 1);
1772  }
1773  }
1774  else
1775  {
1776  text *arg1 = PG_GETARG_TEXT_PP(0);
1777  text *arg2 = PG_GETARG_TEXT_PP(1);
1778 
1779  result = (text_cmp(arg1, arg2, collid) == 0);
1780 
1781  PG_FREE_IF_COPY(arg1, 0);
1782  PG_FREE_IF_COPY(arg2, 1);
1783  }
1784 
1785  PG_RETURN_BOOL(result);
1786 }
1787 
1788 Datum
1790 {
1791  Oid collid = PG_GET_COLLATION();
1792  bool result;
1793 
1794  check_collation_set(collid);
1795 
1796  if (lc_collate_is_c(collid) ||
1797  collid == DEFAULT_COLLATION_OID ||
1798  pg_newlocale_from_collation(collid)->deterministic)
1799  {
1800  Datum arg1 = PG_GETARG_DATUM(0);
1801  Datum arg2 = PG_GETARG_DATUM(1);
1802  Size len1,
1803  len2;
1804 
1805  /* See comment in texteq() */
1806  len1 = toast_raw_datum_size(arg1);
1807  len2 = toast_raw_datum_size(arg2);
1808  if (len1 != len2)
1809  result = true;
1810  else
1811  {
1812  text *targ1 = DatumGetTextPP(arg1);
1813  text *targ2 = DatumGetTextPP(arg2);
1814 
1815  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1816  len1 - VARHDRSZ) != 0);
1817 
1818  PG_FREE_IF_COPY(targ1, 0);
1819  PG_FREE_IF_COPY(targ2, 1);
1820  }
1821  }
1822  else
1823  {
1824  text *arg1 = PG_GETARG_TEXT_PP(0);
1825  text *arg2 = PG_GETARG_TEXT_PP(1);
1826 
1827  result = (text_cmp(arg1, arg2, collid) != 0);
1828 
1829  PG_FREE_IF_COPY(arg1, 0);
1830  PG_FREE_IF_COPY(arg2, 1);
1831  }
1832 
1833  PG_RETURN_BOOL(result);
1834 }
1835 
1836 Datum
1838 {
1839  text *arg1 = PG_GETARG_TEXT_PP(0);
1840  text *arg2 = PG_GETARG_TEXT_PP(1);
1841  bool result;
1842 
1843  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1844 
1845  PG_FREE_IF_COPY(arg1, 0);
1846  PG_FREE_IF_COPY(arg2, 1);
1847 
1848  PG_RETURN_BOOL(result);
1849 }
1850 
1851 Datum
1853 {
1854  text *arg1 = PG_GETARG_TEXT_PP(0);
1855  text *arg2 = PG_GETARG_TEXT_PP(1);
1856  bool result;
1857 
1858  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1859 
1860  PG_FREE_IF_COPY(arg1, 0);
1861  PG_FREE_IF_COPY(arg2, 1);
1862 
1863  PG_RETURN_BOOL(result);
1864 }
1865 
1866 Datum
1868 {
1869  text *arg1 = PG_GETARG_TEXT_PP(0);
1870  text *arg2 = PG_GETARG_TEXT_PP(1);
1871  bool result;
1872 
1873  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1874 
1875  PG_FREE_IF_COPY(arg1, 0);
1876  PG_FREE_IF_COPY(arg2, 1);
1877 
1878  PG_RETURN_BOOL(result);
1879 }
1880 
1881 Datum
1883 {
1884  text *arg1 = PG_GETARG_TEXT_PP(0);
1885  text *arg2 = PG_GETARG_TEXT_PP(1);
1886  bool result;
1887 
1888  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1889 
1890  PG_FREE_IF_COPY(arg1, 0);
1891  PG_FREE_IF_COPY(arg2, 1);
1892 
1893  PG_RETURN_BOOL(result);
1894 }
1895 
1896 Datum
1898 {
1899  Datum arg1 = PG_GETARG_DATUM(0);
1900  Datum arg2 = PG_GETARG_DATUM(1);
1901  Oid collid = PG_GET_COLLATION();
1902  pg_locale_t mylocale = 0;
1903  bool result;
1904  Size len1,
1905  len2;
1906 
1907  check_collation_set(collid);
1908 
1909  if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1910  mylocale = pg_newlocale_from_collation(collid);
1911 
1912  if (mylocale && !mylocale->deterministic)
1913  ereport(ERROR,
1914  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1915  errmsg("nondeterministic collations are not supported for substring searches")));
1916 
1917  len1 = toast_raw_datum_size(arg1);
1918  len2 = toast_raw_datum_size(arg2);
1919  if (len2 > len1)
1920  result = false;
1921  else
1922  {
1923  text *targ1 = text_substring(arg1, 1, len2, false);
1924  text *targ2 = DatumGetTextPP(arg2);
1925 
1926  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1927  VARSIZE_ANY_EXHDR(targ2)) == 0);
1928 
1929  PG_FREE_IF_COPY(targ1, 0);
1930  PG_FREE_IF_COPY(targ2, 1);
1931  }
1932 
1933  PG_RETURN_BOOL(result);
1934 }
1935 
1936 Datum
1938 {
1939  text *arg1 = PG_GETARG_TEXT_PP(0);
1940  text *arg2 = PG_GETARG_TEXT_PP(1);
1941  int32 result;
1942 
1943  result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1944 
1945  PG_FREE_IF_COPY(arg1, 0);
1946  PG_FREE_IF_COPY(arg2, 1);
1947 
1948  PG_RETURN_INT32(result);
1949 }
1950 
1951 Datum
1953 {
1955  Oid collid = ssup->ssup_collation;
1956  MemoryContext oldcontext;
1957 
1958  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1959 
1960  /* Use generic string SortSupport */
1961  varstr_sortsupport(ssup, TEXTOID, collid);
1962 
1963  MemoryContextSwitchTo(oldcontext);
1964 
1965  PG_RETURN_VOID();
1966 }
1967 
1968 /*
1969  * Generic sortsupport interface for character type's operator classes.
1970  * Includes locale support, and support for BpChar semantics (i.e. removing
1971  * trailing spaces before comparison).
1972  *
1973  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1974  * same representation. Callers that always use the C collation (e.g.
1975  * non-collatable type callers like bytea) may have NUL bytes in their strings;
1976  * this will not work with any other collation, though.
1977  */
1978 void
1980 {
1981  bool abbreviate = ssup->abbreviate;
1982  bool collate_c = false;
1983  VarStringSortSupport *sss;
1984  pg_locale_t locale = 0;
1985 
1986  check_collation_set(collid);
1987 
1988  /*
1989  * If possible, set ssup->comparator to a function which can be used to
1990  * directly compare two datums. If we can do this, we'll avoid the
1991  * overhead of a trip through the fmgr layer for every comparison, which
1992  * can be substantial.
1993  *
1994  * Most typically, we'll set the comparator to varlenafastcmp_locale,
1995  * which uses strcoll() to perform comparisons. We use that for the
1996  * BpChar case too, but type NAME uses namefastcmp_locale. However, if
1997  * LC_COLLATE = C, we can make things quite a bit faster with
1998  * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
1999  * memcmp() rather than strcoll().
2000  */
2001  if (lc_collate_is_c(collid))
2002  {
2003  if (typid == BPCHAROID)
2004  ssup->comparator = bpcharfastcmp_c;
2005  else if (typid == NAMEOID)
2006  {
2007  ssup->comparator = namefastcmp_c;
2008  /* Not supporting abbreviation with type NAME, for now */
2009  abbreviate = false;
2010  }
2011  else
2012  ssup->comparator = varstrfastcmp_c;
2013 
2014  collate_c = true;
2015  }
2016  else
2017  {
2018  /*
2019  * We need a collation-sensitive comparison. To make things faster,
2020  * we'll figure out the collation based on the locale id and cache the
2021  * result.
2022  */
2023  if (collid != DEFAULT_COLLATION_OID)
2024  locale = pg_newlocale_from_collation(collid);
2025 
2026  /*
2027  * There is a further exception on Windows. When the database
2028  * encoding is UTF-8 and we are not using the C collation, complex
2029  * hacks are required. We don't currently have a comparator that
2030  * handles that case, so we fall back on the slow method of having the
2031  * sort code invoke bttextcmp() (in the case of text) via the fmgr
2032  * trampoline. ICU locales work just the same on Windows, however.
2033  */
2034 #ifdef WIN32
2035  if (GetDatabaseEncoding() == PG_UTF8 &&
2036  !(locale && locale->provider == COLLPROVIDER_ICU))
2037  return;
2038 #endif
2039 
2040  /*
2041  * We use varlenafastcmp_locale except for type NAME.
2042  */
2043  if (typid == NAMEOID)
2044  {
2046  /* Not supporting abbreviation with type NAME, for now */
2047  abbreviate = false;
2048  }
2049  else
2051  }
2052 
2053  /*
2054  * Unfortunately, it seems that abbreviation for non-C collations is
2055  * broken on many common platforms; testing of multiple versions of glibc
2056  * reveals that, for many locales, strcoll() and strxfrm() do not return
2057  * consistent results, which is fatal to this optimization. While no
2058  * other libc other than Cygwin has so far been shown to have a problem,
2059  * we take the conservative course of action for right now and disable
2060  * this categorically. (Users who are certain this isn't a problem on
2061  * their system can define TRUST_STRXFRM.)
2062  *
2063  * Even apart from the risk of broken locales, it's possible that there
2064  * are platforms where the use of abbreviated keys should be disabled at
2065  * compile time. Having only 4 byte datums could make worst-case
2066  * performance drastically more likely, for example. Moreover, macOS's
2067  * strxfrm() implementation is known to not effectively concentrate a
2068  * significant amount of entropy from the original string in earlier
2069  * transformed blobs. It's possible that other supported platforms are
2070  * similarly encumbered. So, if we ever get past disabling this
2071  * categorically, we may still want or need to disable it for particular
2072  * platforms.
2073  */
2074 #ifndef TRUST_STRXFRM
2075  if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2076  abbreviate = false;
2077 #endif
2078 
2079  /*
2080  * If we're using abbreviated keys, or if we're using a locale-aware
2081  * comparison, we need to initialize a VarStringSortSupport object. Both
2082  * cases will make use of the temporary buffers we initialize here for
2083  * scratch space (and to detect requirement for BpChar semantics from
2084  * caller), and the abbreviation case requires additional state.
2085  */
2086  if (abbreviate || !collate_c)
2087  {
2088  sss = palloc(sizeof(VarStringSortSupport));
2089  sss->buf1 = palloc(TEXTBUFLEN);
2090  sss->buflen1 = TEXTBUFLEN;
2091  sss->buf2 = palloc(TEXTBUFLEN);
2092  sss->buflen2 = TEXTBUFLEN;
2093  /* Start with invalid values */
2094  sss->last_len1 = -1;
2095  sss->last_len2 = -1;
2096  /* Initialize */
2097  sss->last_returned = 0;
2098  sss->locale = locale;
2099 
2100  /*
2101  * To avoid somehow confusing a strxfrm() blob and an original string,
2102  * constantly keep track of the variety of data that buf1 and buf2
2103  * currently contain.
2104  *
2105  * Comparisons may be interleaved with conversion calls. Frequently,
2106  * conversions and comparisons are batched into two distinct phases,
2107  * but the correctness of caching cannot hinge upon this. For
2108  * comparison caching, buffer state is only trusted if cache_blob is
2109  * found set to false, whereas strxfrm() caching only trusts the state
2110  * when cache_blob is found set to true.
2111  *
2112  * Arbitrarily initialize cache_blob to true.
2113  */
2114  sss->cache_blob = true;
2115  sss->collate_c = collate_c;
2116  sss->typid = typid;
2117  ssup->ssup_extra = sss;
2118 
2119  /*
2120  * If possible, plan to use the abbreviated keys optimization. The
2121  * core code may switch back to authoritative comparator should
2122  * abbreviation be aborted.
2123  */
2124  if (abbreviate)
2125  {
2126  sss->prop_card = 0.20;
2127  initHyperLogLog(&sss->abbr_card, 10);
2128  initHyperLogLog(&sss->full_card, 10);
2129  ssup->abbrev_full_comparator = ssup->comparator;
2130  ssup->comparator = varstrcmp_abbrev;
2133  }
2134  }
2135 }
2136 
2137 /*
2138  * sortsupport comparison func (for C locale case)
2139  */
2140 static int
2142 {
2143  VarString *arg1 = DatumGetVarStringPP(x);
2144  VarString *arg2 = DatumGetVarStringPP(y);
2145  char *a1p,
2146  *a2p;
2147  int len1,
2148  len2,
2149  result;
2150 
2151  a1p = VARDATA_ANY(arg1);
2152  a2p = VARDATA_ANY(arg2);
2153 
2154  len1 = VARSIZE_ANY_EXHDR(arg1);
2155  len2 = VARSIZE_ANY_EXHDR(arg2);
2156 
2157  result = memcmp(a1p, a2p, Min(len1, len2));
2158  if ((result == 0) && (len1 != len2))
2159  result = (len1 < len2) ? -1 : 1;
2160 
2161  /* We can't afford to leak memory here. */
2162  if (PointerGetDatum(arg1) != x)
2163  pfree(arg1);
2164  if (PointerGetDatum(arg2) != y)
2165  pfree(arg2);
2166 
2167  return result;
2168 }
2169 
2170 /*
2171  * sortsupport comparison func (for BpChar C locale case)
2172  *
2173  * BpChar outsources its sortsupport to this module. Specialization for the
2174  * varstr_sortsupport BpChar case, modeled on
2175  * internal_bpchar_pattern_compare().
2176  */
2177 static int
2179 {
2180  BpChar *arg1 = DatumGetBpCharPP(x);
2181  BpChar *arg2 = DatumGetBpCharPP(y);
2182  char *a1p,
2183  *a2p;
2184  int len1,
2185  len2,
2186  result;
2187 
2188  a1p = VARDATA_ANY(arg1);
2189  a2p = VARDATA_ANY(arg2);
2190 
2191  len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2192  len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2193 
2194  result = memcmp(a1p, a2p, Min(len1, len2));
2195  if ((result == 0) && (len1 != len2))
2196  result = (len1 < len2) ? -1 : 1;
2197 
2198  /* We can't afford to leak memory here. */
2199  if (PointerGetDatum(arg1) != x)
2200  pfree(arg1);
2201  if (PointerGetDatum(arg2) != y)
2202  pfree(arg2);
2203 
2204  return result;
2205 }
2206 
2207 /*
2208  * sortsupport comparison func (for NAME C locale case)
2209  */
2210 static int
2212 {
2213  Name arg1 = DatumGetName(x);
2214  Name arg2 = DatumGetName(y);
2215 
2216  return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2217 }
2218 
2219 /*
2220  * sortsupport comparison func (for locale case with all varlena types)
2221  */
2222 static int
2224 {
2225  VarString *arg1 = DatumGetVarStringPP(x);
2226  VarString *arg2 = DatumGetVarStringPP(y);
2227  char *a1p,
2228  *a2p;
2229  int len1,
2230  len2,
2231  result;
2232 
2233  a1p = VARDATA_ANY(arg1);
2234  a2p = VARDATA_ANY(arg2);
2235 
2236  len1 = VARSIZE_ANY_EXHDR(arg1);
2237  len2 = VARSIZE_ANY_EXHDR(arg2);
2238 
2239  result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2240 
2241  /* We can't afford to leak memory here. */
2242  if (PointerGetDatum(arg1) != x)
2243  pfree(arg1);
2244  if (PointerGetDatum(arg2) != y)
2245  pfree(arg2);
2246 
2247  return result;
2248 }
2249 
2250 /*
2251  * sortsupport comparison func (for locale case with NAME type)
2252  */
2253 static int
2255 {
2256  Name arg1 = DatumGetName(x);
2257  Name arg2 = DatumGetName(y);
2258 
2259  return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2260  NameStr(*arg2), strlen(NameStr(*arg2)),
2261  ssup);
2262 }
2263 
2264 /*
2265  * sortsupport comparison func for locale cases
2266  */
/*
 * Compares two byte strings (a1p/len1 and a2p/len2) using the collation
 * state reachable through "sss", and returns the comparison result.
 *
 * Outline, as visible in the body below:
 *  - byte-identical inputs return 0 immediately, leaving the caches alone;
 *  - for BPCHAROID, both lengths are first reduced with bpchartruelen();
 *  - buf1/buf2 are re-allocated in ssup->ssup_cxt when they are too small
 *    to hold the string plus a terminating NUL;
 *  - each input is copied into its buffer unless it already matches what
 *    is cached there; if both match and cache_blob is false, last_returned
 *    is reused;
 *  - otherwise the strings are compared through ICU, strcoll_l() or
 *    strcoll(), and a zero result is tie-broken with strcmp() unless the
 *    locale is marked non-deterministic.
 *
 * NOTE(review): "sss" is used throughout but its declaration is absent from
 * this listing (the embedded numbering jumps from 2269 to 2271).  It is
 * presumably derived from ssup -- confirm against the original file.
 */
2267 static int
2268 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2269 {
2271  int result;
2272  bool arg1_match;
2273 
2274  /* Fast pre-check for equality, as discussed in varstr_cmp() */
2275  if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2276  {
2277  /*
2278  * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2279  * last_len2. Existing contents of buffers might still be used by
2280  * next call.
2281  *
2282  * It's fine to allow the comparison of BpChar padding bytes here,
2283  * even though that implies that the memcmp() will usually be
2284  * performed for BpChar callers (though multibyte characters could
2285  * still prevent that from occurring). The memcmp() is still very
2286  * cheap, and BpChar's funny semantics have us remove trailing spaces
2287  * (not limited to padding), so we need make no distinction between
2288  * padding space characters and "real" space characters.
2289  */
2290  return 0;
2291  }
2292 
2293  if (sss->typid == BPCHAROID)
2294  {
2295  /* Get true number of bytes, ignoring trailing spaces */
2296  len1 = bpchartruelen(a1p, len1);
2297  len2 = bpchartruelen(a2p, len2);
2298  }
2299 
  /* Grow each buffer (at least doubling, capped at MaxAllocSize) if needed */
2300  if (len1 >= sss->buflen1)
2301  {
2302  pfree(sss->buf1);
2303  sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2304  sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2305  }
2306  if (len2 >= sss->buflen2)
2307  {
2308  pfree(sss->buf2);
2309  sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2310  sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2311  }
2312 
2313  /*
2314  * We're likely to be asked to compare the same strings repeatedly, and
2315  * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2316  * comparisons, even though in general there is no reason to think that
2317  * that will work out (every string datum may be unique). Caching does
2318  * not slow things down measurably when it doesn't work out, and can speed
2319  * things up by rather a lot when it does. In part, this is because the
2320  * memcmp() compares data from cachelines that are needed in L1 cache even
2321  * when the last comparison's result cannot be reused.
2322  */
2323  arg1_match = true;
2324  if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2325  {
2326  arg1_match = false;
2327  memcpy(sss->buf1, a1p, len1);
2328  sss->buf1[len1] = '\0';
2329  sss->last_len1 = len1;
2330  }
2331 
2332  /*
2333  * If we're comparing the same two strings as last time, we can return the
2334  * same answer without calling strcoll() again. This is more likely than
2335  * it seems (at least with moderate to low cardinality sets), because
2336  * quicksort compares the same pivot against many values.
2337  */
2338  if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2339  {
2340  memcpy(sss->buf2, a2p, len2);
2341  sss->buf2[len2] = '\0';
2342  sss->last_len2 = len2;
2343  }
2344  else if (arg1_match && !sss->cache_blob)
2345  {
2346  /* Use result cached following last actual strcoll() call */
2347  return sss->last_returned;
2348  }
2349 
2350  if (sss->locale)
2351  {
2352  if (sss->locale->provider == COLLPROVIDER_ICU)
2353  {
2354 #ifdef USE_ICU
2355 #ifdef HAVE_UCOL_STRCOLLUTF8
2356  if (GetDatabaseEncoding() == PG_UTF8)
2357  {
2358  UErrorCode status;
2359 
2360  status = U_ZERO_ERROR;
2361  result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
2362  a1p, len1,
2363  a2p, len2,
2364  &status);
2365  if (U_FAILURE(status))
2366  ereport(ERROR,
2367  (errmsg("collation failed: %s", u_errorName(status))));
2368  }
2369  else
2370 #endif
2371  {
  /* Convert both inputs to UChar strings, compare, then free them */
2372  int32_t ulen1,
2373  ulen2;
2374  UChar *uchar1,
2375  *uchar2;
2376 
2377  ulen1 = icu_to_uchar(&uchar1, a1p, len1);
2378  ulen2 = icu_to_uchar(&uchar2, a2p, len2);
2379 
2380  result = ucol_strcoll(sss->locale->info.icu.ucol,
2381  uchar1, ulen1,
2382  uchar2, ulen2);
2383 
2384  pfree(uchar1);
2385  pfree(uchar2);
2386  }
2387 #else /* not USE_ICU */
2388  /* shouldn't happen */
2389  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2390 #endif /* not USE_ICU */
2391  }
2392  else
2393  {
2394 #ifdef HAVE_LOCALE_T
2395  result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
2396 #else
2397  /* shouldn't happen */
2398  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2399 #endif
2400  }
2401  }
2402  else
2403  result = strcoll(sss->buf1, sss->buf2);
2404 
2405  /* Break tie if necessary. */
2406  if (result == 0 &&
2407  (!sss->locale || sss->locale->deterministic))
2408  result = strcmp(sss->buf1, sss->buf2);
2409 
2410  /* Cache result, perhaps saving an expensive strcoll() call next time */
2411  sss->cache_blob = false;
2412  sss->last_returned = result;
2413  return result;
2414 }
2415 
2416 /*
2417  * Abbreviated key comparison func
2418  */
/*
 * Three-way comparison of x and y using the plain > and == operators on the
 * Datum values themselves: returns 1, 0 or -1.
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 2419 to 2421).  The body only reads x and
 * y -- confirm the full parameter list against the original file.
 */
2419 static int
2421 {
2422  /*
2423  * When 0 is returned, the core system will call varstrfastcmp_c()
2424  * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale(). Even a
2425  * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2426  * authoritatively, for the same reason that there is a strcoll()
2427  * tie-breaker call to strcmp() in varstr_cmp().
2428  */
2429  if (x > y)
2430  return 1;
2431  else if (x == y)
2432  return 0;
2433  else
2434  return -1;
2435 }
2436 
2437 /*
2438  * Conversion routine for sortsupport. Converts original to abbreviated key
2439  * representation. Our encoding strategy is simple -- pack the first 8 bytes
2440  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2441  * stored in reverse order), and treat it as an unsigned integer. When the "C"
2442  * locale is used, or in case of bytea, just memcpy() from original instead.
2443  */
/*
 * Steps visible in the body below:
 *  - detoast "original" and zero the result Datum;
 *  - for BPCHAROID, strip trailing spaces from the length;
 *  - either memcpy() up to sizeof(Datum) bytes of the raw string
 *    (collate_c), or compute a strxfrm()/ICU sort key into buf2 and copy
 *    its leading bytes;
 *  - unless the blob cached by the previous call was reused (the "goto
 *    done" path), hash both the original string and the abbreviated key
 *    into the HyperLogLog states full_card and abbr_card;
 *  - convert the result with DatumBigEndianToNative() and free any
 *    detoasted copy.
 *
 * NOTE(review): three lines are absent from this listing, judging by gaps
 * in the embedded numbering: the declarator (2445), a declaration at 2447
 * (presumably that of "sss"), and the tail of the condition at 2537
 * (presumably the non-UTF8 encoding test described by the comment above
 * it).  Restore them from the original file.
 */
2444 static Datum
2446 {
2448  VarString *authoritative = DatumGetVarStringPP(original);
2449  char *authoritative_data = VARDATA_ANY(authoritative);
2450 
2451  /* working state */
2452  Datum res;
2453  char *pres;
2454  int len;
2455  uint32 hash;
2456 
2457  pres = (char *) &res;
2458  /* memset(), so any non-overwritten bytes are NUL */
2459  memset(pres, 0, sizeof(Datum));
2460  len = VARSIZE_ANY_EXHDR(authoritative);
2461 
2462  /* Get number of bytes, ignoring trailing spaces */
2463  if (sss->typid == BPCHAROID)
2464  len = bpchartruelen(authoritative_data, len);
2465 
2466  /*
2467  * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2468  * abbreviate keys. The full comparator for the C locale is always
2469  * memcmp(). It would be incorrect to allow bytea callers (callers that
2470  * always force the C collation -- bytea isn't a collatable type, but this
2471  * approach is convenient) to use strxfrm(). This is because bytea
2472  * strings may contain NUL bytes. Besides, this should be faster, too.
2473  *
2474  * More generally, it's okay that bytea callers can have NUL bytes in
2475  * strings because varstrcmp_abbrev() need not make a distinction between
2476  * terminating NUL bytes, and NUL bytes representing actual NULs in the
2477  * authoritative representation. Hopefully a comparison at or past one
2478  * abbreviated key's terminating NUL byte will resolve the comparison
2479  * without consulting the authoritative representation; specifically, some
2480  * later non-NUL byte in the longer string can resolve the comparison
2481  * against a subsequent terminating NUL in the shorter string. There will
2482  * usually be what is effectively a "length-wise" resolution there and
2483  * then.
2484  *
2485  * If that doesn't work out -- if all bytes in the longer string
2486  * positioned at or past the offset of the smaller string's (first)
2487  * terminating NUL are actually representative of NUL bytes in the
2488  * authoritative binary string (perhaps with some *terminating* NUL bytes
2489  * towards the end of the longer string iff it happens to still be small)
2490  * -- then an authoritative tie-breaker will happen, and do the right
2491  * thing: explicitly consider string length.
2492  */
2493  if (sss->collate_c)
2494  memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2495  else
2496  {
2497  Size bsize;
2498 #ifdef USE_ICU
2499  int32_t ulen = -1;
2500  UChar *uchar = NULL;
2501 #endif
2502 
2503  /*
2504  * We're not using the C collation, so fall back on strxfrm or ICU
2505  * analogs.
2506  */
2507 
2508  /* By convention, we use buffer 1 to store and NUL-terminate */
2509  if (len >= sss->buflen1)
2510  {
2511  pfree(sss->buf1);
2512  sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2513  sss->buf1 = palloc(sss->buflen1);
2514  }
2515 
2516  /* Might be able to reuse strxfrm() blob from last call */
2517  if (sss->last_len1 == len && sss->cache_blob &&
2518  memcmp(sss->buf1, authoritative_data, len) == 0)
2519  {
2520  memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2521  /* No change affecting cardinality, so no hashing required */
2522  goto done;
2523  }
2524 
2525  memcpy(sss->buf1, authoritative_data, len);
2526 
2527  /*
2528  * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2529  * necessary for ICU, but doesn't hurt.
2530  */
2531  sss->buf1[len] = '\0';
2532  sss->last_len1 = len;
2533 
2534 #ifdef USE_ICU
2535  /* When using ICU and not UTF8, convert string to UChar. */
2536  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2538  ulen = icu_to_uchar(&uchar, sss->buf1, len);
2539 #endif
2540 
2541  /*
2542  * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2543  * and try again. Both of these functions have the result buffer
2544  * content undefined if the result did not fit, so we need to retry
2545  * until everything fits, even though we only need the first few bytes
2546  * in the end. When using ucol_nextSortKeyPart(), however, we only
2547  * ask for as many bytes as we actually need.
2548  */
2549  for (;;)
2550  {
2551 #ifdef USE_ICU
2552  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2553  {
2554  /*
2555  * When using UTF8, use the iteration interface so we only
2556  * need to produce as many bytes as we actually need.
2557  */
2558  if (GetDatabaseEncoding() == PG_UTF8)
2559  {
2560  UCharIterator iter;
2561  uint32_t state[2];
2562  UErrorCode status;
2563 
2564  uiter_setUTF8(&iter, sss->buf1, len);
2565  state[0] = state[1] = 0; /* won't need that again */
2566  status = U_ZERO_ERROR;
2567  bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2568  &iter,
2569  state,
2570  (uint8_t *) sss->buf2,
2571  Min(sizeof(Datum), sss->buflen2),
2572  &status);
2573  if (U_FAILURE(status))
2574  ereport(ERROR,
2575  (errmsg("sort key generation failed: %s",
2576  u_errorName(status))));
2577  }
2578  else
2579  bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2580  uchar, ulen,
2581  (uint8_t *) sss->buf2, sss->buflen2);
2582  }
2583  else
2584 #endif
2585 #ifdef HAVE_LOCALE_T
2586  if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2587  bsize = strxfrm_l(sss->buf2, sss->buf1,
2588  sss->buflen2, sss->locale->info.lt);
2589  else
2590 #endif
2591  bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2592 
  /* Result fits when it is strictly shorter than the buffer */
2593  sss->last_len2 = bsize;
2594  if (bsize < sss->buflen2)
2595  break;
2596 
2597  /*
2598  * Grow buffer and retry.
2599  */
2600  pfree(sss->buf2);
2601  sss->buflen2 = Max(bsize + 1,
2602  Min(sss->buflen2 * 2, MaxAllocSize));
2603  sss->buf2 = palloc(sss->buflen2);
2604  }
2605 
2606  /*
2607  * Every Datum byte is always compared. This is safe because the
2608  * strxfrm() blob is itself NUL terminated, leaving no danger of
2609  * misinterpreting any NUL bytes not intended to be interpreted as
2610  * logically representing termination.
2611  *
2612  * (Actually, even if there were NUL bytes in the blob it would be
2613  * okay. See remarks on bytea case above.)
2614  */
2615  memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2616 
2617 #ifdef USE_ICU
2618  if (uchar)
2619  pfree(uchar);
2620 #endif
2621  }
2622 
2623  /*
2624  * Maintain approximate cardinality of both abbreviated keys and original,
2625  * authoritative keys using HyperLogLog. Used as cheap insurance against
2626  * the worst case, where we do many string transformations for no saving
2627  * in full strcoll()-based comparisons. These statistics are used by
2628  * varstr_abbrev_abort().
2629  *
2630  * First, Hash key proper, or a significant fraction of it. Mix in length
2631  * in order to compensate for cases where differences are past
2632  * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2633  */
2634  hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2635  Min(len, PG_CACHE_LINE_SIZE)));
2636 
2637  if (len > PG_CACHE_LINE_SIZE)
2638  hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2639 
2640  addHyperLogLog(&sss->full_card, hash);
2641 
2642  /* Hash abbreviated key */
2643 #if SIZEOF_DATUM == 8
2644  {
  /* Fold the 64-bit key to 32 bits by XORing its two halves */
2645  uint32 lohalf,
2646  hihalf;
2647 
2648  lohalf = (uint32) res;
2649  hihalf = (uint32) (res >> 32);
2650  hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2651  }
2652 #else /* SIZEOF_DATUM != 8 */
2653  hash = DatumGetUInt32(hash_uint32((uint32) res));
2654 #endif
2655 
2656  addHyperLogLog(&sss->abbr_card, hash);
2657 
2658  /* Cache result, perhaps saving an expensive strxfrm() call next time */
2659  sss->cache_blob = true;
2660 done:
2661 
2662  /*
2663  * Byteswap on little-endian machines.
2664  *
2665  * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2666  * comparator) works correctly on all platforms. If we didn't do this,
2667  * the comparator would have to call memcmp() with a pair of pointers to
2668  * the first byte of each abbreviated key, which is slower.
2669  */
2670  res = DatumBigEndianToNative(res);
2671 
2672  /* Don't leak memory here */
2673  if (PointerGetDatum(authoritative) != original)
2674  pfree(authoritative);
2675 
2676  return res;
2677 }
2678 
2679 /*
2680  * Callback for estimating effectiveness of abbreviated key optimization, using
2681  * heuristic rules. Returns value indicating if the abbreviation optimization
2682  * should be aborted, based on its projected effectiveness.
2683  */
/*
 * Returns false (keep abbreviating) while memtupcount is below 100, or
 * while the estimated number of distinct abbreviated keys exceeds
 * key_distinct * prop_card; otherwise returns true.  On a non-aborting
 * call with memtupcount above 10000, prop_card is multiplied by 0.65 for
 * the next call.
 *
 * NOTE(review): "sss" is used throughout but its declaration is absent from
 * this listing (the embedded numbering jumps from 2686 to 2688).  It is
 * presumably derived from ssup -- confirm against the original file.
 */
2684 static bool
2685 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2686 {
2688  double abbrev_distinct,
2689  key_distinct;
2690 
2691  Assert(ssup->abbreviate);
2692 
2693  /* Have a little patience */
2694  if (memtupcount < 100)
2695  return false;
2696 
2697  abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2698  key_distinct = estimateHyperLogLog(&sss->full_card);
2699 
2700  /*
2701  * Clamp cardinality estimates to at least one distinct value. While
2702  * NULLs are generally disregarded, if only NULL values were seen so far,
2703  * that might misrepresent costs if we failed to clamp.
2704  */
2705  if (abbrev_distinct <= 1.0)
2706  abbrev_distinct = 1.0;
2707 
2708  if (key_distinct <= 1.0)
2709  key_distinct = 1.0;
2710 
2711  /*
2712  * In the worst case all abbreviated keys are identical, while at the same
2713  * time there are differences within full key strings not captured in
2714  * abbreviations.
2715  */
2716 #ifdef TRACE_SORT
2717  if (trace_sort)
2718  {
2719  double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2720 
2721  elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2722  "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2723  memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2724  sss->prop_card);
2725  }
2726 #endif
2727 
2728  /*
2729  * If the number of distinct abbreviated keys approximately matches the
2730  * number of distinct authoritative original keys, that's reason enough to
2731  * proceed. We can win even with a very low cardinality set if most
2732  * tie-breakers only memcmp(). This is by far the most important
2733  * consideration.
2734  *
2735  * While comparisons that are resolved at the abbreviated key level are
2736  * considerably cheaper than tie-breakers resolved with memcmp(), both of
2737  * those two outcomes are so much cheaper than a full strcoll() once
2738  * sorting is underway that it doesn't seem worth it to weigh abbreviated
2739  * cardinality against the overall size of the set in order to more
2740  * accurately model costs. Assume that an abbreviated comparison, and an
2741  * abbreviated comparison with a cheap memcmp()-based authoritative
2742  * resolution are equivalent.
2743  */
2744  if (abbrev_distinct > key_distinct * sss->prop_card)
2745  {
2746  /*
2747  * When we have exceeded 10,000 tuples, decay required cardinality
2748  * aggressively for next call.
2749  *
2750  * This is useful because the number of comparisons required on
2751  * average increases at a linearithmic rate, and at roughly 10,000
2752  * tuples that factor will start to dominate over the linear costs of
2753  * string transformation (this is a conservative estimate). The decay
2754  * rate is chosen to be a little less aggressive than halving -- which
2755  * (since we're called at points at which memtupcount has doubled)
2756  * would never see the cost model actually abort past the first call
2757  * following a decay. This decay rate is mostly a precaution against
2758  * a sudden, violent swing in how well abbreviated cardinality tracks
2759  * full key cardinality. The decay also serves to prevent a marginal
2760  * case from being aborted too late, when too much has already been
2761  * invested in string transformation.
2762  *
2763  * It's possible for sets of several million distinct strings with
2764  * mere tens of thousands of distinct abbreviated keys to still
2765  * benefit very significantly. This will generally occur provided
2766  * each abbreviated key is a proxy for a roughly uniform number of the
2767  * set's full keys. If it isn't so, we hope to catch that early and
2768  * abort. If it isn't caught early, by the time the problem is
2769  * apparent it's probably not worth aborting.
2770  */
2771  if (memtupcount > 10000)
2772  sss->prop_card *= 0.65;
2773 
2774  return false;
2775  }
2776 
2777  /*
2778  * Abort abbreviation strategy.
2779  *
2780  * The worst case, where all abbreviated keys are identical while all
2781  * original strings differ will typically only see a regression of about
2782  * 10% in execution time for small to medium sized lists of strings.
2783  * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2784  * often expect very large improvements, particularly with sets of strings
2785  * of moderately high to high abbreviated cardinality. There is little to
2786  * lose but much to gain, which our strategy reflects.
2787  */
2788 #ifdef TRACE_SORT
2789  if (trace_sort)
2790  elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2791  "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2792  memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2793 #endif
2794 
2795  return true;
2796 }
2797 
2798 /*
2799  * Generic equalimage support function for character type's operator classes.
2800  * Disables the use of deduplication with nondeterministic collations.
2801  */
/*
 * Returns true when the call's collation satisfies any operand of the ||
 * chain below (C collation, the default collation, ...), false otherwise.
 * check_collation_set() is called on the collation first.
 *
 * NOTE(review): two lines are absent from this listing, judging by gaps in
 * the embedded numbering: the declarator (2803) and the final operand of
 * the || chain (2812), presumably a determinism test on collid given the
 * header comment.  Restore them from the original file.
 */
2802 Datum
2804 {
2805  /* Oid opcintype = PG_GETARG_OID(0); */
2806  Oid collid = PG_GET_COLLATION();
2807 
2808  check_collation_set(collid);
2809 
2810  if (lc_collate_is_c(collid) ||
2811  collid == DEFAULT_COLLATION_OID ||
2813  PG_RETURN_BOOL(true);
2814  else
2815  PG_RETURN_BOOL(false);
2816 }
2817 
/*
 * Returns whichever of the two text arguments compares greater under the
 * call's collation: argument 0 if text_cmp() is positive, else argument 1.
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 2818 to 2820).
 */
2818 Datum
2820 {
2821  text *arg1 = PG_GETARG_TEXT_PP(0);
2822  text *arg2 = PG_GETARG_TEXT_PP(1);
2823  text *result;
2824 
2825  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2826 
2827  PG_RETURN_TEXT_P(result);
2828 }
2829 
/*
 * Returns whichever of the two text arguments compares smaller under the
 * call's collation: argument 0 if text_cmp() is negative, else argument 1.
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 2830 to 2832).
 */
2830 Datum
2832 {
2833  text *arg1 = PG_GETARG_TEXT_PP(0);
2834  text *arg2 = PG_GETARG_TEXT_PP(1);
2835  text *result;
2836 
2837  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2838 
2839  PG_RETURN_TEXT_P(result);
2840 }
2841 
2842 
2843 /*
2844  * Cross-type comparison functions for types text and name.
2845  */
2846 
/*
 * Equality test between a name (argument 0) and a text (argument 1).
 * Under C_COLLATION_OID the values are equal iff they have the same byte
 * length and memcmp() finds no difference; under any other collation the
 * result is varstr_cmp() == 0.
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 2847 to 2849).
 */
2847 Datum
2849 {
2850  Name arg1 = PG_GETARG_NAME(0);
2851  text *arg2 = PG_GETARG_TEXT_PP(1);
2852  size_t len1 = strlen(NameStr(*arg1));
2853  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2854  Oid collid = PG_GET_COLLATION();
2855  bool result;
2856 
2857  check_collation_set(collid);
2858 
2859  if (collid == C_COLLATION_OID)
2860  result = (len1 == len2 &&
2861  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2862  else
2863  result = (varstr_cmp(NameStr(*arg1), len1,
2864  VARDATA_ANY(arg2), len2,
2865  collid) == 0);
2866 
2867  PG_FREE_IF_COPY(arg2, 1);
2868 
2869  PG_RETURN_BOOL(result);
2870 }
2871 
/*
 * Equality test between a text (argument 0) and a name (argument 1).
 * Under C_COLLATION_OID the values are equal iff they have the same byte
 * length and memcmp() finds no difference; under any other collation the
 * result is varstr_cmp() == 0.
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 2872 to 2874).
 */
2872 Datum
2874 {
2875  text *arg1 = PG_GETARG_TEXT_PP(0);
2876  Name arg2 = PG_GETARG_NAME(1);
2877  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2878  size_t len2 = strlen(NameStr(*arg2));
2879  Oid collid = PG_GET_COLLATION();
2880  bool result;
2881 
2882  check_collation_set(collid);
2883 
2884  if (collid == C_COLLATION_OID)
2885  result = (len1 == len2 &&
2886  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2887  else
2888  result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2889  NameStr(*arg2), len2,
2890  collid) == 0);
2891 
2892  PG_FREE_IF_COPY(arg1, 0);
2893 
2894  PG_RETURN_BOOL(result);
2895 }
2896 
/*
 * Inequality test between a name (argument 0) and a text (argument 1):
 * the logical negation of the same-length-and-memcmp() test under
 * C_COLLATION_OID, or of varstr_cmp() == 0 under any other collation.
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 2897 to 2899).
 */
2897 Datum
2899 {
2900  Name arg1 = PG_GETARG_NAME(0);
2901  text *arg2 = PG_GETARG_TEXT_PP(1);
2902  size_t len1 = strlen(NameStr(*arg1));
2903  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2904  Oid collid = PG_GET_COLLATION();
2905  bool result;
2906 
2907  check_collation_set(collid);
2908 
2909  if (collid == C_COLLATION_OID)
2910  result = !(len1 == len2 &&
2911  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2912  else
2913  result = !(varstr_cmp(NameStr(*arg1), len1,
2914  VARDATA_ANY(arg2), len2,
2915  collid) == 0);
2916 
2917  PG_FREE_IF_COPY(arg2, 1);
2918 
2919  PG_RETURN_BOOL(result);
2920 }
2921 
/*
 * Inequality test between a text (argument 0) and a name (argument 1):
 * the logical negation of the same-length-and-memcmp() test under
 * C_COLLATION_OID, or of varstr_cmp() == 0 under any other collation.
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 2922 to 2924).
 */
2922 Datum
2924 {
2925  text *arg1 = PG_GETARG_TEXT_PP(0);
2926  Name arg2 = PG_GETARG_NAME(1);
2927  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2928  size_t len2 = strlen(NameStr(*arg2));
2929  Oid collid = PG_GET_COLLATION();
2930  bool result;
2931 
2932  check_collation_set(collid);
2933 
2934  if (collid == C_COLLATION_OID)
2935  result = !(len1 == len2 &&
2936  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2937  else
2938  result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2939  NameStr(*arg2), len2,
2940  collid) == 0);
2941 
2942  PG_FREE_IF_COPY(arg1, 0);
2943 
2944  PG_RETURN_BOOL(result);
2945 }
2946 
/*
 * Three-way comparison of a name (argument 0) against a text (argument 1):
 * returns the int32 result of varstr_cmp() under the call's collation.
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 2947 to 2949).
 */
2947 Datum
2949 {
2950  Name arg1 = PG_GETARG_NAME(0);
2951  text *arg2 = PG_GETARG_TEXT_PP(1);
2952  int32 result;
2953 
2954  result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2955  VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2956  PG_GET_COLLATION());
2957 
2958  PG_FREE_IF_COPY(arg2, 1);
2959 
2960  PG_RETURN_INT32(result);
2961 }
2962 
/*
 * Three-way comparison of a text (argument 0) against a name (argument 1):
 * returns the int32 result of varstr_cmp() under the call's collation.
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 2963 to 2965).
 */
2963 Datum
2965 {
2966  text *arg1 = PG_GETARG_TEXT_PP(0);
2967  Name arg2 = PG_GETARG_NAME(1);
2968  int32 result;
2969 
2970  result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2971  NameStr(*arg2), strlen(NameStr(*arg2)),
2972  PG_GET_COLLATION());
2973 
2974  PG_FREE_IF_COPY(arg1, 0);
2975 
2976  PG_RETURN_INT32(result);
2977 }
2978 
/*
 * CmpCall(cmpfunc) invokes cmpfunc through DirectFunctionCall2Coll(),
 * forwarding the current call's collation and its first two argument
 * Datums, and converts the result to int32.  The macro is local to the
 * functions between the #define and the #undef below.
 *
 * NOTE(review): each of the eight functions that follow has lost both its
 * declarator and its single body line in this listing (only "Datum", "{"
 * and "}" remain; the embedded numbering skips two lines per function).
 * They presumably are comparison operators built on CmpCall -- restore
 * them from the original file.
 */
2979 #define CmpCall(cmpfunc) \
2980  DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2981  PG_GET_COLLATION(), \
2982  PG_GETARG_DATUM(0), \
2983  PG_GETARG_DATUM(1)))
2984 
2985 Datum
2987 {
2989 }
2990 
2991 Datum
2993 {
2995 }
2996 
2997 Datum
2999 {
3001 }
3002 
3003 Datum
3005 {
3007 }
3008 
3009 Datum
3011 {
3013 }
3014 
3015 Datum
3017 {
3019 }
3020 
3021 Datum
3023 {
3025 }
3026 
3027 Datum
3029 {
3031 }
3032 
3033 #undef CmpCall
3034 
3035 
3036 /*
3037  * The following operators support character-by-character comparison
3038  * of text datums, to allow building indexes suitable for LIKE clauses.
3039  * Note that the regular texteq/textne comparison operators, and regular
3040  * support functions 1 and 2 with "C" collation are assumed to be
3041  * compatible with these!
3042  */
3043 
/*
 * Bytewise three-way comparison of two varlena values: memcmp() over the
 * shorter of the two lengths decides first; if that prefix is equal, the
 * shorter value sorts first.  Returns the memcmp() result, or -1/1/0 when
 * only the lengths differ or everything matches.
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 3044 to 3046).  Callers in this file invoke
 * it as internal_text_pattern_compare(arg1, arg2) with two text pointers.
 */
3044 static int
3046 {
3047  int result;
3048  int len1,
3049  len2;
3050 
3051  len1 = VARSIZE_ANY_EXHDR(arg1);
3052  len2 = VARSIZE_ANY_EXHDR(arg2);
3053 
3054  result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3055  if (result != 0)
3056  return result;
3057  else if (len1 < len2)
3058  return -1;
3059  else if (len1 > len2)
3060  return 1;
3061  else
3062  return 0;
3063 }
3064 
3065 
/*
 * Returns true when internal_text_pattern_compare() orders argument 0
 * strictly before argument 1 (result < 0).
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 3066 to 3068).
 */
3066 Datum
3068 {
3069  text *arg1 = PG_GETARG_TEXT_PP(0);
3070  text *arg2 = PG_GETARG_TEXT_PP(1);
3071  int result;
3072 
3073  result = internal_text_pattern_compare(arg1, arg2);
3074 
3075  PG_FREE_IF_COPY(arg1, 0);
3076  PG_FREE_IF_COPY(arg2, 1);
3077 
3078  PG_RETURN_BOOL(result < 0);
3079 }
3080 
3081 
/*
 * Returns true when internal_text_pattern_compare() orders argument 0
 * before or equal to argument 1 (result <= 0).
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 3082 to 3084).
 */
3082 Datum
3084 {
3085  text *arg1 = PG_GETARG_TEXT_PP(0);
3086  text *arg2 = PG_GETARG_TEXT_PP(1);
3087  int result;
3088 
3089  result = internal_text_pattern_compare(arg1, arg2);
3090 
3091  PG_FREE_IF_COPY(arg1, 0);
3092  PG_FREE_IF_COPY(arg2, 1);
3093 
3094  PG_RETURN_BOOL(result <= 0);
3095 }
3096 
3097 
/*
 * Returns true when internal_text_pattern_compare() orders argument 0
 * after or equal to argument 1 (result >= 0).
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 3098 to 3100).
 */
3098 Datum
3100 {
3101  text *arg1 = PG_GETARG_TEXT_PP(0);
3102  text *arg2 = PG_GETARG_TEXT_PP(1);
3103  int result;
3104 
3105  result = internal_text_pattern_compare(arg1, arg2);
3106 
3107  PG_FREE_IF_COPY(arg1, 0);
3108  PG_FREE_IF_COPY(arg2, 1);
3109 
3110  PG_RETURN_BOOL(result >= 0);
3111 }
3112 
3113 
/*
 * Returns true when internal_text_pattern_compare() orders argument 0
 * strictly after argument 1 (result > 0).
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 3114 to 3116).
 */
3114 Datum
3116 {
3117  text *arg1 = PG_GETARG_TEXT_PP(0);
3118  text *arg2 = PG_GETARG_TEXT_PP(1);
3119  int result;
3120 
3121  result = internal_text_pattern_compare(arg1, arg2);
3122 
3123  PG_FREE_IF_COPY(arg1, 0);
3124  PG_FREE_IF_COPY(arg2, 1);
3125 
3126  PG_RETURN_BOOL(result > 0);
3127 }
3128 
3129 
/*
 * Three-way pattern comparison of two text arguments: returns the raw
 * result of internal_text_pattern_compare() as an int32.
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 3130 to 3132).
 */
3130 Datum
3132 {
3133  text *arg1 = PG_GETARG_TEXT_PP(0);
3134  text *arg2 = PG_GETARG_TEXT_PP(1);
3135  int result;
3136 
3137  result = internal_text_pattern_compare(arg1, arg2);
3138 
3139  PG_FREE_IF_COPY(arg1, 0);
3140  PG_FREE_IF_COPY(arg2, 1);
3141 
3142  PG_RETURN_INT32(result);
3143 }
3144 
3145 
/*
 * Sets up sort support for pattern comparison: switches to ssup->ssup_cxt,
 * calls varstr_sortsupport() for TEXTOID with the C collation forced,
 * restores the previous memory context, and returns void.
 *
 * NOTE(review): two lines are absent from this listing, judging by gaps in
 * the embedded numbering: the declarator (3147) and a declaration at 3149,
 * presumably that of "ssup".  Restore them from the original file.
 */
3146 Datum
3148 {
3150  MemoryContext oldcontext;
3151 
3152  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3153 
3154  /* Use generic string SortSupport, forcing "C" collation */
3155  varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3156 
3157  MemoryContextSwitchTo(oldcontext);
3158 
3159  PG_RETURN_VOID();
3160 }
3161 
3162 
3163 /*-------------------------------------------------------------
3164  * byteaoctetlen
3165  *
3166  * get the number of bytes contained in an instance of type 'bytea'
3167  *-------------------------------------------------------------
3168  */
/*
 * The argument is fetched as a raw Datum, without detoasting.
 *
 * NOTE(review): two lines are absent from this listing, judging by gaps in
 * the embedded numbering: the declarator (3170) and the statement at 3175
 * that computes and returns the length.  Restore them from the original
 * file.
 */
3169 Datum
3171 {
3172  Datum str = PG_GETARG_DATUM(0);
3173 
3174  /* We need not detoast the input at all */
3176 }
3177 
3178 /*
3179  * byteacat -
3180  * takes two bytea* and returns a bytea* that is the concatenation of
3181  * the two.
3182  *
3183  * Cloned from textcat and modified as required.
3184  *
 * NOTE(review): two lines are absent from this listing, judging by gaps in
 * the embedded numbering: the declarator (3186) and the statement at 3191
 * that produces the result, presumably via bytea_catenate(t1, t2).
 * Restore them from the original file.
 */
3185 Datum
3187 {
3188  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3189  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3190 
3192 }
3193 
3194 /*
3195  * bytea_catenate
3196  * Guts of byteacat(), broken out so it can be used by other functions
3197  *
3198  * Arguments can be in short-header form, but not compressed or out-of-line
3199  *
 * Returns a freshly palloc'd bytea of len1 + len2 + VARHDRSZ bytes holding
 * the data of t1 followed by the data of t2.  Neither input is modified.
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 3200 to 3202).  The body reads t1 and t2.
 */
3200 static bytea *
3202 {
3203  bytea *result;
3204  int len1,
3205  len2,
3206  len;
3207  char *ptr;
3208 
3209  len1 = VARSIZE_ANY_EXHDR(t1);
3210  len2 = VARSIZE_ANY_EXHDR(t2);
3211 
3212  /* paranoia ... probably should throw error instead? */
3213  if (len1 < 0)
3214  len1 = 0;
3215  if (len2 < 0)
3216  len2 = 0;
3217 
3218  len = len1 + len2 + VARHDRSZ;
3219  result = (bytea *) palloc(len);
3220 
3221  /* Set size of result string... */
3222  SET_VARSIZE(result, len);
3223 
3224  /* Fill data field of result string... */
3225  ptr = VARDATA(result);
3226  if (len1 > 0)
3227  memcpy(ptr, VARDATA_ANY(t1), len1);
3228  if (len2 > 0)
3229  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3230 
3231  return result;
3232 }
3233 
/* Build a bytea from the C string str_ by passing it through byteain(). */
3234 #define PG_STR_GET_BYTEA(str_) \
3235  DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3236 
3237 /*
3238  * bytea_substr()
3239  * Return a substring starting at the specified position.
3240  * Cloned from text_substr and modified as required.
3241  *
3242  * Input:
3243  * - string
3244  * - starting position (is one-based)
3245  * - string length (optional)
3246  *
3247  * If the starting position is zero or less, then return from the start of the string
3248  * adjusting the length to be consistent with the "negative start" per SQL.
3249  * If the length is less than zero, an ERROR is thrown. If no third argument
3250  * (length) is provided, the length to the end of the string is assumed.
3251  *
 * NOTE(review): two lines are absent from this listing, judging by gaps in
 * the embedded numbering: the declarator (3253) and the opening of the
 * return expression (3255), which presumably calls bytea_substring() with
 * the string Datum as its first argument.  Restore them from the original
 * file.
 */
3252 Datum
3254 {
3256  PG_GETARG_INT32(1),
3257  PG_GETARG_INT32(2),
3258  false));
3259 }
3260 
3261 /*
3262  * bytea_substr_no_len -
3263  * Wrapper to avoid opr_sanity failure due to
3264  * one function accepting a different number of args.
3265  *
 * Passes -1 as the length and true as the final flag, i.e. "length not
 * specified".
 *
 * NOTE(review): two lines are absent from this listing, judging by gaps in
 * the embedded numbering: the declarator (3267) and the opening of the
 * return expression (3269), which presumably calls bytea_substring() with
 * the string Datum as its first argument.  Restore them from the original
 * file.
 */
3266 Datum
3268 {
3270  PG_GETARG_INT32(1),
3271  -1,
3272  true));
3273 }
3274 
3275 static bytea *
3277  int S,
3278  int L,
3279  bool length_not_specified)
3280 {
3281  int S1; /* adjusted start position */
3282  int L1; /* adjusted substring length */
3283 
3284  S1 = Max(S, 1);
3285 
3286  if (length_not_specified)
3287  {
3288  /*
3289  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3290  * end of the string if we pass it a negative value for length.
3291  */
3292  L1 = -1;
3293  }
3294  else
3295  {
3296  /* end position */
3297  int E = S + L;
3298 
3299  /*
3300  * A negative value for L is the only way for the end position to be
3301  * before the start. SQL99 says to throw an error.
3302  */
3303  if (E < S)
3304  ereport(ERROR,
3305  (errcode(ERRCODE_SUBSTRING_ERROR),
3306  errmsg("negative substring length not allowed")));
3307 
3308  /*
3309  * A zero or negative value for the end position can happen if the
3310  * start was negative or one. SQL99 says to return a zero-length
3311  * string.
3312  */
3313  if (E < 1)
3314  return PG_STR_GET_BYTEA("");
3315 
3316  L1 = E - S1;
3317  }
3318 
3319  /*
3320  * If the start position is past the end of the string, SQL99 says to
3321  * return a zero-length string -- DatumGetByteaPSlice() will do that for
3322  * us. Convert to zero-based starting position
3323  */
3324  return DatumGetByteaPSlice(str, S1 - 1, L1);
3325 }
3326 
3327 /*
3328  * byteaoverlay
3329  * Replace specified substring of first string with second
3330  *
3331  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3332  * This code is a direct implementation of what the standard says.
3333  *
 * Four-argument form: arguments 2 and 3 supply the start position and the
 * length of the replaced substring; the work is done by bytea_overlay().
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 3334 to 3336).
 */
3334 Datum
3336 {
3337  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3338  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3339  int sp = PG_GETARG_INT32(2); /* substring start position */
3340  int sl = PG_GETARG_INT32(3); /* substring length */
3341 
3342  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3343 }
3344 
/*
 * Three-argument form of the overlay operation: the length of the replaced
 * substring defaults to the byte length of the replacement (argument 1);
 * the work is done by bytea_overlay().
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 3345 to 3347).
 */
3345 Datum
3347 {
3348  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3349  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3350  int sp = PG_GETARG_INT32(2); /* substring start position */
3351  int sl;
3352 
3353  sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3354  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3355 }
3356 
3357 static bytea *
3358 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3359 {
3360  bytea *result;
3361  bytea *s1;
3362  bytea *s2;
3363  int sp_pl_sl;
3364 
3365  /*
3366  * Check for possible integer-overflow cases. For negative sp, throw a
3367  * "substring length" error because that's what should be expected
3368  * according to the spec's definition of OVERLAY().
3369  */
3370  if (sp <= 0)
3371  ereport(ERROR,
3372  (errcode(ERRCODE_SUBSTRING_ERROR),
3373  errmsg("negative substring length not allowed")));
3374  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3375  ereport(ERROR,
3376  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3377  errmsg("integer out of range")));
3378 
3379  s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3380  s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3381  result = bytea_catenate(s1, t2);
3382  result = bytea_catenate(result, s2);
3383 
3384  return result;
3385 }
3386 
3387 /*
3388  * byteapos -
3389  * Return the position of the specified substring.
3390  * Implements the SQL POSITION() function.
3391  * Cloned from textpos and modified as required.
3392  *
 * Argument 0 is the string searched, argument 1 the pattern.  Returns the
 * one-based offset of the first match, 1 for an empty pattern, and 0 when
 * there is no match (including when the pattern is longer than the string,
 * since the scan loop then never runs).
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 3393 to 3395).
 */
3393 Datum
3395 {
3396  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3397  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3398  int pos;
3399  int px,
3400  p;
3401  int len1,
3402  len2;
3403  char *p1,
3404  *p2;
3405 
3406  len1 = VARSIZE_ANY_EXHDR(t1);
3407  len2 = VARSIZE_ANY_EXHDR(t2);
3408 
3409  if (len2 <= 0)
3410  PG_RETURN_INT32(1); /* result for empty pattern */
3411 
3412  p1 = VARDATA_ANY(t1);
3413  p2 = VARDATA_ANY(t2);
3414 
3415  pos = 0;
  /* px is the last zero-based offset at which the pattern still fits */
3416  px = (len1 - len2);
3417  for (p = 0; p <= px; p++)
3418  {
  /* Cheap first-byte check before the full memcmp() */
3419  if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3420  {
3421  pos = p + 1;
3422  break;
3423  };
3424  p1++;
3425  };
3426 
3427  PG_RETURN_INT32(pos);
3428 }
3429 
3430 /*-------------------------------------------------------------
3431  * byteaGetByte
3432  *
3433  * this routine treats "bytea" as an array of bytes.
3434  * It returns the Nth byte (a number between 0 and 255).
3435  *
 * N is zero-based; an index outside 0..len-1 raises
 * ERRCODE_ARRAY_SUBSCRIPT_ERROR.
 *
 * NOTE(review): the declarator line is absent from this listing (the
 * embedded numbering jumps from 3437 to 3439).
3435  *-------------------------------------------------------------
3436  */
3437 Datum
3439 {
3440  bytea *v = PG_GETARG_BYTEA_PP(0);
3441  int32 n = PG_GETARG_INT32(1);
3442  int len;
3443  int byte;
3444 
3445  len = VARSIZE_ANY_EXHDR(v);
3446 
3447  if (n < 0 || n >= len)
3448  ereport(ERROR,
3449  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3450  errmsg("index %d out of valid range, 0..%d",
3451  n, len - 1)));
3452 
3453  byte = ((unsigned char *) VARDATA_ANY(v))[n];
3454 
3455  PG_RETURN_INT32(byte);
3456 }
3457 
3458 /*-------------------------------------------------------------
3459  * byteaGetBit
3460  *
3461  * This routine treats a "bytea" type like an array of bits.
3462  * It returns the value of the Nth bit (0 or 1).
3463  *
3464  *-------------------------------------------------------------
3465  */
3466 Datum
3468 {
3469  bytea *v = PG_GETARG_BYTEA_PP(0);
3470  int64 n = PG_GETARG_INT64(1);
3471  int byteNo,
3472  bitNo;
3473  int len;
3474  int byte;
3475 
3476  len = VARSIZE_ANY_EXHDR(v);
3477 
3478  if (n < 0 || n >= (int64) len * 8)
3479  ereport(ERROR,
3480  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3481  errmsg("index %lld out of valid range, 0..%lld",
3482  (long long) n, (long long) len * 8 - 1)));
3483 
3484  /* n/8 is now known < len, so safe to cast to int */
3485  byteNo = (int) (n / 8);
3486  bitNo = (int) (n % 8);
3487 
3488  byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3489 
3490  if (byte & (1 << bitNo))
3491  PG_RETURN_INT32(1);
3492  else
3493  PG_RETURN_INT32(0);
3494 }
3495 
3496 /*-------------------------------------------------------------
3497  * byteaSetByte
3498  *
3499  * Given an instance of type 'bytea' creates a new one with
3500  * the Nth byte set to the given value.
3501  *
3502  *-------------------------------------------------------------
3503  */
3504 Datum
3506 {
3507  bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3508  int32 n = PG_GETARG_INT32(1);
3509  int32 newByte = PG_GETARG_INT32(2);
3510  int len;
3511 
3512  len = VARSIZE(res) - VARHDRSZ;
3513 
3514  if (n < 0 || n >= len)
3515  ereport(ERROR,
3516  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3517  errmsg("index %d out of valid range, 0..%d",
3518  n, len - 1)));
3519 
3520  /*
3521  * Now set the byte.
3522  */
3523  ((unsigned char *) VARDATA(res))[n] = newByte;
3524 
3525  PG_RETURN_BYTEA_P(res);
3526 }
3527 
3528 /*-------------------------------------------------------------
3529  * byteaSetBit
3530  *
3531  * Given an instance of type 'bytea' creates a new one with
3532  * the Nth bit set to the given value.
3533  *
3534  *-------------------------------------------------------------
3535  */
3536 Datum
3538 {
3539  bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3540  int64 n = PG_GETARG_INT64(1);
3541  int32 newBit = PG_GETARG_INT32(2);
3542  int len;
3543  int oldByte,
3544  newByte;
3545  int byteNo,
3546  bitNo;
3547 
3548  len = VARSIZE(res) - VARHDRSZ;
3549 
3550  if (n < 0 || n >= (int64) len * 8)
3551  ereport(ERROR,
3552  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3553  errmsg("index %lld out of valid range, 0..%lld",
3554  (long long) n, (long long) len * 8 - 1)));
3555 
3556  /* n/8 is now known < len, so safe to cast to int */
3557  byteNo = (int) (n / 8);
3558  bitNo = (int) (n % 8);
3559 
3560  /*
3561  * sanity check!
3562  */
3563  if (newBit != 0 && newBit != 1)
3564  ereport(ERROR,
3565  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3566  errmsg("new bit must be 0 or 1")));
3567 
3568  /*
3569  * Update the byte.
3570  */
3571  oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3572 
3573  if (newBit == 0)
3574  newByte = oldByte & (~(1 << bitNo));
3575  else
3576  newByte = oldByte | (1 << bitNo);
3577 
3578  ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3579 
3580  PG_RETURN_BYTEA_P(res);
3581 }
3582 
3583 
3584 /* text_name()
3585  * Converts a text type to a Name type.
3586  */
3587 Datum
3589 {
3590  text *s = PG_GETARG_TEXT_PP(0);
3591  Name result;
3592  int len;
3593 
3594  len = VARSIZE_ANY_EXHDR(s);
3595 
3596  /* Truncate oversize input */
3597  if (len >= NAMEDATALEN)
3598  len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3599 
3600  /* We use palloc0 here to ensure result is zero-padded */
3601  result = (Name) palloc0(NAMEDATALEN);
3602  memcpy(NameStr(*result), VARDATA_ANY(s), len);
3603 
3604  PG_RETURN_NAME(result);
3605 }
3606 
3607 /* name_text()
3608  * Converts a Name type to a text type.
3609  */
3610 Datum
3612 {
3613  Name s = PG_GETARG_NAME(0);
3614 
3616 }
3617 
3618 
3619 /*
3620  * textToQualifiedNameList - convert a text object to list of names
3621  *
3622  * This implements the input parsing needed by nextval() and other
3623  * functions that take a text parameter representing a qualified name.
3624  * We split the name at dots, downcase if not double-quoted, and
3625  * truncate names if they're too long.
3626  */
3627 List *
3629 {
3630  char *rawname;
3631  List *result = NIL;
3632  List *namelist;
3633  ListCell *l;
3634 
3635  /* Convert to C string (handles possible detoasting). */
3636  /* Note we rely on being able to modify rawname below. */
3637  rawname = text_to_cstring(textval);
3638 
3639  if (!SplitIdentifierString(rawname, '.', &namelist))
3640  ereport(ERROR,
3641  (errcode(ERRCODE_INVALID_NAME),
3642  errmsg("invalid name syntax")));
3643 
3644  if (namelist == NIL)
3645  ereport(ERROR,
3646  (errcode(ERRCODE_INVALID_NAME),
3647  errmsg("invalid name syntax")));
3648 
3649  foreach(l, namelist)
3650  {
3651  char *curname = (char *) lfirst(l);
3652 
3653  result = lappend(result, makeString(pstrdup(curname)));
3654  }
3655 
3656  pfree(rawname);
3657  list_free(namelist);
3658 
3659  return result;
3660 }
3661 
3662 /*
3663  * SplitIdentifierString --- parse a string containing identifiers
3664  *
3665  * This is the guts of textToQualifiedNameList, and is exported for use in
3666  * other situations such as parsing GUC variables. In the GUC case, it's
3667  * important to avoid memory leaks, so the API is designed to minimize the
3668  * amount of stuff that needs to be allocated and freed.
3669  *
3670  * Inputs:
3671  * rawstring: the input string; must be overwritable! On return, it's
3672  * been modified to contain the separated identifiers.
3673  * separator: the separator punctuation expected between identifiers
3674  * (typically '.' or ','). Whitespace may also appear around
3675  * identifiers.
3676  * Outputs:
3677  * namelist: filled with a palloc'd list of pointers to identifiers within
3678  * rawstring. Caller should list_free() this even on error return.
3679  *
3680  * Returns true if okay, false if there is a syntax error in the string.
3681  *
3682  * Note that an empty string is considered okay here, though not in
3683  * textToQualifiedNameList.
3684  */
3685 bool
3686 SplitIdentifierString(char *rawstring, char separator,
3687  List **namelist)
3688 {
3689  char *nextp = rawstring;
3690  bool done = false;
3691 
3692  *namelist = NIL;
3693 
3694  while (scanner_isspace(*nextp))
3695  nextp++; /* skip leading whitespace */
3696 
3697  if (*nextp == '\0')
3698  return true; /* allow empty string */
3699 
3700  /* At the top of the loop, we are at start of a new identifier. */
3701  do
3702  {
3703  char *curname;
3704  char *endp;
3705 
3706  if (*nextp == '"')
3707  {
3708  /* Quoted name --- collapse quote-quote pairs, no downcasing */
3709  curname = nextp + 1;
3710  for (;;)
3711  {
3712  endp = strchr(nextp + 1, '"');
3713  if (endp == NULL)
3714  return false; /* mismatched quotes */
3715  if (endp[1] != '"')
3716  break; /* found end of quoted name */
3717  /* Collapse adjacent quotes into one quote, and look again */
3718  memmove(endp, endp + 1, strlen(endp));
3719  nextp = endp;
3720  }
3721  /* endp now points at the terminating quote */
3722  nextp = endp + 1;
3723  }
3724  else
3725  {
3726  /* Unquoted name --- extends to separator or whitespace */
3727  char *downname;
3728  int len;
3729 
3730  curname = nextp;
3731  while (*nextp && *nextp != separator &&
3732  !scanner_isspace(*nextp))
3733  nextp++;
3734  endp = nextp;
3735  if (curname == nextp)
3736  return false; /* empty unquoted name not allowed */
3737 
3738  /*
3739  * Downcase the identifier, using same code as main lexer does.
3740  *
3741  * XXX because we want to overwrite the input in-place, we cannot
3742  * support a downcasing transformation that increases the string
3743  * length. This is not a problem given the current implementation
3744  * of downcase_truncate_identifier, but we'll probably have to do
3745  * something about this someday.
3746  */
3747  len = endp - curname;
3748  downname = downcase_truncate_identifier(curname, len, false);
3749  Assert(strlen(downname) <= len);
3750  strncpy(curname, downname, len); /* strncpy is required here */
3751  pfree(downname);
3752  }
3753 
3754  while (scanner_isspace(*nextp))
3755  nextp++; /* skip trailing whitespace */
3756 
3757  if (*nextp == separator)
3758  {
3759  nextp++;
3760  while (scanner_isspace(*nextp))
3761  nextp++; /* skip leading whitespace for next */
3762  /* we expect another name, so done remains false */
3763  }
3764  else if (*nextp == '\0')
3765  done = true;
3766  else
3767  return false; /* invalid syntax */
3768 
3769  /* Now safe to overwrite separator with a null */
3770  *endp = '\0';
3771 
3772  /* Truncate name if it's overlength */
3773  truncate_identifier(curname, strlen(curname), false);
3774 
3775  /*
3776  * Finished isolating current name --- add it to list
3777  */
3778  *namelist = lappend(*namelist, curname);
3779 
3780  /* Loop back if we didn't reach end of string */
3781  } while (!done);
3782 
3783  return true;
3784 }
3785 
3786 
3787 /*
3788  * SplitDirectoriesString --- parse a string containing file/directory names
3789  *
3790  * This works fine on file names too; the function name is historical.
3791  *
3792  * This is similar to SplitIdentifierString, except that the parsing
3793  * rules are meant to handle pathnames instead of identifiers: there is
3794  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3795  * and we apply canonicalize_path() to each extracted string. Because of the
3796  * last, the returned strings are separately palloc'd rather than being
3797  * pointers into rawstring --- but we still scribble on rawstring.
3798  *
3799  * Inputs:
3800  * rawstring: the input string; must be modifiable!
3801  * separator: the separator punctuation expected between directories
3802  * (typically ',' or ';'). Whitespace may also appear around
3803  * directories.
3804  * Outputs:
3805  * namelist: filled with a palloc'd list of directory names.
3806  * Caller should list_free_deep() this even on error return.
3807  *
3808  * Returns true if okay, false if there is a syntax error in the string.
3809  *
3810  * Note that an empty string is considered okay here.
3811  */
3812 bool
3813 SplitDirectoriesString(char *rawstring, char separator,
3814  List **namelist)
3815 {
3816  char *nextp = rawstring;
3817  bool done = false;
3818 
3819  *namelist = NIL;
3820 
3821  while (scanner_isspace(*nextp))
3822  nextp++; /* skip leading whitespace */
3823 
3824  if (*nextp == '\0')
3825  return true; /* allow empty string */
3826 
3827  /* At the top of the loop, we are at start of a new directory. */
3828  do
3829  {
3830  char *curname;
3831  char *endp;
3832 
3833  if (*nextp == '"')
3834  {
3835  /* Quoted name --- collapse quote-quote pairs */
3836  curname = nextp + 1;
3837  for (;;)
3838  {
3839  endp = strchr(nextp + 1, '"');
3840  if (endp == NULL)
3841  return false; /* mismatched quotes */
3842  if (endp[1] != '"')
3843  break; /* found end of quoted name */
3844  /* Collapse adjacent quotes into one quote, and look again */
3845  memmove(endp, endp + 1, strlen(endp));
3846  nextp = endp;
3847  }
3848  /* endp now points at the terminating quote */
3849  nextp = endp + 1;
3850  }
3851  else
3852  {
3853  /* Unquoted name --- extends to separator or end of string */
3854  curname = endp = nextp;
3855  while (*nextp && *nextp != separator)
3856  {
3857  /* trailing whitespace should not be included in name */
3858  if (!scanner_isspace(*nextp))
3859  endp = nextp + 1;
3860  nextp++;
3861  }
3862  if (curname == endp)
3863  return false; /* empty unquoted name not allowed */
3864  }
3865 
3866  while (scanner_isspace(*nextp))
3867  nextp++; /* skip trailing whitespace */
3868 
3869  if (*nextp == separator)
3870  {
3871  nextp++;
3872  while (scanner_isspace(*nextp))
3873  nextp++; /* skip leading whitespace for next */
3874  /* we expect another name, so done remains false */
3875  }
3876  else if (*nextp == '\0')
3877  done = true;
3878  else
3879  return false; /* invalid syntax */
3880 
3881  /* Now safe to overwrite separator with a null */
3882  *endp = '\0';
3883 
3884  /* Truncate path if it's overlength */
3885  if (strlen(curname) >= MAXPGPATH)
3886  curname[MAXPGPATH - 1] = '\0';
3887 
3888  /*
3889  * Finished isolating current name --- add it to list
3890  */
3891  curname = pstrdup(curname);
3892  canonicalize_path(curname);
3893  *namelist = lappend(*namelist, curname);
3894 
3895  /* Loop back if we didn't reach end of string */
3896  } while (!done);
3897 
3898  return true;
3899 }
3900 
3901 
3902 /*
3903  * SplitGUCList --- parse a string containing identifiers or file names
3904  *
3905  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3906  * presuming whether the elements will be taken as identifiers or file names.
3907  * We assume the input has already been through flatten_set_variable_args(),
3908  * so that we need never downcase (if appropriate, that was done already).
3909  * Nor do we ever truncate, since we don't know the correct max length.
3910  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3911  * because any embedded whitespace should have led to double-quoting).
3912  * Otherwise the API is identical to SplitIdentifierString.
3913  *
3914  * XXX it's annoying to have so many copies of this string-splitting logic.
3915  * However, it's not clear that having one function with a bunch of option
3916  * flags would be much better.
3917  *
3918  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3919  * Be sure to update that if you have to change this.
3920  *
3921  * Inputs:
3922  * rawstring: the input string; must be overwritable! On return, it's
3923  * been modified to contain the separated identifiers.
3924  * separator: the separator punctuation expected between identifiers
3925  * (typically '.' or ','). Whitespace may also appear around
3926  * identifiers.
3927  * Outputs:
3928  * namelist: filled with a palloc'd list of pointers to identifiers within
3929  * rawstring. Caller should list_free() this even on error return.
3930  *
3931  * Returns true if okay, false if there is a syntax error in the string.
3932  */
3933 bool
3934 SplitGUCList(char *rawstring, char separator,
3935  List **namelist)
3936 {
3937  char *nextp = rawstring;
3938  bool done = false;
3939 
3940  *namelist = NIL;
3941 
3942  while (scanner_isspace(*nextp))
3943  nextp++; /* skip leading whitespace */
3944 
3945  if (*nextp == '\0')
3946  return true; /* allow empty string */
3947 
3948  /* At the top of the loop, we are at start of a new identifier. */
3949  do
3950  {
3951  char *curname;
3952  char *endp;
3953 
3954  if (*nextp == '"')
3955  {
3956  /* Quoted name --- collapse quote-quote pairs */
3957  curname = nextp + 1;
3958  for (;;)
3959  {
3960  endp = strchr(nextp + 1, '"');
3961  if (endp == NULL)
3962  return false; /* mismatched quotes */
3963  if (endp[1] != '"')
3964  break; /* found end of quoted name */
3965  /* Collapse adjacent quotes into one quote, and look again */
3966  memmove(endp, endp + 1, strlen(endp));
3967  nextp = endp;
3968  }
3969  /* endp now points at the terminating quote */
3970  nextp = endp + 1;
3971  }
3972  else
3973  {
3974  /* Unquoted name --- extends to separator or whitespace */
3975  curname = nextp;
3976  while (*nextp && *nextp != separator &&
3977  !scanner_isspace(*nextp))
3978  nextp++;
3979  endp = nextp;
3980  if (curname == nextp)
3981  return false; /* empty unquoted name not allowed */
3982  }
3983 
3984  while (scanner_isspace(*nextp))
3985  nextp++; /* skip trailing whitespace */
3986 
3987  if (*nextp == separator)
3988  {
3989  nextp++;
3990  while (scanner_isspace(*nextp))
3991  nextp++; /* skip leading whitespace for next */
3992  /* we expect another name, so done remains false */
3993  }
3994  else if (*nextp == '\0')
3995  done = true;
3996  else
3997  return false; /* invalid syntax */
3998 
3999  /* Now safe to overwrite separator with a null */
4000  *endp = '\0';
4001 
4002  /*
4003  * Finished isolating current name --- add it to list
4004  */
4005  *namelist = lappend(*namelist, curname);
4006 
4007  /* Loop back if we didn't reach end of string */
4008  } while (!done);
4009 
4010  return true;
4011 }
4012 
4013 
4014 /*****************************************************************************
4015  * Comparison Functions used for bytea
4016  *
4017  * Note: btree indexes need these routines not to leak memory; therefore,
4018  * be careful to free working copies of toasted datums. Most places don't
4019  * need to be so careful.
4020  *****************************************************************************/
4021 
4022 Datum
4024 {
4025  Datum arg1 = PG_GETARG_DATUM(0);
4026  Datum arg2 = PG_GETARG_DATUM(1);
4027  bool result;
4028  Size len1,
4029  len2;
4030 
4031  /*
4032  * We can use a fast path for unequal lengths, which might save us from
4033  * having to detoast one or both values.
4034  */
4035  len1 = toast_raw_datum_size(arg1);
4036  len2 = toast_raw_datum_size(arg2);
4037  if (len1 != len2)
4038  result = false;
4039  else
4040  {
4041  bytea *barg1 = DatumGetByteaPP(arg1);
4042  bytea *barg2 = DatumGetByteaPP(arg2);
4043 
4044  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4045  len1 - VARHDRSZ) == 0);
4046 
4047  PG_FREE_IF_COPY(barg1, 0);
4048  PG_FREE_IF_COPY(barg2, 1);
4049  }
4050 
4051  PG_RETURN_BOOL(result);
4052 }
4053 
4054 Datum
4056 {
4057  Datum arg1 = PG_GETARG_DATUM(0);
4058  Datum arg2 = PG_GETARG_DATUM(1);
4059  bool result;
4060  Size len1,
4061  len2;
4062 
4063  /*
4064  * We can use a fast path for unequal lengths, which might save us from
4065  * having to detoast one or both values.
4066  */
4067  len1 = toast_raw_datum_size(arg1);
4068  len2 = toast_raw_datum_size(arg2);
4069  if (len1 != len2)
4070  result = true;
4071  else
4072  {
4073  bytea *barg1 = DatumGetByteaPP(arg1);
4074  bytea *barg2 = DatumGetByteaPP(arg2);
4075 
4076  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4077  len1 - VARHDRSZ) != 0);
4078 
4079  PG_FREE_IF_COPY(barg1, 0);
4080  PG_FREE_IF_COPY(barg2, 1);
4081  }
4082 
4083  PG_RETURN_BOOL(result);
4084 }
4085 
4086 Datum
4088 {
4089  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4090  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4091  int len1,
4092  len2;
4093  int cmp;
4094 
4095  len1 = VARSIZE_ANY_EXHDR(arg1);
4096  len2 = VARSIZE_ANY_EXHDR(arg2);
4097 
4098  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4099 
4100  PG_FREE_IF_COPY(arg1, 0);
4101  PG_FREE_IF_COPY(arg2, 1);
4102 
4103  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4104 }
4105 
4106 Datum
4108 {
4109  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4110  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4111  int len1,
4112  len2;
4113  int cmp;
4114 
4115  len1 = VARSIZE_ANY_EXHDR(arg1);
4116  len2 = VARSIZE_ANY_EXHDR(arg2);
4117 
4118  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4119 
4120  PG_FREE_IF_COPY(arg1, 0);
4121  PG_FREE_IF_COPY(arg2, 1);
4122 
4123  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4124 }
4125 
4126 Datum
4128 {
4129  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4130  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4131  int len1,
4132  len2;
4133  int cmp;
4134 
4135  len1 = VARSIZE_ANY_EXHDR(arg1);
4136  len2 = VARSIZE_ANY_EXHDR(arg2);
4137 
4138  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4139 
4140  PG_FREE_IF_COPY(arg1, 0);
4141  PG_FREE_IF_COPY(arg2, 1);
4142 
4143  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4144 }
4145 
4146 Datum
4148 {
4149  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4150  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4151  int len1,
4152  len2;
4153  int cmp;
4154 
4155  len1 = VARSIZE_ANY_EXHDR(arg1);
4156  len2 = VARSIZE_ANY_EXHDR(arg2);
4157 
4158  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4159 
4160  PG_FREE_IF_COPY(arg1, 0);
4161  PG_FREE_IF_COPY(arg2, 1);
4162 
4163  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4164 }
4165 
4166 Datum
4168 {
4169  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4170  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4171  int len1,
4172  len2;
4173  int cmp;
4174 
4175  len1 = VARSIZE_ANY_EXHDR(arg1);
4176  len2 = VARSIZE_ANY_EXHDR(arg2);
4177 
4178  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4179  if ((cmp == 0) && (len1 != len2))
4180  cmp = (len1 < len2) ? -1 : 1;
4181 
4182  PG_FREE_IF_COPY(arg1, 0);
4183  PG_FREE_IF_COPY(arg2, 1);
4184 
4185  PG_RETURN_INT32(cmp);
4186 }
4187 
4188 Datum
4190 {
4192  MemoryContext oldcontext;
4193 
4194  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4195 
4196  /* Use generic string SortSupport, forcing "C" collation */
4197  varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4198 
4199  MemoryContextSwitchTo(oldcontext);
4200 
4201  PG_RETURN_VOID();
4202 }
4203 
4204 /*
4205  * appendStringInfoText
4206  *
4207  * Append a text to str.
4208  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4209  */
4210 static void
4212 {
4214 }
4215 
4216 /*
4217  * replace_text
4218  * replace all occurrences of 'old_sub_str' in 'orig_str'
4219  * with 'new_sub_str' to form 'new_str'
4220  *
4221  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4222  * otherwise returns 'new_str'
4223  */
4224 Datum
4226 {
4227  text *src_text = PG_GETARG_TEXT_PP(0);
4228  text *from_sub_text = PG_GETARG_TEXT_PP(1);
4229  text *to_sub_text = PG_GETARG_TEXT_PP(2);
4230  int src_text_len;
4231  int from_sub_text_len;
4233  text *ret_text;
4234  int chunk_len;
4235  char *curr_ptr;
4236  char *start_ptr;
4238  bool found;
4239 
4240  src_text_len = VARSIZE_ANY_EXHDR(src_text);
4241  from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4242 
4243  /* Return unmodified source string if empty source or pattern */
4244  if (src_text_len < 1 || from_sub_text_len < 1)
4245  {
4246  PG_RETURN_TEXT_P(src_text);
4247  }
4248 
4249  text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4250 
4251  found = text_position_next(&state);
4252 
4253  /* When the from_sub_text is not found, there is nothing to do. */
4254  if (!found)
4255  {
4256  text_position_cleanup(&state);
4257  PG_RETURN_TEXT_P(src_text);
4258  }
4259  curr_ptr = text_position_get_match_ptr(&state);
4260  start_ptr = VARDATA_ANY(src_text);
4261 
4262  initStringInfo(&str);
4263 
4264  do
4265  {
4267 
4268  /* copy the data skipped over by last text_position_next() */
4269  chunk_len = curr_ptr - start_ptr;
4270  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4271 
4272  appendStringInfoText(&str, to_sub_text);
4273 
4274  start_ptr = curr_ptr + from_sub_text_len;
4275 
4276  found = text_position_next(&state);
4277  if (found)
4278  curr_ptr = text_position_get_match_ptr(&state);
4279  }
4280  while (found);
4281 
4282  /* copy trailing data */
4283  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4284  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4285 
4286  text_position_cleanup(&state);
4287 
4288  ret_text = cstring_to_text_with_len(str.data, str.len);
4289  pfree(str.data);
4290 
4291  PG_RETURN_TEXT_P(ret_text);
4292 }
4293 
4294 /*
4295  * check_replace_text_has_escape_char
4296  *
4297  * check whether replace_text contains escape char.
4298  */
4299 static bool
4301 {
4302  const char *p = VARDATA_ANY(replace_text);
4303  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4304 
4306  {
4307  for (; p < p_end; p++)
4308  {
4309  if (*p == '\\')
4310  return true;
4311  }
4312  }
4313  else
4314  {
4315  for (; p < p_end; p += pg_mblen(p))
4316  {
4317  if (*p == '\\')
4318  return true;
4319  }
4320  }
4321 
4322  return false;
4323 }
4324 
4325 /*
4326  * appendStringInfoRegexpSubstr
4327  *
4328  * Append replace_text to str, substituting regexp back references for
4329  * \n escapes. start_ptr is the start of the match in the source string,
4330  * at logical character position data_pos.
4331  */
4332 static void
4334  regmatch_t *pmatch,
4335  char *start_ptr, int data_pos)
4336 {
4337  const char *p = VARDATA_ANY(replace_text);
4338  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4339  int eml = pg_database_encoding_max_length();
4340 
4341  for (;;)
4342  {
4343  const char *chunk_start = p;
4344  int so;
4345  int eo;
4346 
4347  /* Find next escape char. */
4348  if (eml == 1)
4349  {
4350  for (; p < p_end && *p != '\\'; p++)
4351  /* nothing */ ;
4352  }
4353  else
4354  {
4355  for (; p < p_end && *p != '\\'; p += pg_mblen(p))
4356  /* nothing */ ;
4357  }
4358 
4359  /* Copy the text we just scanned over, if any. */
4360  if (p > chunk_start)
4361  appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4362 
4363  /* Done if at end of string, else advance over escape char. */
4364  if (p >= p_end)
4365  break;
4366  p++;
4367 
4368  if (p >= p_end)
4369  {
4370  /* Escape at very end of input. Treat same as unexpected char */
4371  appendStringInfoChar(str, '\\');
4372  break;
4373  }
4374 
4375  if (*p >= '1' && *p <= '9')
4376  {
4377  /* Use the back reference of regexp. */
4378  int idx = *p - '0';
4379 
4380  so = pmatch[idx].rm_so;
4381  eo = pmatch[idx].rm_eo;
4382  p++;
4383  }
4384  else if (*p == '&')
4385  {
4386  /* Use the entire matched string. */
4387  so = pmatch[0].rm_so;
4388  eo = pmatch[0].rm_eo;
4389  p++;
4390  }
4391  else if (*p == '\\')
4392  {
4393  /* \\ means transfer one \ to output. */
4394  appendStringInfoChar(str, '\\');
4395  p++;
4396  continue;
4397  }
4398  else
4399  {
4400  /*
4401  * If escape char is not followed by any expected char, just treat
4402  * it as ordinary data to copy. (XXX would it be better to throw
4403  * an error?)
4404  */
4405  appendStringInfoChar(str, '\\');
4406  continue;
4407  }
4408 
4409  if (so != -1 && eo != -1)
4410  {
4411  /*
4412  * Copy the text that is back reference of regexp. Note so and eo
4413  * are counted in characters not bytes.
4414  */
4415  char *chunk_start;
4416  int chunk_len;
4417 
4418  Assert(so >= data_pos);
4419  chunk_start = start_ptr;
4420  chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4421  chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4422  appendBinaryStringInfo(str, chunk_start, chunk_len);
4423  }
4424  }
4425 }
4426 
4427 #define REGEXP_REPLACE_BACKREF_CNT 10
4428 
4429 /*
4430  * replace_text_regexp
4431  *
4432  * replace text that matches to regexp in src_text to replace_text.
4433  *
4434  * Note: to avoid having to include regex.h in builtins.h, we declare
4435  * the regexp argument as void *, but really it's regex_t *.
4436  */
4437 text *
4438 replace_text_regexp(text *src_text, void *regexp,
4439  text *replace_text, bool glob)
4440 {
4441  text *ret_text;
4442  regex_t *re = (regex_t *) regexp;
4443  int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4446  pg_wchar *data;
4447  size_t data_len;
4448  int search_start;
4449  int data_pos;
4450  char *start_ptr;
4451  bool have_escape;
4452 
4453  initStringInfo(&buf);
4454 
4455  /* Convert data string to wide characters. */
4456  data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4457  data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4458 
4459  /* Check whether replace_text has escape char. */
4460  have_escape = check_replace_text_has_escape_char(replace_text);
4461 
4462  /* start_ptr points to the data_pos'th character of src_text */
4463  start_ptr = (char *) VARDATA_ANY(src_text);
4464  data_pos = 0;
4465 
4466  search_start = 0;
4467  while (search_start <= data_len)
4468  {
4469  int regexec_result;
4470 
4472 
4473  regexec_result = pg_regexec(re,
4474  data,
4475  data_len,
4476  search_start,
4477  NULL, /* no details */
4479  pmatch,
4480  0);
4481 
4482  if (regexec_result == REG_NOMATCH)
4483  break;
4484 
4485  if (regexec_result != REG_OKAY)
4486  {
4487  char errMsg[100];
4488 
4490  pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4491  ereport(ERROR,
4492  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4493  errmsg("regular expression failed: %s", errMsg)));
4494  }
4495 
4496  /*
4497  * Copy the text to the left of the match position. Note we are given
4498  * character not byte indexes.
4499  */
4500  if (pmatch[0].rm_so - data_pos > 0)
4501  {
4502  int chunk_len;
4503 
4504  chunk_len = charlen_to_bytelen(start_ptr,
4505  pmatch[0].rm_so - data_pos);
4506  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4507 
4508  /*
4509  * Advance start_ptr over that text, to avoid multiple rescans of
4510  * it if the replace_text contains multiple back-references.
4511  */
4512  start_ptr += chunk_len;
4513  data_pos = pmatch[0].rm_so;
4514  }
4515 
4516  /*
4517  * Copy the replace_text. Process back references when the
4518  * replace_text has escape characters.
4519  */
4520  if (have_escape)
4521  appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4522  start_ptr, data_pos);
4523  else
4524  appendStringInfoText(&buf, replace_text);
4525 
4526  /* Advance start_ptr and data_pos over the matched text. */
4527  start_ptr += charlen_to_bytelen(start_ptr,
4528  pmatch[0].rm_eo - data_pos);
4529  data_pos = pmatch[0].rm_eo;
4530 
4531  /*
4532  * When global option is off, replace the first instance only.
4533  */
4534  if (!glob)
4535  break;
4536 
4537  /*
4538  * Advance search position. Normally we start the next search at the
4539  * end of the previous match; but if the match was of zero length, we
4540  * have to advance by one character, or we'd just find the same match
4541  * again.
4542  */
4543  search_start = data_pos;
4544  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4545  search_start++;
4546  }
4547 
4548  /*
4549  * Copy the text to the right of the last match.
4550  */
4551  if (data_pos < data_len)
4552  {
4553  int chunk_len;
4554 
4555  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4556  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4557  }
4558 
4559  ret_text = cstring_to_text_with_len(buf.data, buf.len);
4560  pfree(buf.data);
4561  pfree(data);
4562 
4563  return ret_text;
4564 }
4565 
4566 /*
4567  * split_text
4568  * parse input string
4569  * return ord item (1 based)
4570  * based on provided field separator
4571  */
4572 Datum
4574 {
4575  text *inputstring = PG_GETARG_TEXT_PP(0);
4576  text *fldsep = PG_GETARG_TEXT_PP(1);
4577  int fldnum = PG_GETARG_INT32(2);
4578  int inputstring_len;
4579  int fldsep_len;
4581  char *start_ptr;
4582  char *end_ptr;
4583  text *result_text;
4584  bool found;
4585 
4586  /* field number is 1 based */
4587  if (fldnum < 1)
4588  ereport(ERROR,
4589  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4590  errmsg("field position must be greater than zero")));
4591 
4592  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4593  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4594 
4595  /* return empty string for empty input string */
4596  if (inputstring_len < 1)
4598 
4599  /* empty field separator */
4600  if (fldsep_len < 1)
4601  {
4602  text_position_cleanup(&state);
4603  /* if first field, return input string, else empty string */
4604  if (fldnum == 1)
4605  PG_RETURN_TEXT_P(inputstring);
4606  else
4608  }
4609 
4610  text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4611 
4612  /* identify bounds of first field */
4613  start_ptr = VARDATA_ANY(inputstring);
4614  found = text_position_next(&state);
4615 
4616  /* special case if fldsep not found at all */
4617  if (!found)
4618  {
4619  text_position_cleanup(&state);
4620  /* if field 1 requested, return input string, else empty string */
4621  if (fldnum == 1)
4622  PG_RETURN_TEXT_P(inputstring);
4623  else
4625  }
4626  end_ptr = text_position_get_match_ptr(&state);
4627 
4628  while (found && --fldnum > 0)
4629  {
4630  /* identify bounds of next field */
4631  start_ptr = end_ptr + fldsep_len;
4632  found = text_position_next(&state);
4633  if (found)
4634  end_ptr = text_position_get_match_ptr(&state);
4635  }
4636 
4637  text_position_cleanup(&state);
4638 
4639  if (fldnum > 0)
4640  {
4641  /* N'th field separator not found */
4642  /* if last field requested, return it, else empty string */
4643  if (fldnum == 1)
4644  {
4645  int last_len = start_ptr - VARDATA_ANY(inputstring);
4646 
4647  result_text = cstring_to_text_with_len(start_ptr,
4648  inputstring_len - last_len);
4649  }
4650  else
4651  result_text = cstring_to_text("");
4652  }
4653  else
4654  {
4655  /* non-last field requested */
4656  result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4657  }
4658 
4659  PG_RETURN_TEXT_P(result_text);
4660 }
4661 
4662 /*
4663  * Convenience function to return true when two text params are equal.
4664  */
4665 static bool
4666 text_isequal(text *txt1, text *txt2, Oid collid)
4667 {
4669  collid,
4670  PointerGetDatum(txt1),
4671  PointerGetDatum(txt2)));
4672 }
4673 
4674 /*
4675  * text_to_array
4676  * parse input string and return text array of elements,
4677  * based on provided field separator
4678  */
4679 Datum
4681 {
4682  return text_to_array_internal(fcinfo);
4683 }
4684 
4685 /*
4686  * text_to_array_null
4687  * parse input string and return text array of elements,
4688  * based on provided field separator and null string
4689  *
4690  * This is a separate entry point only to prevent the regression tests from
4691  * complaining about different argument sets for the same internal function.
4692  */
4693 Datum
4695 {
4696  return text_to_array_internal(fcinfo);
4697 }
4698 
4699 /*
4700  * common code for text_to_array and text_to_array_null functions
4701  *
4702  * These are not strict so we have to test for null inputs explicitly.
4703  */
4704 static Datum
4706 {
4707  text *inputstring;
4708  text *fldsep;
4709  text *null_string;
4710  int inputstring_len;
4711  int fldsep_len;
4712  char *start_ptr;
4713  text *result_text;
4714  bool is_null;
4715  ArrayBuildState *astate = NULL;
4716 
4717  /* when input string is NULL, then result is NULL too */
4718  if (PG_ARGISNULL(0))
4719  PG_RETURN_NULL();
4720 
4721  inputstring = PG_GETARG_TEXT_PP(0);
4722 
4723  /* fldsep can be NULL */
4724  if (!PG_ARGISNULL(1))
4725  fldsep = PG_GETARG_TEXT_PP(1);
4726  else
4727  fldsep = NULL;
4728 
4729  /* null_string can be NULL or omitted */
4730  if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4731  null_string = PG_GETARG_TEXT_PP(2);
4732  else
4733  null_string = NULL;
4734 
4735  if (fldsep != NULL)
4736  {
4737  /*
4738  * Normal case with non-null fldsep. Use the text_position machinery
4739  * to search for occurrences of fldsep.
4740  */
4742 
4743  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4744  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4745 
4746  /* return empty array for empty input string */
4747  if (inputstring_len < 1)
4749 
4750  /*
4751  * empty field separator: return the input string as a one-element
4752  * array
4753  */
4754  if (fldsep_len < 1)
4755  {
4756  Datum elems[1];
4757  bool nulls[1];
4758  int dims[1];
4759  int lbs[1];
4760 
4761  /* single element can be a NULL too */
4762  is_null = null_string ? text_isequal(inputstring, null_string, PG_GET_COLLATION()) : false;
4763 
4764  elems[0] = PointerGetDatum(inputstring);
4765  nulls[0] = is_null;
4766  dims[0] = 1;
4767  lbs[0] = 1;
4768  /* XXX: this hardcodes assumptions about the text type */
4770  1, dims, lbs,
4771  TEXTOID, -1, false, TYPALIGN_INT));
4772  }
4773 
4774  text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4775 
4776  start_ptr = VARDATA_ANY(inputstring);
4777 
4778  for (;;)
4779  {
4780  bool found;
4781  char *end_ptr;
4782  int chunk_len;
4783 
4785 
4786  found = text_position_next(&state);
4787  if (!found)
4788  {
4789  /* fetch last field */
4790  chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4791  end_ptr = NULL; /* not used, but some compilers complain */
4792  }
4793  else
4794  {
4795  /* fetch non-last field */
4796  end_ptr = text_position_get_match_ptr(&state);
4797  chunk_len = end_ptr - start_ptr;
4798  }
4799 
4800  /* must build a temp text datum to pass to accumArrayResult */
4801  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4802  is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
4803 
4804  /* stash away this field */
4805  astate = accumArrayResult(astate,
4806  PointerGetDatum(result_text),
4807  is_null,
4808  TEXTOID,
4810 
4811  pfree(result_text);
4812 
4813  if (!found)
4814  break;
4815 
4816  start_ptr = end_ptr + fldsep_len;
4817  }
4818 
4819  text_position_cleanup(&state);
4820  }
4821  else
4822  {
4823  /*
4824  * When fldsep is NULL, each character in the inputstring becomes an
4825  * element in the result array. The separator is effectively the
4826  * space between characters.
4827  */
4828  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4829 
4830  /* return empty array for empty input string */
4831  if (inputstring_len < 1)
4833 
4834  start_ptr = VARDATA_ANY(inputstring);
4835 
4836  while (inputstring_len > 0)
4837  {
4838  int chunk_len = pg_mblen(start_ptr);
4839 
4841 
4842  /* must build a temp text datum to pass to accumArrayResult */
4843  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4844  is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
4845 
4846  /* stash away this field */
4847  astate = accumArrayResult(astate,
4848  PointerGetDatum(result_text),
4849  is_null,
4850  TEXTOID,
4852 
4853  pfree(result_text);
4854 
4855  start_ptr += chunk_len;
4856  inputstring_len -= chunk_len;
4857  }
4858  }
4859 
4862 }
4863 
4864 /*
4865  * array_to_text
4866  * concatenate Cstring representation of input array elements
4867  * using provided field separator
4868  */
4869 Datum
4871 {
4873  char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4874 
4875  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4876 }
4877 
4878 /*
4879  * array_to_text_null
4880  * concatenate Cstring representation of input array elements
4881  * using provided field separator and null string
4882  *
4883  * This version is not strict so we have to test for null inputs explicitly.
4884  */
4885 Datum
4887 {
4888  ArrayType *v;
4889  char *fldsep;
4890  char *null_string;
4891 
4892  /* returns NULL when first or second parameter is NULL */
4893  if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4894  PG_RETURN_NULL();
4895 
4896  v = PG_GETARG_ARRAYTYPE_P(0);
4897  fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4898 
4899  /* NULL null string is passed through as a null pointer */
4900  if (!PG_ARGISNULL(2))
4901  null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4902  else
4903  null_string = NULL;
4904 
4905  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4906 }
4907 
4908 /*
4909  * common code for array_to_text and array_to_text_null functions
4910  */
4911 static text *
4913  const char *fldsep, const char *null_string)
4914 {
4915  text *result;
4916  int nitems,
4917  *dims,
4918  ndims;
4919  Oid element_type;
4920  int typlen;
4921  bool typbyval;
4922  char typalign;
4924  bool printed = false;
4925  char *p;
4926  bits8 *bitmap;
4927  int bitmask;
4928  int i;
4929  ArrayMetaState *my_extra;
4930 
4931  ndims = ARR_NDIM(v);
4932  dims = ARR_DIMS(v);
4933  nitems = ArrayGetNItems(ndims, dims);
4934 
4935  /* if there are no elements, return an empty string */
4936  if (nitems == 0)
4937  return cstring_to_text_with_len("", 0);
4938 
4939  element_type = ARR_ELEMTYPE(v);
4940  initStringInfo(&buf);
4941 
4942  /*
4943  * We arrange to look up info about element type, including its output
4944  * conversion proc, only once per series of calls, assuming the element
4945  * type doesn't change underneath us.
4946  */
4947  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4948  if (my_extra == NULL)
4949  {
4950  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4951  sizeof(ArrayMetaState));
4952  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4953  my_extra->element_type = ~element_type;
4954  }
4955 
4956  if (my_extra->element_type != element_type)
4957  {
4958  /*
4959  * Get info about element type, including its output conversion proc
4960  */
4961  get_type_io_data(element_type, IOFunc_output,
4962  &my_extra->typlen, &my_extra->typbyval,
4963  &my_extra->typalign, &my_extra->typdelim,
4964  &my_extra->typioparam, &my_extra->typiofunc);
4965  fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4966  fcinfo->flinfo->fn_mcxt);
4967  my_extra->element_type = element_type;
4968  }
4969  typlen = my_extra->typlen;
4970  typbyval = my_extra->typbyval;
4971  typalign = my_extra->typalign;
4972 
4973  p = ARR_DATA_PTR(v);
4974  bitmap = ARR_NULLBITMAP(v);
4975  bitmask = 1;
4976 
4977  for (i = 0; i < nitems; i++)
4978  {
4979  Datum itemvalue;
4980  char *value;
4981 
4982  /* Get source element, checking for NULL */
4983  if (bitmap && (*bitmap & bitmask) == 0)
4984  {
4985  /* if null_string is NULL, we just ignore null elements */
4986  if (null_string != NULL)
4987  {
4988  if (printed)
4989  appendStringInfo(&buf, "%s%s", fldsep, null_string);
4990  else
4991  appendStringInfoString(&buf, null_string);
4992  printed = true;
4993  }
4994  }
4995  else
4996  {
4997  itemvalue = fetch_att(p, typbyval, typlen);
4998 
4999  value = OutputFunctionCall(&my_extra->proc, itemvalue);
5000 
5001  if (printed)
5002  appendStringInfo(&buf, "%s%s", fldsep, value);
5003  else
5004  appendStringInfoString(&buf, value);
5005  printed = true;
5006 
5007  p = att_addlength_pointer(p, typlen, p);
5008  p = (char *) att_align_nominal(p, typalign);
5009  }
5010 
5011  /* advance bitmap pointer if any */
5012  if (bitmap)
5013  {
5014  bitmask <<= 1;
5015  if (bitmask == 0x100)
5016  {
5017  bitmap++;
5018  bitmask = 1;
5019  }
5020  }
5021  }
5022 
5023  result = cstring_to_text_with_len(buf.data, buf.len);
5024  pfree(buf.data);
5025 
5026  return result;
5027 }
5028 
5029 #define HEXBASE 16
5030 /*
5031  * Convert an int32 to a string containing a base 16 (hex) representation of
5032  * the number.
5033  */
5034 Datum
5036 {
5038  char *ptr;
5039  const char *digits = "0123456789abcdef";
5040  char buf[32]; /* bigger than needed, but reasonable */
5041 
5042  ptr = buf + sizeof(buf) - 1;
5043  *ptr = '\0';
5044 
5045  do
5046  {
5047  *--ptr = digits[value % HEXBASE];
5048  value /= HEXBASE;
5049  } while (ptr > buf && value);
5050 
5052 }
5053 
5054 /*
5055  * Convert an int64 to a string containing a base 16 (hex) representation of
5056  * the number.
5057  */
5058 Datum
5060 {
5061  uint64 value = (uint64) PG_GETARG_INT64(0);
5062  char *ptr;
5063  const char *digits = "0123456789abcdef";
5064  char buf[32]; /* bigger than needed, but reasonable */
5065 
5066  ptr = buf + sizeof(buf) - 1;
5067  *ptr = '\0';
5068 
5069  do
5070  {
5071  *--ptr = digits[value % HEXBASE];
5072  value /= HEXBASE;
5073  } while (ptr > buf && value);
5074 
5076 }
5077 
5078 /*
5079  * Return the size of a datum, possibly compressed
5080  *
5081  * Works on any data type
5082  */
5083 Datum
5085 {
5087  int32 result;
5088  int typlen;
5089 
5090  /* On first call, get the input type's typlen, and save at *fn_extra */
5091  if (fcinfo->flinfo->fn_extra == NULL)
5092  {
5093  /* Lookup the datatype of the supplied argument */
5094  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5095 
5096  typlen = get_typlen(argtypeid);
5097  if (typlen == 0) /* should not happen */
5098  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5099 
5100  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5101  sizeof(int));
5102  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5103  }
5104  else
5105  typlen = *((int *) fcinfo->flinfo->fn_extra);
5106 
5107  if (typlen == -1)
5108  {
5109  /* varlena type, possibly toasted */
5110  result = toast_datum_size(value);
5111  }
5112  else if (typlen == -2)
5113  {
5114  /* cstring */
5115  result = strlen(DatumGetCString(value)) + 1;
5116  }
5117  else
5118  {
5119  /* ordinary fixed-width type */
5120  result = typlen;
5121  }
5122 
5123  PG_RETURN_INT32(result);
5124 }
5125 
5126 /*
5127  * string_agg - Concatenates values and returns string.
5128  *
5129  * Syntax: string_agg(value text, delimiter text) RETURNS text
5130  *
5131  * Note: Any NULL values are ignored. The first-call delimiter isn't
5132  * actually used at all, and on subsequent calls the delimiter precedes
5133  * the associated value.
5134  */
5135 
5136 /* subroutine to initialize state */
5137 static StringInfo
5139 {
5140  StringInfo state;
5141  MemoryContext aggcontext;
5142  MemoryContext oldcontext;
5143 
5144  if (!AggCheckCallContext(fcinfo, &aggcontext))
5145  {
5146  /* cannot be called directly because of internal-type argument */
5147  elog(ERROR, "string_agg_transfn called in non-aggregate context");
5148  }
5149 
5150  /*
5151  * Create state in aggregate context. It'll stay there across subsequent
5152  * calls.
5153  */
5154  oldcontext = MemoryContextSwitchTo(aggcontext);
5155  state = makeStringInfo();
5156  MemoryContextSwitchTo(oldcontext);
5157 
5158  return state;
5159 }
5160 
5161 Datum
5163 {
5164  StringInfo state;
5165 
5166  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5167 
5168  /* Append the value unless null. */
5169  if (!PG_ARGISNULL(1))
5170  {
5171  /* On the first time through, we ignore the delimiter. */
5172  if (state == NULL)
5173  state = makeStringAggState(fcinfo);
5174  else if (!PG_ARGISNULL(2))
5175  appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
5176 
5177  appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
5178  }
5179 
5180  /*
5181  * The transition type for string_agg() is declared to be "internal",
5182  * which is a pass-by-value type the same size as a pointer.
5183  */
5184  PG_RETURN_POINTER(state);
5185 }
5186 
5187 Datum
5189 {
5190  StringInfo state;
5191 
5192  /* cannot be called directly because of internal-type argument */
5193  Assert(AggCheckCallContext(fcinfo, NULL));
5194 
5195  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5196 
5197  if (state != NULL)
5199  else
5200  PG_RETURN_NULL();
5201 }
5202 
5203 /*
5204  * Prepare cache with fmgr info for the output functions of the datatypes of
5205  * the arguments of a concat-like function, beginning with argument "argidx".
5206  * (Arguments before that will have corresponding slots in the resulting
5207  * FmgrInfo array, but we don't fill those slots.)
5208  */
5209 static FmgrInfo *
5211 {
5212  FmgrInfo *foutcache;
5213  int i;
5214 
5215  /* We keep the info in fn_mcxt so it survives across calls */
5216  foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5217  PG_NARGS() * sizeof(FmgrInfo));
5218 
5219  for (i = argidx; i < PG_NARGS(); i++)
5220  {
5221  Oid valtype;
5222  Oid typOutput;
5223  bool typIsVarlena;
5224 
5225  valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5226  if (!OidIsValid(valtype))
5227  elog(ERROR, "could not determine data type of concat() input");
5228 
5229  getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5230  fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5231  }
5232 
5233  fcinfo->flinfo->fn_extra = foutcache;
5234 
5235  return foutcache;
5236 }
5237 
5238 /*
5239  * Implementation of both concat() and concat_ws().
5240  *
5241  * sepstr is the separator string to place between values.
5242  * argidx identifies the first argument to concatenate (counting from zero);
5243  * note that this must be constant across any one series of calls.
5244  *
5245  * Returns NULL if result should be NULL, else text value.
5246  */
5247 static text *
5248 concat_internal(const char *sepstr, int argidx,
5249  FunctionCallInfo fcinfo)
5250 {
5251  text *result;
5253  FmgrInfo *foutcache;
5254  bool first_arg = true;
5255  int i;
5256 
5257  /*
5258  * concat(VARIADIC some-array) is essentially equivalent to
5259  * array_to_text(), ie concat the array elements with the given separator.
5260  * So we just pass the case off to that code.
5261  */
5262  if (get_fn_expr_variadic(fcinfo->flinfo))
5263  {
5264  ArrayType *arr;
5265 
5266  /* Should have just the one argument */
5267  Assert(argidx == PG_NARGS() - 1);
5268 
5269  /* concat(VARIADIC NULL) is defined as NULL */
5270  if (PG_ARGISNULL(argidx))
5271  return NULL;
5272 
5273  /*
5274  * Non-null argument had better be an array. We assume that any call
5275  * context that could let get_fn_expr_variadic return true will have
5276  * checked that a VARIADIC-labeled parameter actually is an array. So
5277  * it should be okay to just Assert that it's an array rather than
5278  * doing a full-fledged error check.
5279  */
5281 
5282  /* OK, safe to fetch the array value */
5283  arr = PG_GETARG_ARRAYTYPE_P(argidx);
5284 
5285  /*
5286  * And serialize the array. We tell array_to_text to ignore null
5287  * elements, which matches the behavior of the loop below.
5288  */
5289  return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5290  }
5291 
5292  /* Normal case without explicit VARIADIC marker */
5293  initStringInfo(&str);
5294 
5295  /* Get output function info, building it if first time through */
5296  foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5297  if (foutcache == NULL)
5298  foutcache = build_concat_foutcache(fcinfo, argidx);
5299 
5300  for (i = argidx; i < PG_NARGS(); i++)
5301  {
5302  if (!PG_ARGISNULL(i))
5303  {
5305 
5306  /* add separator if appropriate */
5307  if (first_arg)
5308  first_arg = false;
5309  else
5310  appendStringInfoString(&str, sepstr);
5311 
5312  /* call the appropriate type output function, append the result */
5314  OutputFunctionCall(&foutcache[i], value));
5315  }
5316  }
5317 
5318  result = cstring_to_text_with_len(str.data, str.len);
5319  pfree(str.data);
5320 
5321  return result;
5322 }
5323 
5324 /*
5325  * Concatenate all arguments. NULL arguments are ignored.
5326  */
5327 Datum
5329 {
5330  text *result;
5331 
5332  result = concat_internal("", 0, fcinfo);
5333  if (result == NULL)
5334  PG_RETURN_NULL();
5335  PG_RETURN_TEXT_P(result);
5336 }
5337 
5338 /*
5339  * Concatenate all but first argument value with separators. The first
5340  * parameter is used as the separator. NULL arguments are ignored.
5341  */
5342 Datum
5344 {
5345  char *sep;
5346  text *result;
5347 
5348  /* return NULL when separator is NULL */
5349  if (PG_ARGISNULL(0))
5350  PG_RETURN_NULL();
5352 
5353  result = concat_internal(sep, 1, fcinfo);
5354  if (result == NULL)
5355  PG_RETURN_NULL();
5356  PG_RETURN_TEXT_P(result);
5357 }
5358 
5359 /*
5360  * Return first n characters in the string. When n is negative,
5361  * return all but last |n| characters.
5362  */
5363 Datum
5365 {
5366  int n = PG_GETARG_INT32(1);
5367 
5368  if (n < 0)
5369  {
5370  text *str = PG_GETARG_TEXT_PP(0);
5371  const char *p = VARDATA_ANY(str);
5372  int len = VARSIZE_ANY_EXHDR(str);
5373  int rlen;
5374 
5375  n = pg_mbstrlen_with_len(p, len) + n;
5376  rlen = pg_mbcharcliplen(p, len, n);
5378  }
5379  else
5381 }
5382 
5383 /*
5384  * Return last n characters in the string. When n is negative,
5385  * return all but first |n| characters.
5386  */
5387 Datum
5389 {
5390  text *str = PG_GETARG_TEXT_PP(0);
5391  const char *p = VARDATA_ANY(str);
5392  int len = VARSIZE_ANY_EXHDR(str);
5393  int n = PG_GETARG_INT32(1);
5394  int off;
5395 
5396  if (n < 0)
5397  n = -n;
5398  else
5399  n = pg_mbstrlen_with_len(p, len) - n;
5400  off = pg_mbcharcliplen(p, len, n);
5401 
5402  PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5403 }
5404 
5405 /*
5406  * Return reversed string
5407  */
5408 Datum
5410 {
5411  text *str = PG_GETARG_TEXT_PP(0);
5412  const char *p = VARDATA_ANY(str);
5413  int len = VARSIZE_ANY_EXHDR(str);
5414  const char *endp = p + len;
5415  text *result;
5416  char *dst;
5417 
5418  result = palloc(len + VARHDRSZ);
5419  dst = (char *) VARDATA(result) + len;
5420  SET_VARSIZE(result, len + VARHDRSZ);
5421 
5423  {
5424  /* multibyte version */
5425  while (p < endp)
5426  {
5427  int sz;
5428 
5429  sz = pg_mblen(p);
5430  dst -= sz;
5431  memcpy(dst, p, sz);
5432  p += sz;
5433  }
5434  }
5435  else
5436  {
5437  /* single byte version */
5438  while (p < endp)
5439  *(--dst) = *p++;
5440  }
5441 
5442  PG_RETURN_TEXT_P(result);
5443 }
5444 
5445 
/*
 * Support macros for text_format()
 */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * Step "ptr" forward by one character.  If that brings it to or past
 * "end_ptr", raise an "unterminated format() type specifier" error instead
 * of returning.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5459 
5460 /*
5461  * Returns a formatted string
5462  */
5463 Datum
5465 {
5466  text *fmt;
5468  const char *cp;
5469  const char *start_ptr;
5470  const char *end_ptr;
5471  text *result;
5472  int arg;
5473  bool funcvariadic;
5474  int nargs;
5475  Datum *elements = NULL;
5476  bool *nulls = NULL;
5477  Oid element_type = InvalidOid;
5478  Oid prev_type = InvalidOid;
5479  Oid prev_width_type = InvalidOid;
5480  FmgrInfo typoutputfinfo;
5481  FmgrInfo typoutputinfo_width;
5482 
5483  /* When format string is null, immediately return null */
5484  if (PG_ARGISNULL(0))
5485  PG_RETURN_NULL();
5486 
5487  /* If argument is marked VARIADIC, expand array into elements */
5488  if (get_fn_expr_variadic(fcinfo->flinfo))
5489  {
5490  ArrayType *arr;
5491  int16 elmlen;
5492  bool elmbyval;
5493  char elmalign;
5494  int nitems;
5495 
5496  /* Should have just the one argument */
5497  Assert(PG_NARGS() == 2);
5498 
5499  /* If argument is NULL, we treat it as zero-length array */
5500  if (PG_ARGISNULL(1))
5501  nitems = 0;
5502  else
5503  {
5504  /*
5505  * Non-null argument had better be an array. We assume that any
5506  * call context that could let get_fn_expr_variadic return true
5507  * will have checked that a VARIADIC-labeled parameter actually is
5508  * an array. So it should be okay to just Assert that it's an
5509  * array rather than doing a full-fledged error check.
5510  */
5512 
5513  /* OK, safe to fetch the array value */
5514  arr = PG_GETARG_ARRAYTYPE_P(1);
5515 
5516  /* Get info about array element type */
5517  element_type = ARR_ELEMTYPE(arr);
5518  get_typlenbyvalalign(element_type,
5519  &elmlen, &elmbyval, &elmalign);
5520 
5521  /* Extract all array elements */
5522  deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5523  &elements, &nulls, &nitems);
5524  }
5525 
5526  nargs = nitems + 1;
5527  funcvariadic = true;
5528  }
5529  else
5530  {
5531  /* Non-variadic case, we'll process the arguments individually */
5532  nargs = PG_NARGS();
5533  funcvariadic = false;
5534  }
5535 
5536  /* Setup for main loop. */
5537  fmt = PG_GETARG_TEXT_PP(0);
5538  start_ptr = VARDATA_ANY(fmt);
5539  end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5540  initStringInfo(&str);
5541  arg = 1; /* next argument position to print */
5542 
5543  /* Scan format string, looking for conversion specifiers. */
5544  for (cp = start_ptr; cp < end_ptr; cp++)
5545  {
5546  int argpos;
5547  int widthpos;
5548  int flags;
5549  int width;
5550  Datum value;
5551  bool isNull;
5552  Oid typid;
5553 
5554  /*
5555  * If it's not the start of a conversion specifier, just copy it to
5556  * the output buffer.
5557  */
5558  if (*cp != '%')
5559  {
5560  appendStringInfoCharMacro(&str, *cp);
5561  continue;
5562  }
5563 
5564  ADVANCE_PARSE_POINTER(cp, end_ptr);
5565 
5566  /* Easy case: %% outputs a single % */
5567  if (*cp == '%')
5568  {
5569  appendStringInfoCharMacro(&str, *cp);
5570  continue;
5571  }
5572 
5573  /* Parse the optional portions of the format specifier */
5574  cp = text_format_parse_format(cp, end_ptr,
5575  &argpos, &widthpos,
5576  &flags, &width);
5577 
5578  /*
5579  * Next we should see the main conversion specifier. Whether or not
5580  * an argument position was present, it's known that at least one
5581  * character remains in the string at this point. Experience suggests
5582  * that it's worth checking that that character is one of the expected
5583  * ones before we try to fetch arguments, so as to produce the least
5584  * confusing response to a mis-formatted specifier.
5585  */
5586  if (strchr("sIL", *cp) == NULL)
5587  ereport(ERROR,
5588  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5589  errmsg("unrecognized format() type specifier \"%c\"",
5590  *cp),
5591  errhint("For a single \"%%\" use \"%%%%\".")));
5592 
5593  /* If indirect width was specified, get its value */
5594  if (widthpos >= 0)
5595  {
5596  /* Collect the specified or next argument position */
5597  if (widthpos > 0)
5598  arg = widthpos;
5599  if (arg >= nargs)
5600  ereport(ERROR,
5601  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5602  errmsg("too few arguments for format()")));
5603 
5604  /* Get the value and type of the selected argument */
5605  if (!funcvariadic)
5606  {
5607  value = PG_GETARG_DATUM(arg);
5608  isNull = PG_ARGISNULL(arg);
5609  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5610  }
5611  else
5612  {
5613  value = elements[arg - 1];
5614  isNull = nulls[arg - 1];
5615  typid = element_type;
5616  }
5617  if (!OidIsValid(typid))
5618  elog(ERROR, "could not determine data type of format() input");
5619 
5620  arg++;
5621 
5622  /* We can treat NULL width the same as zero */
5623  if (isNull)
5624  width = 0;
5625  else if (typid == INT4OID)
5626  width = DatumGetInt32(value);
5627  else if (typid == INT2OID)
5628  width = DatumGetInt16(value);
5629  else
5630  {
5631  /* For less-usual datatypes, convert to text then to int */
5632  char *str;
5633 
5634  if (typid != prev_width_type)
5635  {
5636  Oid typoutputfunc;
5637  bool typIsVarlena;
5638 
5639  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5640  fmgr_info(typoutputfunc, &typoutputinfo_width);
5641  prev_width_type = typid;
5642  }
5643 
5644  str = OutputFunctionCall(&typoutputinfo_width, value);
5645 
5646  /* pg_strtoint32 will complain about bad data or overflow */
5647  width = pg_strtoint32(str);
5648 
5649  pfree(str);
5650  }
5651  }
5652 
5653  /* Collect the specified or next argument position */
5654  if (argpos > 0)
5655  arg = argpos;
5656  if (arg >= nargs)
5657  ereport(ERROR,
5658  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5659  errmsg("too few arguments for format()")));
5660 
5661  /* Get the value and type of the selected argument */
5662  if (!funcvariadic)
5663  {
5664  value = PG_GETARG_DATUM(arg);
5665  isNull = PG_ARGISNULL(arg);
5666  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5667  }
5668  else
5669  {
5670  value = elements[arg - 1];
5671  isNull = nulls[arg - 1];
5672  typid = element_type;
5673  }
5674  if (!OidIsValid(typid))
5675  elog(ERROR, "could not determine data type of format() input");
5676 
5677  arg++;
5678 
5679  /*
5680  * Get the appropriate typOutput function, reusing previous one if
5681  * same type as previous argument. That's particularly useful in the
5682  * variadic-array case, but often saves work even for ordinary calls.
5683  */
5684  if (typid != prev_type)
5685  {
5686  Oid typoutputfunc;
5687  bool typIsVarlena;
5688 
5689  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5690  fmgr_info(typoutputfunc, &typoutputfinfo);
5691  prev_type = typid;
5692  }
5693 
5694  /*
5695  * And now we can format the value.
5696  */
5697  switch (*cp)
5698  {
5699  case 's':
5700  case 'I':
5701  case 'L':
5702  text_format_string_conversion(&str, *cp, &typoutputfinfo,
5703  value, isNull,
5704  flags, width);
5705  break;
5706  default:
5707  /* should not get here, because of previous check */
5708  ereport(ERROR,
5709  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5710  errmsg("unrecognized format() type specifier \"%c\"",
5711  *cp),
5712  errhint("For a single \"%%\" use \"%%%%\".")));
5713  break;
5714  }
5715  }
5716 
5717  /* Don't need deconstruct_array results anymore. */
5718  if (elements != NULL)
5719  pfree(elements);
5720  if (nulls != NULL)
5721  pfree(nulls);
5722 
5723  /* Generate results. */
5724  result = cstring_to_text_with_len(str.data, str.len);
5725  pfree(str.data);
5726 
5727  PG_RETURN_TEXT_P(result);
5728 }
5729 
5730 /*
5731  * Parse contiguous digits as a decimal number.
5732  *
5733  * Returns true if some digits could be parsed.
5734  * The value is returned into *value, and *ptr is advanced to the next
5735  * character to be parsed.
5736  *
5737  * Note parsing invariant: at least one character is known available before
5738  * string end (end_ptr) at entry, and this is still true at exit.
5739  */
5740 static bool
5741 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5742 {
5743  bool found = false;
5744  const char *cp = *ptr;
5745  int val = 0;
5746 
5747  while (*cp >= '0' && *cp <= '9')
5748  {
5749  int8 digit = (*cp - '0');
5750 
5751  if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5752  unlikely(pg_add_s32_overflow(val, digit, &val)))
5753  ereport(ERROR,
5754  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5755  errmsg("number is out of range")));
5756  ADVANCE_PARSE_POINTER(cp, end_ptr);
5757  found = true;
5758  }
5759 
5760  *ptr = cp;
5761  *value = val;
5762 
5763  return found;
5764 }
5765 
5766 /*
5767  * Parse a format specifier (generally following the SUS printf spec).
5768  *
5769  * We have already advanced over the initial '%', and we are looking for
5770  * [argpos][flags][width]type (but the type character is not consumed here).
5771  *
5772  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5773  * Output parameters:
5774  * argpos: argument position for value to be printed. -1 means unspecified.
5775  * widthpos: argument position for width. Zero means the argument position
5776  * was unspecified (ie, take the next arg) and -1 means no width
5777  * argument (width was omitted or specified as a constant).
5778  * flags: bitmask of flags.
5779  * width: directly-specified width value. Zero means the width was omitted
5780  * (note it's not necessary to distinguish this case from an explicit
5781  * zero width value).
5782  *
5783  * The function result is the next character position to be parsed, ie, the
5784  * location where the type character is/should be.
5785  *
5786  * Note parsing invariant: at least one character is known available before
5787  * string end (end_ptr) at entry, and this is still true at exit.
5788  */
5789 static const char *
5790 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5791  int *argpos, int *widthpos,
5792  int *flags, int *width)
5793 {
5794  const char *cp = start_ptr;
5795  int n;
5796 
5797  /* set defaults for output parameters */
5798  *argpos = -1;
5799  *widthpos = -1;
5800  *flags = 0;
5801  *width = 0;
5802 
5803  /* try to identify first number */
5804  if (text_format_parse_digits(&cp, end_ptr, &n))
5805  {
5806  if (*cp != '$')
5807  {
5808  /* Must be just a width and a type, so we're done */
5809  *width = n;
5810  return cp;
5811  }
5812  /* The number was argument position */
5813  *argpos = n;
5814  /* Explicit 0 for argument index is immediately refused */
5815  if (n == 0)
5816  ereport(ERROR,
5817  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5818  errmsg("format specifies argument 0, but arguments are numbered from 1")));
5819  ADVANCE_PARSE_POINTER(cp, end_ptr);
5820  }
5821 
5822  /* Handle flags (only minus is supported now) */
5823  while (*cp == '-')
5824  {
5825  *flags |= TEXT_FORMAT_FLAG_MINUS;
5826  ADVANCE_PARSE_POINTER(cp, end_ptr);
5827  }
5828 
5829  if (*cp == '*')
5830  {
5831  /* Handle indirect width */
5832  ADVANCE_PARSE_POINTER(cp, end_ptr);
5833  if (text_format_parse_digits(&cp, end_ptr, &n))
5834  {
5835  /* number in this position must be closed by $ */
5836  if (*cp != '$')
5837  ereport(ERROR,
5838  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5839  errmsg("width argument position must be ended by \"$\"")));
5840  /* The number was width argument position */
5841  *widthpos = n;
5842  /* Explicit 0 for argument index is immediately refused */
5843  if (n == 0)
5844  ereport(ERROR,
5845  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5846  errmsg("format specifies argument 0, but arguments are numbered from 1")));
5847  ADVANCE_PARSE_POINTER(cp, end_ptr);
5848  }
5849  else
5850  *widthpos = 0; /* width's argument position is unspecified */
5851  }
5852  else
5853  {
5854  /* Check for direct width specification */
5855  if (text_format_parse_digits(&cp, end_ptr, &n))
5856  *width = n;
5857  }
5858 
5859  /* cp should now be pointing at type character */
5860  return cp;
5861 }
5862 
5863 /*
5864  * Format a %s, %I, or %L conversion
5865  */
5866 static void
5868  FmgrInfo *typOutputInfo,
5869  Datum value, bool isNull,
5870  int flags, int width)
5871 {
5872  char *str;
5873 
5874  /* Handle NULL arguments before trying to stringify the value. */
5875  if (isNull)
5876  {
5877  if (conversion == 's')
5878  text_format_append_string(buf, "", flags, width);
5879  else if (conversion == 'L')
5880  text_format_append_string(buf, "NULL", flags, width);
5881  else if (conversion == 'I')
5882  ereport(ERROR,
5883  (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5884  errmsg("null values cannot be formatted as an SQL identifier")));
5885  return;
5886  }
5887 
5888  /* Stringify. */
5889  str = OutputFunctionCall(typOutputInfo, value);
5890 
5891  /* Escape. */
5892  if (conversion == 'I')
5893  {
5894  /* quote_identifier may or may not allocate a new string. */
5895  text_format_append_string(buf, quote_identifier(str), flags, width);
5896  }
5897  else if (conversion == 'L')
5898  {
5899  char *qstr = quote_literal_cstr(str);
5900 
5901  text_format_append_string(buf, qstr, flags, width);
5902  /* quote_literal_cstr() always allocates a new string */
5903  pfree(qstr);
5904  }
5905  else
5906  text_format_append_string(buf, str, flags, width);
5907 
5908  /* Cleanup. */
5909  pfree(str);
5910 }
5911 
5912 /*
5913  * Append str to buf, padding as directed by flags/width
5914  */
5915 static void
5917  int flags, int width)
5918 {
5919  bool align_to_left = false;
5920  int len;
5921 
5922  /* fast path for typical easy case */
5923  if (width == 0)
5924  {
5925  appendStringInfoString(buf, str);
5926  return;
5927  }
5928 
5929  if (width < 0)
5930  {
5931  /* Negative width: implicit '-' flag, then take absolute value */
5932  align_to_left = true;
5933  /* -INT_MIN is undefined */
5934  if (width <= INT_MIN)
5935  ereport(ERROR,
5936  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5937  errmsg("number is out of range")));
5938  width = -width;
5939  }
5940  else if (flags & TEXT_FORMAT_FLAG_MINUS)
5941  align_to_left = true;
5942 
5943  len = pg_mbstrlen(str);
5944  if (align_to_left)
5945  {
5946  /* left justify */
5947  appendStringInfoString(buf, str);
5948  if (len < width)
5949  appendStringInfoSpaces(buf, width - len);
5950  }
5951  else
5952  {
5953  /* right justify */
5954  if (len < width)
5955  appendStringInfoSpaces(buf, width - len);
5956  appendStringInfoString(buf, str);
5957  }
5958 }
5959 
5960 /*
5961  * text_format_nv - nonvariadic wrapper for text_format function.
5962  *
5963  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
5964  * which checks that all built-in functions that share the implementing C
5965  * function take the same number of arguments.
5966  */
5967 Datum
5969 {
5970  return text_format(fcinfo);
5971 }
5972 
/*
 * Helper for the Levenshtein distance functions: report whether the first
 * "len" bytes of s1 and s2 are identical.  The bytes are compared starting
 * from the last one and working backwards.  For this use case that is
 * faster than memcmp().
 *
 * A len of zero (or less) compares nothing and yields true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	for (int i = len - 1; i >= 0; i--)
	{
		if (s1[i] != s2[i])
			return false;
	}

	return true;
}
5988 
5989 /* Expand each Levenshtein distance variant */
5990 #include "levenshtein.c"
5991 #define LEVENSHTEIN_LESS_EQUAL
5992 #include "levenshtein.c"
5993 
5994 
5995 /*
5996  * Unicode support
5997  */
5998 
6000 unicode_norm_form_from_string(const char *formstr)
6001 {
6002  UnicodeNormalizationForm form = -1;
6003 
6004  /*
6005  * Might as well check this while we're here.
6006  */
6007  if (GetDatabaseEncoding() != PG_UTF8)
6008  ereport(ERROR,
6009  (errcode(ERRCODE_SYNTAX_ERROR),
6010  errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6011 
6012  if (pg_strcasecmp(formstr, "NFC") == 0)
6013  form = UNICODE_NFC;
6014  else if (pg_strcasecmp(formstr, "NFD") == 0)
6015  form = UNICODE_NFD;
6016  else if (pg_strcasecmp(formstr, "NFKC") == 0)
6017  form = UNICODE_NFKC;
6018  else if (pg_strcasecmp(formstr, "NFKD") == 0)
6019  form = UNICODE_NFKD;
6020  else
6021  ereport(ERROR,
6022  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6023  errmsg("invalid normalization form: %s", formstr)));
6024 
6025  return form;
6026 }
6027 
6028 Datum
6030 {
6031  text *input = PG_GETARG_TEXT_PP(0);
6032  char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6034  int size;
6035  pg_wchar *input_chars;
6036  pg_wchar *output_chars;
6037  unsigned char *p;
6038  text *result;
6039  int i;
6040 
6041  form = unicode_norm_form_from_string(formstr);
6042 
6043  /* convert to pg_wchar */
6044  size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6045  input_chars = palloc((size + 1) * sizeof(pg_wchar));
6046  p = (unsigned char *) VARDATA_ANY(input);
6047  for (i = 0; i < size; i++)
6048  {
6049  input_chars[i] = utf8_to_unicode(p);
6050  p += pg_utf_mblen(p);
6051  }
6052  input_chars[i] = (pg_wchar) '\0';
6053  Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6054 
6055  /* action */
6056  output_chars = unicode_normalize(form, input_chars);
6057 
6058  /* convert back to UTF-8 string */
6059  size = 0;
6060  for (pg_wchar *wp = output_chars; *wp; wp++)
6061  {
6062  unsigned char buf[4];
6063 
6064  unicode_to_utf8(*wp, buf);
6065  size += pg_utf_mblen(buf);
6066  }
6067 
6068  result = palloc(size + VARHDRSZ);
6069  SET_VARSIZE(result, size + VARHDRSZ);
6070 
6071  p = (unsigned char *) VARDATA_ANY(result);
6072  for (pg_wchar *wp = output_chars; *wp; wp++)
6073  {
6074  unicode_to_utf8(*wp, p);
6075  p += pg_utf_mblen(p);
6076  }
6077  Assert((char *) p == (char *) result + size + VARHDRSZ);
6078 
6079  PG_RETURN_TEXT_P(result);
6080 }
6081 
6082 /*
6083  * Check whether the string is in the specified Unicode normalization form.
6084  *
6085  * This is done by convering the string to the specified normal form and then
6086  * comparing that to the original string. To speed that up, we also apply the
6087  * "quick check" algorithm specified in UAX #15, which can give a yes or no
6088  * answer for many strings by just scanning the string once.
6089  *
6090  * This function should generally be optimized for the case where the string
6091  * is in fact normalized. In that case, we'll end up looking at the entire
6092  * string, so it's probably not worth doing any incremental conversion etc.
6093  */
6094 Datum
6096 {
6097  text *input = PG_GETARG_TEXT_PP(0);
6098  char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6100  int size;
6101  pg_wchar *input_chars;
6102  pg_wchar *output_chars;
6103  unsigned char *p;
6104  int i;
6105  UnicodeNormalizationQC quickcheck;
6106  int output_size;
6107  bool result;
6108 
6109  form = unicode_norm_form_from_string(formstr);
6110 
6111  /* convert to pg_wchar */
6112  size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6113  input_chars = palloc((size + 1) * sizeof(pg_wchar));
6114  p = (unsigned char *) VARDATA_ANY(input);
6115  for (i = 0; i < size; i++)
6116  {
6117  input_chars[i] = utf8_to_unicode(p);
6118  p += pg_utf_mblen(p);
6119  }
6120  input_chars[i] = (pg_wchar) '\0';
6121  Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6122 
6123  /* quick check (see UAX #15) */
6124  quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6125  if (quickcheck == UNICODE_NORM_QC_YES)
6126  PG_RETURN_BOOL(true);
6127  else if (quickcheck == UNICODE_NORM_QC_NO)
6128  PG_RETURN_BOOL(false);
6129 
6130  /* normalize and compare with original */
6131  output_chars = unicode_normalize(form, input_chars);
6132 
6133  output_size = 0;
6134  for (pg_wchar *wp = output_chars; *wp; wp++)
6135  output_size++;
6136 
6137  result = (size == output_size) &&
6138  (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6139 
6140  PG_RETURN_BOOL(result);
6141 }
Datum bttext_pattern_cmp(PG_FUNCTION_ARGS)
Definition: varlena.c:3131
#define PG_CACHE_LINE_SIZE
Datum text_to_array(PG_FUNCTION_ARGS)
Definition: varlena.c:4680
Datum bytea_substr_no_len(PG_FUNCTION_ARGS)
Definition: varlena.c:3267
struct SortSupportData * SortSupport
Definition: sortsupport.h:58
Value * makeString(char *str)
Definition: value.c:53
signed short int16
Definition: c.h:354
Datum byteaout(PG_FUNCTION_ARGS)
Definition: varlena.c:374
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:360
#define DatumGetUInt32(X)
Definition: postgres.h:486
#define NIL
Definition: pg_list.h:65
Datum text_format(PG_FUNCTION_ARGS)
Definition: varlena.c:5464
static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
Definition: varlena.c:2178
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269
#define ADVANCE_PARSE_POINTER(ptr, end_ptr)
Definition: varlena.c:5451
Definition: fmgr.h:56
text * replace_text_regexp(text *src_text, void *regexp, text *replace_text, bool glob)
Definition: varlena.c:4438
#define VARATT_IS_COMPRESSED(PTR)
Definition: postgres.h:312
Datum byteaSetBit(PG_FUNCTION_ARGS)
Definition: varlena.c:3537
int pg_mbcharcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:1009
Datum split_text(PG_FUNCTION_ARGS)
Definition: varlena.c:4573
pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: wchar.c:686
int errhint(const char *fmt,...)
Definition: elog.c:1071
Datum textoverlay_no_len(PG_FUNCTION_ARGS)
Definition: varlena.c:1056
void getTypeOutputInfo(Oid type, Oid *typOutput, bool *typIsVarlena)
Definition: lsyscache.c:2784
#define VARDATA_ANY(PTR)
Definition: postgres.h:348
#define VARDATA(PTR)
Definition: postgres.h:302
char * quote_literal_cstr(const char *rawstr)
Definition: quote.c:102
Datum namegetext(PG_FUNCTION_ARGS)
Definition: varlena.c:3004
unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: wchar.c:483
MemoryContext fn_mcxt
Definition: fmgr.h:65
#define att_align_nominal(cur_offset, attalign)
Definition: tupmacs.h:148
const char * quote_identifier(const char *ident)
Definition: ruleutils.c:10737
Datum text_lt(PG_FUNCTION_ARGS)
Definition: varlena.c:1837
Datum text_pattern_le(PG_FUNCTION_ARGS)
Definition: varlena.c:3083
#define DatumGetTextPSlice(X, m, n)
Definition: fmgr.h:303
#define DatumGetInt32(X)
Definition: postgres.h:472
static bool pg_mul_s32_overflow(int32 a, int32 b, int32 *result)
Definition: int.h:140
Datum namelttext(PG_FUNCTION_ARGS)
Definition: varlena.c:2986
Datum text_pattern_gt(PG_FUNCTION_ARGS)
Definition: varlena.c:3115
#define HEXBASE
Definition: varlena.c:5029
char * refpoint
Definition: varlena.c:72
#define VARSIZE(PTR)
Definition: postgres.h:303
Datum replace_text(PG_FUNCTION_ARGS)
Definition: varlena.c:4225
Datum byteagt(PG_FUNCTION_ARGS)
Definition: