PostgreSQL Source Code  git master
varlena.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  * Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/detoast.h"
21 #include "catalog/pg_collation.h"
22 #include "catalog/pg_type.h"
23 #include "common/hashfn.h"
24 #include "common/int.h"
25 #include "common/unicode_norm.h"
26 #include "lib/hyperloglog.h"
27 #include "libpq/pqformat.h"
28 #include "miscadmin.h"
29 #include "nodes/execnodes.h"
30 #include "parser/scansup.h"
31 #include "port/pg_bswap.h"
32 #include "regex/regex.h"
33 #include "utils/builtins.h"
34 #include "utils/bytea.h"
35 #include "utils/lsyscache.h"
36 #include "utils/memutils.h"
37 #include "utils/pg_locale.h"
38 #include "utils/sortsupport.h"
39 #include "utils/varlena.h"
40 
41 
42 /* GUC variable */
44 
/*
 * Local aliases for the generic variable-length header, struct varlena.
 * Both names denote the same underlying type.
 */
typedef struct varlena unknown;
typedef struct varlena VarString;
47 
48 /*
49  * State for text_position_* functions.
50  */
51 typedef struct
52 {
53  bool is_multibyte; /* T if multibyte encoding */
55 
56  char *str1; /* haystack string */
57  char *str2; /* needle string */
58  int len1; /* string lengths in bytes */
59  int len2;
60 
61  /* Skip table for Boyer-Moore-Horspool search algorithm: */
62  int skiptablemask; /* mask for ANDing with skiptable subscripts */
63  int skiptable[256]; /* skip distance for given mismatched char */
64 
65  char *last_match; /* pointer to last match in 'str1' */
66 
67  /*
68  * Sometimes we need to convert the byte position of a match to a
69  * character position. These store the last position that was converted,
70  * so that on the next call, we can continue from that point, rather than
71  * count characters from the very beginning.
72  */
73  char *refpoint; /* pointer within original haystack string */
74  int refpos; /* 0-based character offset of the same point */
76 
77 typedef struct
78 {
79  char *buf1; /* 1st string, or abbreviation original string
80  * buf */
81  char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
82  int buflen1;
83  int buflen2;
84  int last_len1; /* Length of last buf1 string/strxfrm() input */
85  int last_len2; /* Length of last buf2 string/strxfrm() blob */
86  int last_returned; /* Last comparison result (cache) */
87  bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
88  bool collate_c;
89  Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
90  hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
91  hyperLogLogState full_card; /* Full key cardinality state */
92  double prop_card; /* Required cardinality proportion */
95 
96 /*
97  * Output data for split_text(): we output either to an array or a table.
98  * tupstore and tupdesc must be set up in advance to output to a table.
99  */
100 typedef struct
101 {
106 
/*
 * Buffer size chosen so that most strings fit, while remaining small enough
 * that a buffer of this size can comfortably live on the stack.
 */
#define TEXTBUFLEN		1024
112 
/*
 * Accessor macros for the "unknown" alias of struct varlena.  Each one simply
 * wraps the corresponding generic macro and casts the result where needed.
 */
#define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
#define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
#define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)

/*
 * Accessor macros for VarString; the "PP" form goes through
 * PG_DETOAST_DATUM_PACKED instead of PG_DETOAST_DATUM.
 */
#define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
121 
122 static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
123 static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
124 static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
125 static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
126 static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
127 static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
128 static int varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
129 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
130 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
131 static int32 text_length(Datum str);
132 static text *text_catenate(text *t1, text *t2);
133 static text *text_substring(Datum str,
134  int32 start,
135  int32 length,
136  bool length_not_specified);
137 static text *text_overlay(text *t1, text *t2, int sp, int sl);
138 static int text_position(text *t1, text *t2, Oid collid);
139 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
141 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
145 static void check_collation_set(Oid collid);
146 static int text_cmp(text *arg1, text *arg2, Oid collid);
147 static bytea *bytea_catenate(bytea *t1, bytea *t2);
149  int S,
150  int L,
151  bool length_not_specified);
152 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
153 static void appendStringInfoText(StringInfo str, const text *t);
154 static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
155 static void split_text_accum_result(SplitTextOutputData *tstate,
156  text *field_value,
157  text *null_string,
158  Oid collation);
160  const char *fldsep, const char *null_string);
162 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
163  int *value);
164 static const char *text_format_parse_format(const char *start_ptr,
165  const char *end_ptr,
166  int *argpos, int *widthpos,
167  int *flags, int *width);
168 static void text_format_string_conversion(StringInfo buf, char conversion,
169  FmgrInfo *typOutputInfo,
170  Datum value, bool isNull,
171  int flags, int width);
172 static void text_format_append_string(StringInfo buf, const char *str,
173  int flags, int width);
174 
175 
176 /*****************************************************************************
177  * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
178  *****************************************************************************/
179 
180 /*
181  * cstring_to_text
182  *
183  * Create a text value from a null-terminated C string.
184  *
185  * The new text value is freshly palloc'd with a full-size VARHDR.
186  */
187 text *
188 cstring_to_text(const char *s)
189 {
190  return cstring_to_text_with_len(s, strlen(s));
191 }
192 
193 /*
194  * cstring_to_text_with_len
195  *
196  * Same as cstring_to_text except the caller specifies the string length;
197  * the string need not be null_terminated.
198  */
199 text *
200 cstring_to_text_with_len(const char *s, int len)
201 {
202  text *result = (text *) palloc(len + VARHDRSZ);
203 
204  SET_VARSIZE(result, len + VARHDRSZ);
205  memcpy(VARDATA(result), s, len);
206 
207  return result;
208 }
209 
210 /*
211  * text_to_cstring
212  *
213  * Create a palloc'd, null-terminated C string from a text value.
214  *
215  * We support being passed a compressed or toasted text value.
216  * This is a bit bogus since such values shouldn't really be referred to as
217  * "text *", but it seems useful for robustness. If we didn't handle that
218  * case here, we'd need another routine that did, anyway.
219  */
220 char *
222 {
223  /* must cast away the const, unfortunately */
224  text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
225  int len = VARSIZE_ANY_EXHDR(tunpacked);
226  char *result;
227 
228  result = (char *) palloc(len + 1);
229  memcpy(result, VARDATA_ANY(tunpacked), len);
230  result[len] = '\0';
231 
232  if (tunpacked != t)
233  pfree(tunpacked);
234 
235  return result;
236 }
237 
238 /*
239  * text_to_cstring_buffer
240  *
241  * Copy a text value into a caller-supplied buffer of size dst_len.
242  *
243  * The text string is truncated if necessary to fit. The result is
244  * guaranteed null-terminated (unless dst_len == 0).
245  *
246  * We support being passed a compressed or toasted text value.
247  * This is a bit bogus since such values shouldn't really be referred to as
248  * "text *", but it seems useful for robustness. If we didn't handle that
249  * case here, we'd need another routine that did, anyway.
250  */
251 void
252 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
253 {
254  /* must cast away the const, unfortunately */
255  text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
256  size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
257 
258  if (dst_len > 0)
259  {
260  dst_len--;
261  if (dst_len >= src_len)
262  dst_len = src_len;
263  else /* ensure truncation is encoding-safe */
264  dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
265  memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
266  dst[dst_len] = '\0';
267  }
268 
269  if (srcunpacked != src)
270  pfree(srcunpacked);
271 }
272 
273 
274 /*****************************************************************************
275  * USER I/O ROUTINES *
276  *****************************************************************************/
277 
278 
/*
 * VAL: numeric value of an ASCII digit character.
 * DIG: ASCII digit character for a small numeric value.
 *
 * The bytea escape-format routines use these for the octal digits of
 * '\nnn' sequences.
 */
#define VAL(CH)			((CH) - '0')
#define DIG(N)			((N) + '0')
281 
282 /*
283  * byteain - converts from printable representation of byte array
284  *
285  * Non-printable characters must be passed as '\nnn' (octal) and are
286  * converted to internal form. '\' must be passed as '\\'.
287  * ereport(ERROR, ...) if bad form.
288  *
289  * BUGS:
290  * The input is scanned twice.
291  * The error checking of input is minimal.
292  */
293 Datum
295 {
296  char *inputText = PG_GETARG_CSTRING(0);
297  char *tp;
298  char *rp;
299  int bc;
300  bytea *result;
301 
302  /* Recognize hex input */
303  if (inputText[0] == '\\' && inputText[1] == 'x')
304  {
305  size_t len = strlen(inputText);
306 
307  bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
308  result = palloc(bc);
309  bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
310  SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
311 
312  PG_RETURN_BYTEA_P(result);
313  }
314 
315  /* Else, it's the traditional escaped style */
316  for (bc = 0, tp = inputText; *tp != '\0'; bc++)
317  {
318  if (tp[0] != '\\')
319  tp++;
320  else if ((tp[0] == '\\') &&
321  (tp[1] >= '0' && tp[1] <= '3') &&
322  (tp[2] >= '0' && tp[2] <= '7') &&
323  (tp[3] >= '0' && tp[3] <= '7'))
324  tp += 4;
325  else if ((tp[0] == '\\') &&
326  (tp[1] == '\\'))
327  tp += 2;
328  else
329  {
330  /*
331  * one backslash, not followed by another or ### valid octal
332  */
333  ereport(ERROR,
334  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
335  errmsg("invalid input syntax for type %s", "bytea")));
336  }
337  }
338 
339  bc += VARHDRSZ;
340 
341  result = (bytea *) palloc(bc);
342  SET_VARSIZE(result, bc);
343 
344  tp = inputText;
345  rp = VARDATA(result);
346  while (*tp != '\0')
347  {
348  if (tp[0] != '\\')
349  *rp++ = *tp++;
350  else if ((tp[0] == '\\') &&
351  (tp[1] >= '0' && tp[1] <= '3') &&
352  (tp[2] >= '0' && tp[2] <= '7') &&
353  (tp[3] >= '0' && tp[3] <= '7'))
354  {
355  bc = VAL(tp[1]);
356  bc <<= 3;
357  bc += VAL(tp[2]);
358  bc <<= 3;
359  *rp++ = bc + VAL(tp[3]);
360 
361  tp += 4;
362  }
363  else if ((tp[0] == '\\') &&
364  (tp[1] == '\\'))
365  {
366  *rp++ = '\\';
367  tp += 2;
368  }
369  else
370  {
371  /*
372  * We should never get here. The first pass should not allow it.
373  */
374  ereport(ERROR,
375  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
376  errmsg("invalid input syntax for type %s", "bytea")));
377  }
378  }
379 
380  PG_RETURN_BYTEA_P(result);
381 }
382 
383 /*
384  * byteaout - converts to printable representation of byte array
385  *
386  * In the traditional escaped format, non-printable characters are
387  * printed as '\nnn' (octal) and '\' as '\\'.
388  */
389 Datum
391 {
392  bytea *vlena = PG_GETARG_BYTEA_PP(0);
393  char *result;
394  char *rp;
395 
397  {
398  /* Print hex format */
399  rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
400  *rp++ = '\\';
401  *rp++ = 'x';
402  rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
403  }
404  else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
405  {
406  /* Print traditional escaped format */
407  char *vp;
408  uint64 len;
409  int i;
410 
411  len = 1; /* empty string has 1 char */
412  vp = VARDATA_ANY(vlena);
413  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
414  {
415  if (*vp == '\\')
416  len += 2;
417  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
418  len += 4;
419  else
420  len++;
421  }
422 
423  /*
424  * In principle len can't overflow uint32 if the input fit in 1GB, but
425  * for safety let's check rather than relying on palloc's internal
426  * check.
427  */
428  if (len > MaxAllocSize)
429  ereport(ERROR,
430  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
431  errmsg_internal("result of bytea output conversion is too large")));
432  rp = result = (char *) palloc(len);
433 
434  vp = VARDATA_ANY(vlena);
435  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
436  {
437  if (*vp == '\\')
438  {
439  *rp++ = '\\';
440  *rp++ = '\\';
441  }
442  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
443  {
444  int val; /* holds unprintable chars */
445 
446  val = *vp;
447  rp[0] = '\\';
448  rp[3] = DIG(val & 07);
449  val >>= 3;
450  rp[2] = DIG(val & 07);
451  val >>= 3;
452  rp[1] = DIG(val & 03);
453  rp += 4;
454  }
455  else
456  *rp++ = *vp;
457  }
458  }
459  else
460  {
461  elog(ERROR, "unrecognized bytea_output setting: %d",
462  bytea_output);
463  rp = result = NULL; /* keep compiler quiet */
464  }
465  *rp = '\0';
466  PG_RETURN_CSTRING(result);
467 }
468 
469 /*
470  * bytearecv - converts external binary format to bytea
471  */
472 Datum
474 {
476  bytea *result;
477  int nbytes;
478 
479  nbytes = buf->len - buf->cursor;
480  result = (bytea *) palloc(nbytes + VARHDRSZ);
481  SET_VARSIZE(result, nbytes + VARHDRSZ);
482  pq_copymsgbytes(buf, VARDATA(result), nbytes);
483  PG_RETURN_BYTEA_P(result);
484 }
485 
486 /*
487  * byteasend - converts bytea to binary format
488  *
489  * This is a special case: just copy the input...
490  */
491 Datum
493 {
494  bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
495 
496  PG_RETURN_BYTEA_P(vlena);
497 }
498 
499 Datum
501 {
503 
504  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
505 
506  /* Append the value unless null. */
507  if (!PG_ARGISNULL(1))
508  {
510 
511  /* On the first time through, we ignore the delimiter. */
512  if (state == NULL)
513  state = makeStringAggState(fcinfo);
514  else if (!PG_ARGISNULL(2))
515  {
516  bytea *delim = PG_GETARG_BYTEA_PP(2);
517 
519  }
520 
522  }
523 
524  /*
525  * The transition type for string_agg() is declared to be "internal",
526  * which is a pass-by-value type the same size as a pointer.
527  */
528  PG_RETURN_POINTER(state);
529 }
530 
531 Datum
533 {
535 
536  /* cannot be called directly because of internal-type argument */
537  Assert(AggCheckCallContext(fcinfo, NULL));
538 
539  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
540 
541  if (state != NULL)
542  {
543  bytea *result;
544 
545  result = (bytea *) palloc(state->len + VARHDRSZ);
546  SET_VARSIZE(result, state->len + VARHDRSZ);
547  memcpy(VARDATA(result), state->data, state->len);
548  PG_RETURN_BYTEA_P(result);
549  }
550  else
551  PG_RETURN_NULL();
552 }
553 
554 /*
555  * textin - converts "..." to internal representation
556  */
557 Datum
559 {
560  char *inputText = PG_GETARG_CSTRING(0);
561 
562  PG_RETURN_TEXT_P(cstring_to_text(inputText));
563 }
564 
565 /*
566  * textout - converts internal representation to "..."
567  */
568 Datum
570 {
571  Datum txt = PG_GETARG_DATUM(0);
572 
574 }
575 
576 /*
577  * textrecv - converts external binary format to text
578  */
579 Datum
581 {
583  text *result;
584  char *str;
585  int nbytes;
586 
587  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
588 
589  result = cstring_to_text_with_len(str, nbytes);
590  pfree(str);
591  PG_RETURN_TEXT_P(result);
592 }
593 
594 /*
595  * textsend - converts text to binary format
596  */
597 Datum
599 {
600  text *t = PG_GETARG_TEXT_PP(0);
602 
603  pq_begintypsend(&buf);
606 }
607 
608 
609 /*
610  * unknownin - converts "..." to internal representation
611  */
612 Datum
614 {
615  char *str = PG_GETARG_CSTRING(0);
616 
617  /* representation is same as cstring */
619 }
620 
621 /*
622  * unknownout - converts internal representation to "..."
623  */
624 Datum
626 {
627  /* representation is same as cstring */
628  char *str = PG_GETARG_CSTRING(0);
629 
631 }
632 
633 /*
634  * unknownrecv - converts external binary format to unknown
635  */
636 Datum
638 {
640  char *str;
641  int nbytes;
642 
643  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
644  /* representation is same as cstring */
645  PG_RETURN_CSTRING(str);
646 }
647 
648 /*
649  * unknownsend - converts unknown to binary format
650  */
651 Datum
653 {
654  /* representation is same as cstring */
655  char *str = PG_GETARG_CSTRING(0);
657 
658  pq_begintypsend(&buf);
659  pq_sendtext(&buf, str, strlen(str));
661 }
662 
663 
664 /* ========== PUBLIC ROUTINES ========== */
665 
666 /*
667  * textlen -
668  * returns the logical length of a text*
669  * (which is less than the VARSIZE of the text*)
670  */
671 Datum
673 {
675 
676  /* try to avoid decompressing argument */
678 }
679 
680 /*
681  * text_length -
682  * Does the real work for textlen()
683  *
684  * This is broken out so it can be called directly by other string processing
685  * functions. Note that the argument is passed as a Datum, to indicate that
686  * it may still be in compressed form. We can avoid decompressing it at all
687  * in some cases.
688  */
689 static int32
691 {
692  /* fastpath when max encoding length is one */
695  else
696  {
697  text *t = DatumGetTextPP(str);
698 
700  VARSIZE_ANY_EXHDR(t)));
701  }
702 }
703 
704 /*
705  * textoctetlen -
706  * returns the physical length of a text*
707  * (which is less than the VARSIZE of the text*)
708  */
709 Datum
711 {
713 
714  /* We need not detoast the input at all */
716 }
717 
718 /*
719  * textcat -
720  * takes two text* and returns a text* that is the concatenation of
721  * the two.
722  *
723  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
724  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
725  * Allocate space for output in all cases.
726  * XXX - thomas 1997-07-10
727  */
728 Datum
730 {
731  text *t1 = PG_GETARG_TEXT_PP(0);
732  text *t2 = PG_GETARG_TEXT_PP(1);
733 
735 }
736 
737 /*
738  * text_catenate
739  * Guts of textcat(), broken out so it can be used by other functions
740  *
741  * Arguments can be in short-header form, but not compressed or out-of-line
742  */
743 static text *
745 {
746  text *result;
747  int len1,
748  len2,
749  len;
750  char *ptr;
751 
752  len1 = VARSIZE_ANY_EXHDR(t1);
753  len2 = VARSIZE_ANY_EXHDR(t2);
754 
755  /* paranoia ... probably should throw error instead? */
756  if (len1 < 0)
757  len1 = 0;
758  if (len2 < 0)
759  len2 = 0;
760 
761  len = len1 + len2 + VARHDRSZ;
762  result = (text *) palloc(len);
763 
764  /* Set size of result string... */
765  SET_VARSIZE(result, len);
766 
767  /* Fill data field of result string... */
768  ptr = VARDATA(result);
769  if (len1 > 0)
770  memcpy(ptr, VARDATA_ANY(t1), len1);
771  if (len2 > 0)
772  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
773 
774  return result;
775 }
776 
777 /*
778  * charlen_to_bytelen()
779  * Compute the number of bytes occupied by n characters starting at *p
780  *
781  * It is caller's responsibility that there actually are n characters;
782  * the string need not be null-terminated.
783  */
784 static int
785 charlen_to_bytelen(const char *p, int n)
786 {
788  {
789  /* Optimization for single-byte encodings */
790  return n;
791  }
792  else
793  {
794  const char *s;
795 
796  for (s = p; n > 0; n--)
797  s += pg_mblen(s);
798 
799  return s - p;
800  }
801 }
802 
803 /*
804  * text_substr()
805  * Return a substring starting at the specified position.
806  * - thomas 1997-12-31
807  *
808  * Input:
809  * - string
810  * - starting position (is one-based)
811  * - string length
812  *
813  * If the starting position is zero or less, then return from the start of the string
814  * adjusting the length to be consistent with the "negative start" per SQL.
815  * If the length is less than zero, return the remaining string.
816  *
817  * Added multibyte support.
818  * - Tatsuo Ishii 1998-4-21
819  * Changed behavior if starting position is less than one to conform to SQL behavior.
820  * Formerly returned the entire string; now returns a portion.
821  * - Thomas Lockhart 1998-12-10
822  * Now uses faster TOAST-slicing interface
823  * - John Gray 2002-02-22
824  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
825  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
826  * error; if E < 1, return '', not entire string). Fixed MB related bug when
827  * S > LC and < LC + 4 sometimes garbage characters are returned.
828  * - Joe Conway 2002-08-10
829  */
830 Datum
832 {
834  PG_GETARG_INT32(1),
835  PG_GETARG_INT32(2),
836  false));
837 }
838 
839 /*
840  * text_substr_no_len -
841  * Wrapper to avoid opr_sanity failure due to
842  * one function accepting a different number of args.
843  */
844 Datum
846 {
848  PG_GETARG_INT32(1),
849  -1, true));
850 }
851 
852 /*
853  * text_substring -
854  * Does the real work for text_substr() and text_substr_no_len()
855  *
856  * This is broken out so it can be called directly by other string processing
857  * functions. Note that the argument is passed as a Datum, to indicate that
858  * it may still be in compressed/toasted form. We can avoid detoasting all
859  * of it in some cases.
860  *
861  * The result is always a freshly palloc'd datum.
862  */
863 static text *
864 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
865 {
867  int32 S = start; /* start position */
868  int32 S1; /* adjusted start position */
869  int32 L1; /* adjusted substring length */
870 
871  /* life is easy if the encoding max length is 1 */
872  if (eml == 1)
873  {
874  S1 = Max(S, 1);
875 
876  if (length_not_specified) /* special case - get length to end of
877  * string */
878  L1 = -1;
879  else
880  {
881  /* end position */
882  int E = S + length;
883 
884  /*
885  * A negative value for L is the only way for the end position to
886  * be before the start. SQL99 says to throw an error.
887  */
888  if (E < S)
889  ereport(ERROR,
890  (errcode(ERRCODE_SUBSTRING_ERROR),
891  errmsg("negative substring length not allowed")));
892 
893  /*
894  * A zero or negative value for the end position can happen if the
895  * start was negative or one. SQL99 says to return a zero-length
896  * string.
897  */
898  if (E < 1)
899  return cstring_to_text("");
900 
901  L1 = E - S1;
902  }
903 
904  /*
905  * If the start position is past the end of the string, SQL99 says to
906  * return a zero-length string -- DatumGetTextPSlice() will do that
907  * for us. Convert to a zero-based starting position for the call.
908  */
909  return DatumGetTextPSlice(str, S1 - 1, L1);
910  }
911  else if (eml > 1)
912  {
913  /*
914  * When encoding max length is > 1, we can't get LC without
915  * detoasting, so we'll grab a conservatively large slice now and go
916  * back later to do the right thing
917  */
918  int32 slice_start;
919  int32 slice_size;
920  int32 slice_strlen;
921  text *slice;
922  int32 E1;
923  int32 i;
924  char *p;
925  char *s;
926  text *ret;
927 
928  /*
929  * if S is past the end of the string, the tuple toaster will return a
930  * zero-length string to us
931  */
932  S1 = Max(S, 1);
933 
934  /*
935  * We need to start at position zero because there is no way to know
936  * in advance which byte offset corresponds to the supplied start
937  * position.
938  */
939  slice_start = 0;
940 
941  if (length_not_specified) /* special case - get length to end of
942  * string */
943  slice_size = L1 = -1;
944  else
945  {
946  int E = S + length;
947 
948  /*
949  * A negative value for L is the only way for the end position to
950  * be before the start. SQL99 says to throw an error.
951  */
952  if (E < S)
953  ereport(ERROR,
954  (errcode(ERRCODE_SUBSTRING_ERROR),
955  errmsg("negative substring length not allowed")));
956 
957  /*
958  * A zero or negative value for the end position can happen if the
959  * start was negative or one. SQL99 says to return a zero-length
960  * string.
961  */
962  if (E < 1)
963  return cstring_to_text("");
964 
965  /*
966  * if E is past the end of the string, the tuple toaster will
967  * truncate the length for us
968  */
969  L1 = E - S1;
970 
971  /*
972  * Total slice size in bytes can't be any longer than the start
973  * position plus substring length times the encoding max length.
974  */
975  slice_size = (S1 + L1) * eml;
976  }
977 
978  /*
979  * If we're working with an untoasted source, no need to do an extra
980  * copying step.
981  */
984  slice = DatumGetTextPSlice(str, slice_start, slice_size);
985  else
986  slice = (text *) DatumGetPointer(str);
987 
988  /* see if we got back an empty string */
989  if (VARSIZE_ANY_EXHDR(slice) == 0)
990  {
991  if (slice != (text *) DatumGetPointer(str))
992  pfree(slice);
993  return cstring_to_text("");
994  }
995 
996  /* Now we can get the actual length of the slice in MB characters */
997  slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
998  VARSIZE_ANY_EXHDR(slice));
999 
1000  /*
1001  * Check that the start position wasn't > slice_strlen. If so, SQL99
1002  * says to return a zero-length string.
1003  */
1004  if (S1 > slice_strlen)
1005  {
1006  if (slice != (text *) DatumGetPointer(str))
1007  pfree(slice);
1008  return cstring_to_text("");
1009  }
1010 
1011  /*
1012  * Adjust L1 and E1 now that we know the slice string length. Again
1013  * remember that S1 is one based, and slice_start is zero based.
1014  */
1015  if (L1 > -1)
1016  E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1017  else
1018  E1 = slice_start + 1 + slice_strlen;
1019 
1020  /*
1021  * Find the start position in the slice; remember S1 is not zero based
1022  */
1023  p = VARDATA_ANY(slice);
1024  for (i = 0; i < S1 - 1; i++)
1025  p += pg_mblen(p);
1026 
1027  /* hang onto a pointer to our start position */
1028  s = p;
1029 
1030  /*
1031  * Count the actual bytes used by the substring of the requested
1032  * length.
1033  */
1034  for (i = S1; i < E1; i++)
1035  p += pg_mblen(p);
1036 
1037  ret = (text *) palloc(VARHDRSZ + (p - s));
1038  SET_VARSIZE(ret, VARHDRSZ + (p - s));
1039  memcpy(VARDATA(ret), s, (p - s));
1040 
1041  if (slice != (text *) DatumGetPointer(str))
1042  pfree(slice);
1043 
1044  return ret;
1045  }
1046  else
1047  elog(ERROR, "invalid backend encoding: encoding max length < 1");
1048 
1049  /* not reached: suppress compiler warning */
1050  return NULL;
1051 }
1052 
1053 /*
1054  * textoverlay
1055  * Replace specified substring of first string with second
1056  *
1057  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1058  * This code is a direct implementation of what the standard says.
1059  */
1060 Datum
1062 {
1063  text *t1 = PG_GETARG_TEXT_PP(0);
1064  text *t2 = PG_GETARG_TEXT_PP(1);
1065  int sp = PG_GETARG_INT32(2); /* substring start position */
1066  int sl = PG_GETARG_INT32(3); /* substring length */
1067 
1068  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1069 }
1070 
1071 Datum
1073 {
1074  text *t1 = PG_GETARG_TEXT_PP(0);
1075  text *t2 = PG_GETARG_TEXT_PP(1);
1076  int sp = PG_GETARG_INT32(2); /* substring start position */
1077  int sl;
1078 
1079  sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1080  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1081 }
1082 
1083 static text *
1084 text_overlay(text *t1, text *t2, int sp, int sl)
1085 {
1086  text *result;
1087  text *s1;
1088  text *s2;
1089  int sp_pl_sl;
1090 
1091  /*
1092  * Check for possible integer-overflow cases. For negative sp, throw a
1093  * "substring length" error because that's what should be expected
1094  * according to the spec's definition of OVERLAY().
1095  */
1096  if (sp <= 0)
1097  ereport(ERROR,
1098  (errcode(ERRCODE_SUBSTRING_ERROR),
1099  errmsg("negative substring length not allowed")));
1100  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1101  ereport(ERROR,
1102  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1103  errmsg("integer out of range")));
1104 
1105  s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1106  s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1107  result = text_catenate(s1, t2);
1108  result = text_catenate(result, s2);
1109 
1110  return result;
1111 }
1112 
1113 /*
1114  * textpos -
1115  * Return the position of the specified substring.
1116  * Implements the SQL POSITION() function.
1117  * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1118  * - thomas 1997-07-27
1119  */
1120 Datum
1122 {
1123  text *str = PG_GETARG_TEXT_PP(0);
1124  text *search_str = PG_GETARG_TEXT_PP(1);
1125 
1126  PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1127 }
1128 
1129 /*
1130  * text_position -
1131  * Does the real work for textpos()
1132  *
1133  * Inputs:
1134  * t1 - string to be searched
1135  * t2 - pattern to match within t1
1136  * Result:
1137  * Character index of the first matched char, starting from 1,
1138  * or 0 if no match.
1139  *
1140  * This is broken out so it can be called directly by other string processing
1141  * functions.
1142  */
1143 static int
1144 text_position(text *t1, text *t2, Oid collid)
1145 {
1147  int result;
1148 
1149  /* Empty needle always matches at position 1 */
1150  if (VARSIZE_ANY_EXHDR(t2) < 1)
1151  return 1;
1152 
1153  /* Otherwise, can't match if haystack is shorter than needle */
1154  if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1155  return 0;
1156 
1157  text_position_setup(t1, t2, collid, &state);
1158  if (!text_position_next(&state))
1159  result = 0;
1160  else
1161  result = text_position_get_match_pos(&state);
1162  text_position_cleanup(&state);
1163  return result;
1164 }
1165 
1166 
1167 /*
1168  * text_position_setup, text_position_next, text_position_cleanup -
1169  * Component steps of text_position()
1170  *
1171  * These are broken out so that a string can be efficiently searched for
1172  * multiple occurrences of the same pattern. text_position_next may be
1173  * called multiple times, and it advances to the next match on each call.
1174  * text_position_get_match_ptr() and text_position_get_match_pos() return
1175  * a pointer or 1-based character position of the last match, respectively.
1176  *
1177  * The "state" variable is normally just a local variable in the caller.
1178  *
1179  * NOTE: text_position_next skips over the matched portion. For example,
1180  * searching for "xx" in "xxx" returns only one match, not two.
1181  */
1182 
1183 static void
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1184) is not visible here.  The body reads t1, t2, collid
 * and state, and text_position() calls text_position_setup(t1, t2, collid,
 * &state) -- confirm the exact declaration against the original file.
 *
 * What the visible body does: checks the collation (raising an error for a
 * nondeterministic one), records the haystack/needle pointers and byte
 * lengths in *state, resets the last-match and reference-point bookkeeping,
 * and, for needles longer than one byte, fills the Boyer-Moore-Horspool
 * skip table.
 */
1185 {
1186  int len1 = VARSIZE_ANY_EXHDR(t1);
1187  int len2 = VARSIZE_ANY_EXHDR(t2);
1188  pg_locale_t mylocale = 0;
1189 
1190  check_collation_set(collid);
1191 
 /* A locale object is looked up only for non-C, non-default collations */
1192  if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1193  mylocale = pg_newlocale_from_collation(collid);
1194 
1195  if (mylocale && !mylocale->deterministic)
1196  ereport(ERROR,
1197  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1198  errmsg("nondeterministic collations are not supported for substring searches")));
1199 
 /* Empty strings are not expected here; text_position() filters them out */
1200  Assert(len1 > 0);
1201  Assert(len2 > 0);
1202 
1203  /*
1204  * Even with a multi-byte encoding, we perform the search using the raw
1205  * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1206  * because in UTF-8 the byte sequence of one character cannot contain
1207  * another character. For other multi-byte encodings, we do the search
1208  * initially as a simple byte search, ignoring multibyte issues, but
1209  * verify afterwards that the match we found is at a character boundary,
1210  * and continue the search if it was a false match.
1211  */
 /*
  * NOTE(review): the condition guarding the first branch below (listing
  * line 1212) is not visible.  Since that branch clears is_multibyte, it
  * presumably tests for a single-byte database encoding -- confirm.
  */
1213  {
1214  state->is_multibyte = false;
1215  state->is_multibyte_char_in_char = false;
1216  }
1217  else if (GetDatabaseEncoding() == PG_UTF8)
1218  {
1219  state->is_multibyte = true;
1220  state->is_multibyte_char_in_char = false;
1221  }
1222  else
1223  {
1224  state->is_multibyte = true;
1225  state->is_multibyte_char_in_char = true;
1226  }
1227 
1228  state->str1 = VARDATA_ANY(t1);
1229  state->str2 = VARDATA_ANY(t2);
1230  state->len1 = len1;
1231  state->len2 = len2;
 /* No match yet; the byte-to-character reference point starts at offset 0 */
1232  state->last_match = NULL;
1233  state->refpoint = state->str1;
1234  state->refpos = 0;
1235 
1236  /*
1237  * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1238  * notes we use the terminology that the "haystack" is the string to be
1239  * searched (t1) and the "needle" is the pattern being sought (t2).
1240  *
1241  * If the needle is empty or bigger than the haystack then there is no
1242  * point in wasting cycles initializing the table. We also choose not to
1243  * use B-M-H for needles of length 1, since the skip table can't possibly
1244  * save anything in that case.
1245  */
1246  if (len1 >= len2 && len2 > 1)
1247  {
1248  int searchlength = len1 - len2;
1249  int skiptablemask;
1250  int last;
1251  int i;
1252  const char *str2 = state->str2;
1253 
1254  /*
1255  * First we must determine how much of the skip table to use. The
1256  * declaration of TextPositionState allows up to 256 elements, but for
1257  * short search problems we don't really want to have to initialize so
1258  * many elements --- it would take too long in comparison to the
1259  * actual search time. So we choose a useful skip table size based on
1260  * the haystack length minus the needle length. The closer the needle
1261  * length is to the haystack length the less useful skipping becomes.
1262  *
1263  * Note: since we use bit-masking to select table elements, the skip
1264  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1265  */
1266  if (searchlength < 16)
1267  skiptablemask = 3;
1268  else if (searchlength < 64)
1269  skiptablemask = 7;
1270  else if (searchlength < 128)
1271  skiptablemask = 15;
1272  else if (searchlength < 512)
1273  skiptablemask = 31;
1274  else if (searchlength < 2048)
1275  skiptablemask = 63;
1276  else if (searchlength < 4096)
1277  skiptablemask = 127;
1278  else
1279  skiptablemask = 255;
1280  state->skiptablemask = skiptablemask;
1281 
1282  /*
1283  * Initialize the skip table. We set all elements to the needle
1284  * length, since this is the correct skip distance for any character
1285  * not found in the needle.
1286  */
1287  for (i = 0; i <= skiptablemask; i++)
1288  state->skiptable[i] = len2;
1289 
1290  /*
1291  * Now examine the needle. For each character except the last one,
1292  * set the corresponding table element to the appropriate skip
1293  * distance. Note that when two characters share the same skip table
1294  * entry, the one later in the needle must determine the skip
1295  * distance.
1296  */
1297  last = len2 - 1;
1298 
 /* Ascending order lets later needle bytes overwrite earlier ones */
1299  for (i = 0; i < last; i++)
1300  state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1301  }
1302 }
1303 
1304 /*
1305  * Advance to the next match, starting from the end of the previous match
1306  * (or the beginning of the string, on first call). Returns true if a match
1307  * is found.
1308  *
1309  * Note that this refuses to match an empty-string needle. Most callers
1310  * will have handled that case specially and we'll never see it here.
1311  */
1312 static bool
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1313) is not visible here.  The body only reads and updates
 * "state", and text_position() calls text_position_next(&state) -- confirm
 * the exact declaration against the original file.
 */
1314 {
1315  int needle_len = state->len2;
1316  char *start_ptr;
1317  char *matchptr;
1318 
1319  if (needle_len <= 0)
1320  return false; /* result for empty pattern */
1321 
1322  /* Start from the point right after the previous match. */
1323  if (state->last_match)
1324  start_ptr = state->last_match + needle_len;
1325  else
1326  start_ptr = state->str1;
1327 
 /* Re-entered with a later start_ptr whenever a byte match is rejected */
1328 retry:
1329  matchptr = text_position_next_internal(start_ptr, state);
1330 
1331  if (!matchptr)
1332  return false;
1333 
1334  /*
1335  * Found a match for the byte sequence. If this is a multibyte encoding,
1336  * where one character's byte sequence can appear inside a longer
1337  * multi-byte character, we need to verify that the match was at a
1338  * character boundary, not in the middle of a multi-byte character.
1339  */
1340  if (state->is_multibyte_char_in_char)
1341  {
1342  /* Walk one character at a time, until we reach the match. */
1343 
1344  /* the search should never move backwards. */
1345  Assert(state->refpoint <= matchptr);
1346 
1347  while (state->refpoint < matchptr)
1348  {
1349  /* step to next character. */
1350  state->refpoint += pg_mblen(state->refpoint);
1351  state->refpos++;
1352 
1353  /*
1354  * If we stepped over the match's start position, then it was a
1355  * false positive, where the byte sequence appeared in the middle
1356  * of a multi-byte character. Skip it, and continue the search at
1357  * the next character boundary.
1358  */
1359  if (state->refpoint > matchptr)
1360  {
1361  start_ptr = state->refpoint;
1362  goto retry;
1363  }
1364  }
1365  }
1366 
 /* Remember the match so the next call resumes just past it */
1367  state->last_match = matchptr;
1368  return true;
1369 }
1370 
1371 /*
1372  * Subroutine of text_position_next(). This searches for the raw byte
1373  * sequence, ignoring any multi-byte encoding issues. Returns the first
1374  * match starting at 'start_ptr', or NULL if no match is found.
1375  */
1376 static char *
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1377) is not visible here.  The body reads start_ptr and
 * state, and text_position_next() calls
 * text_position_next_internal(start_ptr, state) -- confirm the exact
 * declaration against the original file.
 */
1378 {
1379  int haystack_len = state->len1;
1380  int needle_len = state->len2;
1381  int skiptablemask = state->skiptablemask;
1382  const char *haystack = state->str1;
1383  const char *needle = state->str2;
1384  const char *haystack_end = &haystack[haystack_len];
1385  const char *hptr;
1386 
1387  Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1388 
1389  if (needle_len == 1)
1390  {
1391  /* No point in using B-M-H for a one-character needle */
1392  char nchar = *needle;
1393 
1394  hptr = start_ptr;
1395  while (hptr < haystack_end)
1396  {
1397  if (*hptr == nchar)
1398  return (char *) hptr;
1399  hptr++;
1400  }
1401  }
1402  else
1403  {
1404  const char *needle_last = &needle[needle_len - 1];
1405 
1406  /* Point hptr at the last byte of the first candidate window */
1407  hptr = start_ptr + needle_len - 1;
1408  while (hptr < haystack_end)
1409  {
1410  /* Match the needle scanning *backward* */
1411  const char *nptr;
1412  const char *p;
1413 
1414  nptr = needle_last;
1415  p = hptr;
1416  while (*nptr == *p)
1417  {
1418  /* Matched it all? If so, return a pointer to the match start */
1419  if (nptr == needle)
1420  return (char *) p;
1421  nptr--, p--;
1422  }
1423 
1424  /*
1425  * No match, so use the haystack char at hptr to decide how far to
1426  * advance. If the needle had any occurrence of that character
1427  * (or more precisely, one sharing the same skiptable entry)
1428  * before its last character, then we advance far enough to align
1429  * the last such needle character with that haystack position.
1430  * Otherwise we can advance by the whole needle length.
1431  */
1432  hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1433  }
1434  }
1435 
1436  return 0; /* not found (null pointer) */
1437 }
1438 
1439 /*
1440  * Return a pointer to the current match.
1441  *
1442  * The returned pointer points to the position of the match within the
1443  * original haystack string.
1444  */
1445 static char *
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1446) is not visible here; the body only reads
 * state->last_match.  Presumably text_position_get_match_ptr() -- confirm.
 */
1447 {
1448  return state->last_match;
1449 }
1450 
1451 /*
1452  * Return the offset of the current match.
1453  *
1454  * The offset is in characters, 1-based.
1455  */
1456 static int
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1457) is not visible here.  text_position() calls
 * text_position_get_match_pos(&state) -- confirm the exact declaration.
 */
1458 {
 /* Single-byte case: byte offset and character offset coincide */
1459  if (!state->is_multibyte)
1460  return state->last_match - state->str1 + 1;
1461  else
1462  {
1463  /* Convert the byte position to char position. */
 /*
  * The walk resumes from the saved refpoint/refpos pair instead of the
  * start of the haystack, and leaves the pair advanced to the match.
  */
1464  while (state->refpoint < state->last_match)
1465  {
1466  state->refpoint += pg_mblen(state->refpoint);
1467  state->refpos++;
1468  }
1469  Assert(state->refpoint == state->last_match);
1470  return state->refpos + 1;
1471  }
1472 }
1473 
1474 static void
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1475) is not visible here.  text_position() calls
 * text_position_cleanup(&state) as the final step of a search; the body is
 * intentionally empty.
 */
1476 {
1477  /* no cleanup needed */
1478 }
1479 
1480 static void
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1481) is not visible here.  Other functions in this file
 * call check_collation_set(collid) and the body reads "collid" -- confirm
 * the exact declaration against the original file.
 *
 * Raises ERRCODE_INDETERMINATE_COLLATION when collid is not a valid OID;
 * otherwise does nothing.
 */
1482 {
1483  if (!OidIsValid(collid))
1484  {
1485  /*
1486  * This typically means that the parser could not resolve a conflict
1487  * of implicit collations, so report it that way.
1488  */
1489  ereport(ERROR,
1490  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1491  errmsg("could not determine which collation to use for string comparison"),
1492  errhint("Use the COLLATE clause to set the collation explicitly.")));
1493  }
1494 }
1495 
1496 /* varstr_cmp()
1497  * Comparison function for text strings with given lengths.
1498  * Includes locale support, but must copy strings to temporary memory
1499  * to allow null-termination for inputs to strcoll().
1500  * Returns an integer less than, equal to, or greater than zero, indicating
1501  * whether arg1 is less than, equal to, or greater than arg2.
1502  *
1503  * Note: many functions that depend on this are marked leakproof; therefore,
1504  * avoid reporting the actual contents of the input when throwing errors.
1505  * All errors herein should be things that can't happen except on corrupt
1506  * data, anyway; otherwise we will have trouble with indexing strings that
1507  * would cause them.
1508  */
1509 int
1510 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1511 {
1512  int result;
1513 
1514  check_collation_set(collid);
1515 
1516  /*
1517  * Unfortunately, there is no strncoll(), so in the non-C locale case we
1518  * have to do some memory copying. This turns out to be significantly
1519  * slower, so we optimize the case where LC_COLLATE is C. We also try to
1520  * optimize relatively-short strings by avoiding palloc/pfree overhead.
1521  */
1522  if (lc_collate_is_c(collid))
1523  {
 /* Bytewise order; on a common prefix the shorter string sorts first */
1524  result = memcmp(arg1, arg2, Min(len1, len2));
1525  if ((result == 0) && (len1 != len2))
1526  result = (len1 < len2) ? -1 : 1;
1527  }
1528  else
1529  {
 /* Stack buffers serve short inputs; longer ones are palloc'd below */
1530  char a1buf[TEXTBUFLEN];
1531  char a2buf[TEXTBUFLEN];
1532  char *a1p,
1533  *a2p;
1534  pg_locale_t mylocale = 0;
1535 
 /* mylocale stays 0 for the default collation */
1536  if (collid != DEFAULT_COLLATION_OID)
1537  mylocale = pg_newlocale_from_collation(collid);
1538 
1539  /*
1540  * memcmp() can't tell us which of two unequal strings sorts first,
1541  * but it's a cheap way to tell if they're equal. Testing shows that
1542  * memcmp() followed by strcoll() is only trivially slower than
1543  * strcoll() by itself, so we don't lose much if this doesn't work out
1544  * very often, and if it does - for example, because there are many
1545  * equal strings in the input - then we win big by avoiding expensive
1546  * collation-aware comparisons.
1547  */
1548  if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1549  return 0;
1550 
1551 #ifdef WIN32
1552  /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1553  if (GetDatabaseEncoding() == PG_UTF8
1554  && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
1555  {
1556  int a1len;
1557  int a2len;
1558  int r;
1559 
 /*
  * a1len/a2len are buffer sizes in bytes: two bytes per input byte
  * plus two for the terminator, or the fixed stack buffer when the
  * input is shorter than TEXTBUFLEN / 2.
  */
1560  if (len1 >= TEXTBUFLEN / 2)
1561  {
1562  a1len = len1 * 2 + 2;
1563  a1p = palloc(a1len);
1564  }
1565  else
1566  {
1567  a1len = TEXTBUFLEN;
1568  a1p = a1buf;
1569  }
1570  if (len2 >= TEXTBUFLEN / 2)
1571  {
1572  a2len = len2 * 2 + 2;
1573  a2p = palloc(a2len);
1574  }
1575  else
1576  {
1577  a2len = TEXTBUFLEN;
1578  a2p = a2buf;
1579  }
1580 
1581  /* stupid Microsloth API does not work for zero-length input */
1582  if (len1 == 0)
1583  r = 0;
1584  else
1585  {
 /* the byte size is halved for the wide-character capacity argument */
1586  r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1587  (LPWSTR) a1p, a1len / 2);
1588  if (!r)
1589  ereport(ERROR,
1590  (errmsg("could not convert string to UTF-16: error code %lu",
1591  GetLastError())));
1592  }
 /* r is used as the index of the wide-character terminator */
1593  ((LPWSTR) a1p)[r] = 0;
1594 
1595  if (len2 == 0)
1596  r = 0;
1597  else
1598  {
1599  r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1600  (LPWSTR) a2p, a2len / 2);
1601  if (!r)
1602  ereport(ERROR,
1603  (errmsg("could not convert string to UTF-16: error code %lu",
1604  GetLastError())));
1605  }
1606  ((LPWSTR) a2p)[r] = 0;
1607 
 /* cleared so that %m in the error below reflects this call only */
1608  errno = 0;
1609 #ifdef HAVE_LOCALE_T
1610  if (mylocale)
1611  result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
1612  else
1613 #endif
1614  result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1615  if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1616  * headers */
1617  ereport(ERROR,
1618  (errmsg("could not compare Unicode strings: %m")));
1619 
1620  /* Break tie if necessary. */
 /* (bytewise on the original inputs, not on the UTF-16 copies) */
1621  if (result == 0 &&
1622  (!mylocale || mylocale->deterministic))
1623  {
1624  result = memcmp(arg1, arg2, Min(len1, len2));
1625  if ((result == 0) && (len1 != len2))
1626  result = (len1 < len2) ? -1 : 1;
1627  }
1628 
 /* release only what was palloc'd, never the stack buffers */
1629  if (a1p != a1buf)
1630  pfree(a1p);
1631  if (a2p != a2buf)
1632  pfree(a2p);
1633 
1634  return result;
1635  }
1636 #endif /* WIN32 */
1637 
 /* One extra byte is reserved for the NUL terminator added below */
1638  if (len1 >= TEXTBUFLEN)
1639  a1p = (char *) palloc(len1 + 1);
1640  else
1641  a1p = a1buf;
1642  if (len2 >= TEXTBUFLEN)
1643  a2p = (char *) palloc(len2 + 1);
1644  else
1645  a2p = a2buf;
1646 
1647  memcpy(a1p, arg1, len1);
1648  a1p[len1] = '\0';
1649  memcpy(a2p, arg2, len2);
1650  a2p[len2] = '\0';
1651 
1652  if (mylocale)
1653  {
1654  if (mylocale->provider == COLLPROVIDER_ICU)
1655  {
 /*
  * The ICU calls below take the original arg1/arg2 with explicit
  * lengths; on this path the NUL-terminated copies are only used
  * by the strcmp() tie-break further down.
  */
1656 #ifdef USE_ICU
1657 #ifdef HAVE_UCOL_STRCOLLUTF8
1658  if (GetDatabaseEncoding() == PG_UTF8)
1659  {
1660  UErrorCode status;
1661 
1662  status = U_ZERO_ERROR;
1663  result = ucol_strcollUTF8(mylocale->info.icu.ucol,
1664  arg1, len1,
1665  arg2, len2,
1666  &status);
1667  if (U_FAILURE(status))
1668  ereport(ERROR,
1669  (errmsg("collation failed: %s", u_errorName(status))));
1670  }
1671  else
1672 #endif
1673  {
1674  int32_t ulen1,
1675  ulen2;
1676  UChar *uchar1,
1677  *uchar2;
1678 
 /* icu_to_uchar() hands back buffers that are pfree'd below */
1679  ulen1 = icu_to_uchar(&uchar1, arg1, len1);
1680  ulen2 = icu_to_uchar(&uchar2, arg2, len2);
1681 
1682  result = ucol_strcoll(mylocale->info.icu.ucol,
1683  uchar1, ulen1,
1684  uchar2, ulen2);
1685 
1686  pfree(uchar1);
1687  pfree(uchar2);
1688  }
1689 #else /* not USE_ICU */
1690  /* shouldn't happen */
1691  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1692 #endif /* not USE_ICU */
1693  }
1694  else
1695  {
1696 #ifdef HAVE_LOCALE_T
1697  result = strcoll_l(a1p, a2p, mylocale->info.lt);
1698 #else
1699  /* shouldn't happen */
1700  elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1701 #endif
1702  }
1703  }
1704  else
1705  result = strcoll(a1p, a2p);
1706 
1707  /* Break tie if necessary. */
 /* (skipped for nondeterministic collations, where 0 stands as equal) */
1708  if (result == 0 &&
1709  (!mylocale || mylocale->deterministic))
1710  result = strcmp(a1p, a2p);
1711 
1712  if (a1p != a1buf)
1713  pfree(a1p);
1714  if (a2p != a2buf)
1715  pfree(a2p);
1716  }
1717 
1718  return result;
1719 }
1720 
1721 /* text_cmp()
1722  * Internal comparison function for text strings.
1723  * Returns -1, 0 or 1
1724  */
1725 static int
1726 text_cmp(text *arg1, text *arg2, Oid collid)
1727 {
1728  char *a1p,
1729  *a2p;
1730  int len1,
1731  len2;
1732 
1733  a1p = VARDATA_ANY(arg1);
1734  a2p = VARDATA_ANY(arg2);
1735 
1736  len1 = VARSIZE_ANY_EXHDR(arg1);
1737  len2 = VARSIZE_ANY_EXHDR(arg2);
1738 
1739  return varstr_cmp(a1p, len1, a2p, len2, collid);
1740 }
1741 
1742 /*
1743  * Comparison functions for text strings.
1744  *
1745  * Note: btree indexes need these routines not to leak memory; therefore,
1746  * be careful to free working copies of toasted datums. Most places don't
1747  * need to be so careful.
1748  */
1749 
1750 Datum
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1751) is not visible here.  A later comment in this file
 * refers to "texteq()", and the body computes an equality result, so this
 * is presumably texteq -- confirm against the original file.
 */
1752 {
1753  Oid collid = PG_GET_COLLATION();
1754  bool result;
1755 
1756  check_collation_set(collid);
1757 
 /*
  * Bytewise equality is used for the C collation, the default collation,
  * and any other deterministic collation; only nondeterministic
  * collations go through text_cmp() in the else branch.
  */
1758  if (lc_collate_is_c(collid) ||
1759  collid == DEFAULT_COLLATION_OID ||
1760  pg_newlocale_from_collation(collid)->deterministic)
1761  {
1762  Datum arg1 = PG_GETARG_DATUM(0);
1763  Datum arg2 = PG_GETARG_DATUM(1);
1764  Size len1,
1765  len2;
1766 
1767  /*
1768  * Since we only care about equality or not-equality, we can avoid all
1769  * the expense of strcoll() here, and just do bitwise comparison. In
1770  * fact, we don't even have to do a bitwise comparison if we can show
1771  * the lengths of the strings are unequal; which might save us from
1772  * having to detoast one or both values.
1773  */
1774  len1 = toast_raw_datum_size(arg1);
1775  len2 = toast_raw_datum_size(arg2);
1776  if (len1 != len2)
1777  result = false;
1778  else
1779  {
1780  text *targ1 = DatumGetTextPP(arg1);
1781  text *targ2 = DatumGetTextPP(arg2);
1782 
 /* the raw size is reduced by VARHDRSZ to get the payload length */
1783  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1784  len1 - VARHDRSZ) == 0);
1785 
1786  PG_FREE_IF_COPY(targ1, 0);
1787  PG_FREE_IF_COPY(targ2, 1);
1788  }
1789  }
1790  else
1791  {
1792  text *arg1 = PG_GETARG_TEXT_PP(0);
1793  text *arg2 = PG_GETARG_TEXT_PP(1);
1794 
1795  result = (text_cmp(arg1, arg2, collid) == 0);
1796 
1797  PG_FREE_IF_COPY(arg1, 0);
1798  PG_FREE_IF_COPY(arg2, 1);
1799  }
1800 
1801  PG_RETURN_BOOL(result);
1802 }
1803 
1804 Datum
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1805) is not visible here.  The body is the inequality
 * counterpart of texteq() (presumably textne) -- confirm against the
 * original file.
 */
1806 {
1807  Oid collid = PG_GET_COLLATION();
1808  bool result;
1809 
1810  check_collation_set(collid);
1811 
 /* Bytewise path for C, default and other deterministic collations */
1812  if (lc_collate_is_c(collid) ||
1813  collid == DEFAULT_COLLATION_OID ||
1814  pg_newlocale_from_collation(collid)->deterministic)
1815  {
1816  Datum arg1 = PG_GETARG_DATUM(0);
1817  Datum arg2 = PG_GETARG_DATUM(1);
1818  Size len1,
1819  len2;
1820 
1821  /* See comment in texteq() */
1822  len1 = toast_raw_datum_size(arg1);
1823  len2 = toast_raw_datum_size(arg2);
 /* differing sizes already prove inequality, so no detoasting is needed */
1824  if (len1 != len2)
1825  result = true;
1826  else
1827  {
1828  text *targ1 = DatumGetTextPP(arg1);
1829  text *targ2 = DatumGetTextPP(arg2);
1830 
1831  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1832  len1 - VARHDRSZ) != 0);
1833 
1834  PG_FREE_IF_COPY(targ1, 0);
1835  PG_FREE_IF_COPY(targ2, 1);
1836  }
1837  }
1838  else
1839  {
1840  text *arg1 = PG_GETARG_TEXT_PP(0);
1841  text *arg2 = PG_GETARG_TEXT_PP(1);
1842 
1843  result = (text_cmp(arg1, arg2, collid) != 0);
1844 
1845  PG_FREE_IF_COPY(arg1, 0);
1846  PG_FREE_IF_COPY(arg2, 1);
1847  }
1848 
1849  PG_RETURN_BOOL(result);
1850 }
1851 
1852 Datum
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1853) is not visible here.  The body is the strict
 * less-than test built on text_cmp(); presumably text_lt -- confirm.
 */
1854 {
1855  text *arg1 = PG_GETARG_TEXT_PP(0);
1856  text *arg2 = PG_GETARG_TEXT_PP(1);
1857  bool result;
1858 
 /* true when arg1 sorts before arg2 under the call's collation */
1859  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1860 
 /* release any copies the argument fetch made */
1861  PG_FREE_IF_COPY(arg1, 0);
1862  PG_FREE_IF_COPY(arg2, 1);
1863 
1864  PG_RETURN_BOOL(result);
1865 }
1866 
1867 Datum
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1868) is not visible here.  The body is the
 * less-than-or-equal test built on text_cmp(); presumably text_le --
 * confirm.
 */
1869 {
1870  text *arg1 = PG_GETARG_TEXT_PP(0);
1871  text *arg2 = PG_GETARG_TEXT_PP(1);
1872  bool result;
1873 
 /* true when arg1 sorts before or equal to arg2 */
1874  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1875 
 /* release any copies the argument fetch made */
1876  PG_FREE_IF_COPY(arg1, 0);
1877  PG_FREE_IF_COPY(arg2, 1);
1878 
1879  PG_RETURN_BOOL(result);
1880 }
1881 
1882 Datum
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1883) is not visible here.  The body is the strict
 * greater-than test built on text_cmp(); presumably text_gt -- confirm.
 */
1884 {
1885  text *arg1 = PG_GETARG_TEXT_PP(0);
1886  text *arg2 = PG_GETARG_TEXT_PP(1);
1887  bool result;
1888 
 /* true when arg1 sorts after arg2 under the call's collation */
1889  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1890 
 /* release any copies the argument fetch made */
1891  PG_FREE_IF_COPY(arg1, 0);
1892  PG_FREE_IF_COPY(arg2, 1);
1893 
1894  PG_RETURN_BOOL(result);
1895 }
1896 
1897 Datum
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1898) is not visible here.  The body is the
 * greater-than-or-equal test built on text_cmp(); presumably text_ge --
 * confirm.
 */
1899 {
1900  text *arg1 = PG_GETARG_TEXT_PP(0);
1901  text *arg2 = PG_GETARG_TEXT_PP(1);
1902  bool result;
1903 
 /* true when arg1 sorts after or equal to arg2 */
1904  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1905 
 /* release any copies the argument fetch made */
1906  PG_FREE_IF_COPY(arg1, 0);
1907  PG_FREE_IF_COPY(arg2, 1);
1908 
1909  PG_RETURN_BOOL(result);
1910 }
1911 
1912 Datum
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1913) is not visible here.  The body tests whether the
 * second argument is a bytewise prefix of the first; presumably
 * text_starts_with -- confirm against the original file.
 */
1914 {
1915  Datum arg1 = PG_GETARG_DATUM(0);
1916  Datum arg2 = PG_GETARG_DATUM(1);
1917  Oid collid = PG_GET_COLLATION();
1918  pg_locale_t mylocale = 0;
1919  bool result;
1920  Size len1,
1921  len2;
1922 
1923  check_collation_set(collid);
1924 
1925  if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1926  mylocale = pg_newlocale_from_collation(collid);
1927 
 /* the comparison below is bytewise, so such collations are rejected */
1928  if (mylocale && !mylocale->deterministic)
1929  ereport(ERROR,
1930  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1931  errmsg("nondeterministic collations are not supported for substring searches")));
1932 
 /* sizes are compared first, before either value is fetched as text */
1933  len1 = toast_raw_datum_size(arg1);
1934  len2 = toast_raw_datum_size(arg2);
1935  if (len2 > len1)
1936  result = false;
1937  else
1938  {
 /*
  * NOTE(review): len2 is a raw datum size, yet it is passed as the
  * length argument of text_substring(); presumably over-fetching is
  * harmless because memcmp() below is bounded by targ2's payload
  * length -- confirm.
  */
1939  text *targ1 = text_substring(arg1, 1, len2, false);
1940  text *targ2 = DatumGetTextPP(arg2);
1941 
1942  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1943  VARSIZE_ANY_EXHDR(targ2)) == 0);
1944 
1945  PG_FREE_IF_COPY(targ1, 0);
1946  PG_FREE_IF_COPY(targ2, 1);
1947  }
1948 
1949  PG_RETURN_BOOL(result);
1950 }
1951 
1952 Datum
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1953) is not visible here.  The body returns the raw
 * three-way result of text_cmp() as an int32; a comment further down in
 * this file names bttextcmp() as the text comparison function invoked by
 * the sort code, so this is presumably bttextcmp -- confirm.
 */
1954 {
1955  text *arg1 = PG_GETARG_TEXT_PP(0);
1956  text *arg2 = PG_GETARG_TEXT_PP(1);
1957  int32 result;
1958 
1959  result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1960 
 /* release any copies the argument fetch made */
1961  PG_FREE_IF_COPY(arg1, 0);
1962  PG_FREE_IF_COPY(arg2, 1);
1963 
1964  PG_RETURN_INT32(result);
1965 }
1966 
1967 Datum
/*
 * NOTE(review): two lines are not visible in this listing: the function
 * name/parameter line (1968) and the line after the opening brace (1970),
 * which presumably declares "ssup", since the body below uses it without a
 * visible declaration.  Presumably bttextsortsupport -- confirm against the
 * original file.
 */
1969 {
1971  Oid collid = ssup->ssup_collation;
1972  MemoryContext oldcontext;
1973 
 /*
  * The setup runs with ssup->ssup_cxt as the current memory context, and
  * the previous context is restored afterwards.
  */
1974  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1975 
1976  /* Use generic string SortSupport */
1977  varstr_sortsupport(ssup, TEXTOID, collid);
1978 
1979  MemoryContextSwitchTo(oldcontext);
1980 
1981  PG_RETURN_VOID();
1982 }
1983 
1984 /*
1985  * Generic sortsupport interface for character type's operator classes.
1986  * Includes locale support, and support for BpChar semantics (i.e. removing
1987  * trailing spaces before comparison).
1988  *
1989  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1990  * same representation. Callers that always use the C collation (e.g.
1991  * non-collatable type callers like bytea) may have NUL bytes in their strings;
1992  * this will not work with any other collation, though.
1993  */
1994 void
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 1995) is not visible here.  The body reads ssup, typid and
 * collid, and is invoked elsewhere in this file as
 * varstr_sortsupport(ssup, TEXTOID, collid) -- confirm the exact
 * declaration against the original file.
 */
1996 {
1997  bool abbreviate = ssup->abbreviate;
1998  bool collate_c = false;
1999  VarStringSortSupport *sss;
2000  pg_locale_t locale = 0;
2001 
2002  check_collation_set(collid);
2003 
2004  /*
2005  * If possible, set ssup->comparator to a function which can be used to
2006  * directly compare two datums. If we can do this, we'll avoid the
2007  * overhead of a trip through the fmgr layer for every comparison, which
2008  * can be substantial.
2009  *
2010  * Most typically, we'll set the comparator to varlenafastcmp_locale,
2011  * which uses strcoll() to perform comparisons. We use that for the
2012  * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2013  * LC_COLLATE = C, we can make things quite a bit faster with
2014  * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2015  * memcmp() rather than strcoll().
2016  */
2017  if (lc_collate_is_c(collid))
2018  {
2019  if (typid == BPCHAROID)
2020  ssup->comparator = bpcharfastcmp_c;
2021  else if (typid == NAMEOID)
2022  {
2023  ssup->comparator = namefastcmp_c;
2024  /* Not supporting abbreviation with type NAME, for now */
2025  abbreviate = false;
2026  }
2027  else
2028  ssup->comparator = varstrfastcmp_c;
2029 
2030  collate_c = true;
2031  }
2032  else
2033  {
2034  /*
2035  * We need a collation-sensitive comparison. To make things faster,
2036  * we'll figure out the collation based on the locale id and cache the
2037  * result.
2038  */
2039  if (collid != DEFAULT_COLLATION_OID)
2040  locale = pg_newlocale_from_collation(collid);
2041 
2042  /*
2043  * There is a further exception on Windows. When the database
2044  * encoding is UTF-8 and we are not using the C collation, complex
2045  * hacks are required. We don't currently have a comparator that
2046  * handles that case, so we fall back on the slow method of having the
2047  * sort code invoke bttextcmp() (in the case of text) via the fmgr
2048  * trampoline. ICU locales work just the same on Windows, however.
2049  */
2050 #ifdef WIN32
 /* returning here leaves ssup->comparator untouched by this function */
2051  if (GetDatabaseEncoding() == PG_UTF8 &&
2052  !(locale && locale->provider == COLLPROVIDER_ICU))
2053  return;
2054 #endif
2055 
2056  /*
2057  * We use varlenafastcmp_locale except for type NAME.
2058  */
 /*
  * NOTE(review): listing lines 2061 and 2066 are not visible.  Going
  * by the comment above, they presumably assign ssup->comparator
  * (namefastcmp_locale for NAME, varlenafastcmp_locale otherwise) --
  * confirm against the original file.
  */
2059  if (typid == NAMEOID)
2060  {
2062  /* Not supporting abbreviation with type NAME, for now */
2063  abbreviate = false;
2064  }
2065  else
2067  }
2068 
2069  /*
2070  * Unfortunately, it seems that abbreviation for non-C collations is
2071  * broken on many common platforms; testing of multiple versions of glibc
2072  * reveals that, for many locales, strcoll() and strxfrm() do not return
2073  * consistent results, which is fatal to this optimization. While no
2074  * libc other than glibc and Cygwin has so far been shown to have a
2075  * problem, we take the conservative course of action for right now and
2076  * disable this categorically. (Users who are certain this isn't a
2077  * problem on their system can define TRUST_STRXFRM.)
2078  *
2079  * Even apart from the risk of broken locales, it's possible that there
2080  * are platforms where the use of abbreviated keys should be disabled at
2081  * compile time. Having only 4 byte datums could make worst-case
2082  * performance drastically more likely, for example. Moreover, macOS's
2083  * strxfrm() implementation is known to not effectively concentrate a
2084  * significant amount of entropy from the original string in earlier
2085  * transformed blobs. It's possible that other supported platforms are
2086  * similarly encumbered. So, if we ever get past disabling this
2087  * categorically, we may still want or need to disable it for particular
2088  * platforms.
2089  */
2090 #ifndef TRUST_STRXFRM
 /* abbreviation survives only for the C collation and for ICU locales */
2091  if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2092  abbreviate = false;
2093 #endif
2094 
2095  /*
2096  * If we're using abbreviated keys, or if we're using a locale-aware
2097  * comparison, we need to initialize a VarStringSortSupport object. Both
2098  * cases will make use of the temporary buffers we initialize here for
2099  * scratch space (and to detect requirement for BpChar semantics from
2100  * caller), and the abbreviation case requires additional state.
2101  */
2102  if (abbreviate || !collate_c)
2103  {
2104  sss = palloc(sizeof(VarStringSortSupport));
2105  sss->buf1 = palloc(TEXTBUFLEN);
2106  sss->buflen1 = TEXTBUFLEN;
2107  sss->buf2 = palloc(TEXTBUFLEN);
2108  sss->buflen2 = TEXTBUFLEN;
2109  /* Start with invalid values */
2110  sss->last_len1 = -1;
2111  sss->last_len2 = -1;
2112  /* Initialize */
2113  sss->last_returned = 0;
2114  sss->locale = locale;
2115 
2116  /*
2117  * To avoid somehow confusing a strxfrm() blob and an original string,
2118  * constantly keep track of the variety of data that buf1 and buf2
2119  * currently contain.
2120  *
2121  * Comparisons may be interleaved with conversion calls. Frequently,
2122  * conversions and comparisons are batched into two distinct phases,
2123  * but the correctness of caching cannot hinge upon this. For
2124  * comparison caching, buffer state is only trusted if cache_blob is
2125  * found set to false, whereas strxfrm() caching only trusts the state
2126  * when cache_blob is found set to true.
2127  *
2128  * Arbitrarily initialize cache_blob to true.
2129  */
2130  sss->cache_blob = true;
2131  sss->collate_c = collate_c;
2132  sss->typid = typid;
2133  ssup->ssup_extra = sss;
2134 
2135  /*
2136  * If possible, plan to use the abbreviated keys optimization. The
2137  * core code may switch back to authoritative comparator should
2138  * abbreviation be aborted.
2139  */
2140  if (abbreviate)
2141  {
2142  sss->prop_card = 0.20;
2143  initHyperLogLog(&sss->abbr_card, 10);
2144  initHyperLogLog(&sss->full_card, 10);
 /* keep the authoritative comparator before installing the abbreviated one */
2145  ssup->abbrev_full_comparator = ssup->comparator;
2146  ssup->comparator = varstrcmp_abbrev;
 /*
  * NOTE(review): listing lines 2147-2148 are not visible; they
  * presumably install the remaining abbreviation hooks on ssup --
  * confirm against the original file.
  */
2149  }
2150  }
2151 }
2152 
2153 /*
2154  * sortsupport comparison func (for C locale case)
2155  */
2156 static int
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 2157) is not visible here; the body reads the datums x and
 * y.  Going by the header comment and the comparator names used in
 * varstr_sortsupport(), this is presumably varstrfastcmp_c -- confirm.
 */
2158 {
2159  VarString *arg1 = DatumGetVarStringPP(x);
2160  VarString *arg2 = DatumGetVarStringPP(y);
2161  char *a1p,
2162  *a2p;
2163  int len1,
2164  len2,
2165  result;
2166 
2167  a1p = VARDATA_ANY(arg1);
2168  a2p = VARDATA_ANY(arg2);
2169 
2170  len1 = VARSIZE_ANY_EXHDR(arg1);
2171  len2 = VARSIZE_ANY_EXHDR(arg2);
2172 
 /* Bytewise order; on a common prefix the shorter string sorts first */
2173  result = memcmp(a1p, a2p, Min(len1, len2));
2174  if ((result == 0) && (len1 != len2))
2175  result = (len1 < len2) ? -1 : 1;
2176 
2177  /* We can't afford to leak memory here. */
 /* (a pointer differing from the input datum means a copy was made) */
2178  if (PointerGetDatum(arg1) != x)
2179  pfree(arg1);
2180  if (PointerGetDatum(arg2) != y)
2181  pfree(arg2);
2182 
2183  return result;
2184 }
2185 
2186 /*
2187  * sortsupport comparison func (for BpChar C locale case)
2188  *
2189  * BpChar outsources its sortsupport to this module. Specialization for the
2190  * varstr_sortsupport BpChar case, modeled on
2191  * internal_bpchar_pattern_compare().
2192  */
2193 static int
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 2194) is not visible here; the body reads the datums x and
 * y.  Going by the header comment and the comparator names used in
 * varstr_sortsupport(), this is presumably bpcharfastcmp_c -- confirm.
 */
2195 {
2196  BpChar *arg1 = DatumGetBpCharPP(x);
2197  BpChar *arg2 = DatumGetBpCharPP(y);
2198  char *a1p,
2199  *a2p;
2200  int len1,
2201  len2,
2202  result;
2203 
2204  a1p = VARDATA_ANY(arg1);
2205  a2p = VARDATA_ANY(arg2);
2206 
 /* lengths come from bpchartruelen() rather than the stored size */
2207  len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2208  len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2209 
 /* Bytewise order; on a common prefix the shorter string sorts first */
2210  result = memcmp(a1p, a2p, Min(len1, len2));
2211  if ((result == 0) && (len1 != len2))
2212  result = (len1 < len2) ? -1 : 1;
2213 
2214  /* We can't afford to leak memory here. */
2215  if (PointerGetDatum(arg1) != x)
2216  pfree(arg1);
2217  if (PointerGetDatum(arg2) != y)
2218  pfree(arg2);
2219 
2220  return result;
2221 }
2222 
2223 /*
2224  * sortsupport comparison func (for NAME C locale case)
2225  */
2226 static int
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 2227) is not visible here; the body reads the datums x and
 * y.  Going by the header comment and the comparator names used in
 * varstr_sortsupport(), this is presumably namefastcmp_c -- confirm.
 */
2228 {
2229  Name arg1 = DatumGetName(x);
2230  Name arg2 = DatumGetName(y);
2231 
 /* the comparison is bounded by NAMEDATALEN bytes */
2232  return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2233 }
2234 
2235 /*
2236  * sortsupport comparison func (for locale case with all varlena types)
2237  */
2238 static int
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 2239) is not visible here; the body reads x, y and ssup.
 * Going by the header comment and the comparator names mentioned in
 * varstr_sortsupport(), this is presumably varlenafastcmp_locale --
 * confirm.
 */
2240 {
2241  VarString *arg1 = DatumGetVarStringPP(x);
2242  VarString *arg2 = DatumGetVarStringPP(y);
2243  char *a1p,
2244  *a2p;
2245  int len1,
2246  len2,
2247  result;
2248 
2249  a1p = VARDATA_ANY(arg1);
2250  a2p = VARDATA_ANY(arg2);
2251 
2252  len1 = VARSIZE_ANY_EXHDR(arg1);
2253  len2 = VARSIZE_ANY_EXHDR(arg2);
2254 
 /* the comparison itself is delegated to the shared locale-aware routine */
2255  result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2256 
2257  /* We can't afford to leak memory here. */
2258  if (PointerGetDatum(arg1) != x)
2259  pfree(arg1);
2260  if (PointerGetDatum(arg2) != y)
2261  pfree(arg2);
2262 
2263  return result;
2264 }
2265 
2266 /*
2267  * sortsupport comparison func (for locale case with NAME type)
2268  */
2269 static int
/*
 * NOTE(review): the line carrying this function's name and parameter list
 * (listing line 2270) is not visible here; the body reads x, y and ssup.
 * Going by the header comment and the comparator names mentioned in
 * varstr_sortsupport(), this is presumably namefastcmp_locale -- confirm.
 */
2271 {
2272  Name arg1 = DatumGetName(x);
2273  Name arg2 = DatumGetName(y);
2274 
 /* each length is taken with strlen() on the name's string */
2275  return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2276  NameStr(*arg2), strlen(NameStr(*arg2)),
2277  ssup);
2278 }
2279 
2280 /*
2281  * sortsupport comparison func for locale cases
2282  */
2283 static int
2284 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2285 {
2287  int result;
2288  bool arg1_match;
2289 
2290  /* Fast pre-check for equality, as discussed in varstr_cmp() */
2291  if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2292  {
2293  /*
2294  * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2295  * last_len2. Existing contents of buffers might still be used by
2296  * next call.
2297  *
2298  * It's fine to allow the comparison of BpChar padding bytes here,
2299  * even though that implies that the memcmp() will usually be
2300  * performed for BpChar callers (though multibyte characters could
2301  * still prevent that from occurring). The memcmp() is still very
2302  * cheap, and BpChar's funny semantics have us remove trailing spaces
2303  * (not limited to padding), so we need make no distinction between
2304  * padding space characters and "real" space characters.
2305  */
2306  return 0;
2307  }
2308 
2309  if (sss->typid == BPCHAROID)
2310  {
2311  /* Get true number of bytes, ignoring trailing spaces */
2312  len1 = bpchartruelen(a1p, len1);
2313  len2 = bpchartruelen(a2p, len2);
2314  }
2315 
2316  if (len1 >= sss->buflen1)
2317  {
2318  pfree(sss->buf1);
2319  sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2320  sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2321  }
2322  if (len2 >= sss->buflen2)
2323  {
2324  pfree(sss->buf2);
2325  sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2326  sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2327  }
2328 
2329  /*
2330  * We're likely to be asked to compare the same strings repeatedly, and
2331  * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2332  * comparisons, even though in general there is no reason to think that
2333  * that will work out (every string datum may be unique). Caching does
2334  * not slow things down measurably when it doesn't work out, and can speed
2335  * things up by rather a lot when it does. In part, this is because the
2336  * memcmp() compares data from cachelines that are needed in L1 cache even
2337  * when the last comparison's result cannot be reused.
2338  */
2339  arg1_match = true;
2340  if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2341  {
2342  arg1_match = false;
2343  memcpy(sss->buf1, a1p, len1);
2344  sss->buf1[len1] = '\0';
2345  sss->last_len1 = len1;
2346  }
2347 
2348  /*
2349  * If we're comparing the same two strings as last time, we can return the
2350  * same answer without calling strcoll() again. This is more likely than
2351  * it seems (at least with moderate to low cardinality sets), because
2352  * quicksort compares the same pivot against many values.
2353  */
2354  if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2355  {
2356  memcpy(sss->buf2, a2p, len2);
2357  sss->buf2[len2] = '\0';
2358  sss->last_len2 = len2;
2359  }
2360  else if (arg1_match && !sss->cache_blob)
2361  {
2362  /* Use result cached following last actual strcoll() call */
2363  return sss->last_returned;
2364  }
2365 
2366  if (sss->locale)
2367  {
2368  if (sss->locale->provider == COLLPROVIDER_ICU)
2369  {
2370 #ifdef USE_ICU
2371 #ifdef HAVE_UCOL_STRCOLLUTF8
2372  if (GetDatabaseEncoding() == PG_UTF8)
2373  {
2374  UErrorCode status;
2375 
2376  status = U_ZERO_ERROR;
2377  result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
2378  a1p, len1,
2379  a2p, len2,
2380  &status);
2381  if (U_FAILURE(status))
2382  ereport(ERROR,
2383  (errmsg("collation failed: %s", u_errorName(status))));
2384  }
2385  else
2386 #endif
2387  {
2388  int32_t ulen1,
2389  ulen2;
2390  UChar *uchar1,
2391  *uchar2;
2392 
2393  ulen1 = icu_to_uchar(&uchar1, a1p, len1);
2394  ulen2 = icu_to_uchar(&uchar2, a2p, len2);
2395 
2396  result = ucol_strcoll(sss->locale->info.icu.ucol,
2397  uchar1, ulen1,
2398  uchar2, ulen2);
2399 
2400  pfree(uchar1);
2401  pfree(uchar2);
2402  }
2403 #else /* not USE_ICU */
2404  /* shouldn't happen */
2405  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2406 #endif /* not USE_ICU */
2407  }
2408  else
2409  {
2410 #ifdef HAVE_LOCALE_T
2411  result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
2412 #else
2413  /* shouldn't happen */
2414  elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2415 #endif
2416  }
2417  }
2418  else
2419  result = strcoll(sss->buf1, sss->buf2);
2420 
2421  /* Break tie if necessary. */
2422  if (result == 0 &&
2423  (!sss->locale || sss->locale->deterministic))
2424  result = strcmp(sss->buf1, sss->buf2);
2425 
2426  /* Cache result, perhaps saving an expensive strcoll() call next time */
2427  sss->cache_blob = false;
2428  sss->last_returned = result;
2429  return result;
2430 }
2431 
2432 /*
2433  * Abbreviated key comparison func
2434  */
2435 static int
2437 {
2438  /*
2439  * When 0 is returned, the core system will call varstrfastcmp_c()
2440  * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale(). Even a
2441  * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2442  * authoritatively, for the same reason that there is a strcoll()
2443  * tie-breaker call to strcmp() in varstr_cmp().
2444  */
2445  if (x > y)
2446  return 1;
2447  else if (x == y)
2448  return 0;
2449  else
2450  return -1;
2451 }
2452 
2453 /*
2454  * Conversion routine for sortsupport. Converts original to abbreviated key
2455  * representation. Our encoding strategy is simple -- pack the first
2456  * sizeof(Datum) bytes of a strxfrm() blob into a Datum (byte-swapped on
2457  * little-endian machines), and treat it as an unsigned integer. When the "C"
2458  * locale is used, or in case of bytea, just memcpy() from original instead.
2459  */
2460 static Datum
2462 {
2464  VarString *authoritative = DatumGetVarStringPP(original);
2465  char *authoritative_data = VARDATA_ANY(authoritative);
2466 
2467  /* working state */
2468  Datum res;
2469  char *pres;
2470  int len;
2471  uint32 hash;
2472 
2473  pres = (char *) &res;
2474  /* memset(), so any non-overwritten bytes are NUL */
2475  memset(pres, 0, sizeof(Datum));
2476  len = VARSIZE_ANY_EXHDR(authoritative);
2477 
2478  /* Get number of bytes, ignoring trailing spaces */
2479  if (sss->typid == BPCHAROID)
2480  len = bpchartruelen(authoritative_data, len);
2481 
2482  /*
2483  * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2484  * abbreviate keys. The full comparator for the C locale is always
2485  * memcmp(). It would be incorrect to allow bytea callers (callers that
2486  * always force the C collation -- bytea isn't a collatable type, but this
2487  * approach is convenient) to use strxfrm(). This is because bytea
2488  * strings may contain NUL bytes. Besides, this should be faster, too.
2489  *
2490  * More generally, it's okay that bytea callers can have NUL bytes in
2491  * strings because varstrcmp_abbrev() need not make a distinction between
2492  * terminating NUL bytes, and NUL bytes representing actual NULs in the
2493  * authoritative representation. Hopefully a comparison at or past one
2494  * abbreviated key's terminating NUL byte will resolve the comparison
2495  * without consulting the authoritative representation; specifically, some
2496  * later non-NUL byte in the longer string can resolve the comparison
2497  * against a subsequent terminating NUL in the shorter string. There will
2498  * usually be what is effectively a "length-wise" resolution there and
2499  * then.
2500  *
2501  * If that doesn't work out -- if all bytes in the longer string
2502  * positioned at or past the offset of the smaller string's (first)
2503  * terminating NUL are actually representative of NUL bytes in the
2504  * authoritative binary string (perhaps with some *terminating* NUL bytes
2505  * towards the end of the longer string iff it happens to still be small)
2506  * -- then an authoritative tie-breaker will happen, and do the right
2507  * thing: explicitly consider string length.
2508  */
2509  if (sss->collate_c)
2510  memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2511  else
2512  {
2513  Size bsize;
2514 #ifdef USE_ICU
2515  int32_t ulen = -1;
2516  UChar *uchar = NULL;
2517 #endif
2518 
2519  /*
2520  * We're not using the C collation, so fall back on strxfrm or ICU
2521  * analogs.
2522  */
2523 
2524  /* By convention, we use buffer 1 to store and NUL-terminate */
2525  if (len >= sss->buflen1)
2526  {
2527  pfree(sss->buf1);
2528  sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2529  sss->buf1 = palloc(sss->buflen1);
2530  }
2531 
2532  /* Might be able to reuse strxfrm() blob from last call */
2533  if (sss->last_len1 == len && sss->cache_blob &&
2534  memcmp(sss->buf1, authoritative_data, len) == 0)
2535  {
2536  memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2537  /* No change affecting cardinality, so no hashing required */
2538  goto done;
2539  }
2540 
2541  memcpy(sss->buf1, authoritative_data, len);
2542 
2543  /*
2544  * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2545  * necessary for ICU, but doesn't hurt.
2546  */
2547  sss->buf1[len] = '\0';
2548  sss->last_len1 = len;
2549 
2550 #ifdef USE_ICU
2551  /* When using ICU and not UTF8, convert string to UChar. */
2552  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2554  ulen = icu_to_uchar(&uchar, sss->buf1, len);
2555 #endif
2556 
2557  /*
2558  * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2559  * and try again. Both of these functions have the result buffer
2560  * content undefined if the result did not fit, so we need to retry
2561  * until everything fits, even though we only need the first few bytes
2562  * in the end. When using ucol_nextSortKeyPart(), however, we only
2563  * ask for as many bytes as we actually need.
2564  */
2565  for (;;)
2566  {
2567 #ifdef USE_ICU
2568  if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2569  {
2570  /*
2571  * When using UTF8, use the iteration interface so we only
2572  * need to produce as many bytes as we actually need.
2573  */
2574  if (GetDatabaseEncoding() == PG_UTF8)
2575  {
2576  UCharIterator iter;
2577  uint32_t state[2];
2578  UErrorCode status;
2579 
2580  uiter_setUTF8(&iter, sss->buf1, len);
2581  state[0] = state[1] = 0; /* won't need that again */
2582  status = U_ZERO_ERROR;
2583  bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2584  &iter,
2585  state,
2586  (uint8_t *) sss->buf2,
2587  Min(sizeof(Datum), sss->buflen2),
2588  &status);
2589  if (U_FAILURE(status))
2590  ereport(ERROR,
2591  (errmsg("sort key generation failed: %s",
2592  u_errorName(status))));
2593  }
2594  else
2595  bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2596  uchar, ulen,
2597  (uint8_t *) sss->buf2, sss->buflen2);
2598  }
2599  else
2600 #endif
2601 #ifdef HAVE_LOCALE_T
2602  if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2603  bsize = strxfrm_l(sss->buf2, sss->buf1,
2604  sss->buflen2, sss->locale->info.lt);
2605  else
2606 #endif
2607  bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2608 
2609  sss->last_len2 = bsize;
2610  if (bsize < sss->buflen2)
2611  break;
2612 
2613  /*
2614  * Grow buffer and retry.
2615  */
2616  pfree(sss->buf2);
2617  sss->buflen2 = Max(bsize + 1,
2618  Min(sss->buflen2 * 2, MaxAllocSize));
2619  sss->buf2 = palloc(sss->buflen2);
2620  }
2621 
2622  /*
2623  * Every Datum byte is always compared. This is safe because the
2624  * strxfrm() blob is itself NUL terminated, leaving no danger of
2625  * misinterpreting any NUL bytes not intended to be interpreted as
2626  * logically representing termination.
2627  *
2628  * (Actually, even if there were NUL bytes in the blob it would be
2629  * okay. See remarks on bytea case above.)
2630  */
2631  memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2632 
2633 #ifdef USE_ICU
2634  if (uchar)
2635  pfree(uchar);
2636 #endif
2637  }
2638 
2639  /*
2640  * Maintain approximate cardinality of both abbreviated keys and original,
2641  * authoritative keys using HyperLogLog. Used as cheap insurance against
2642  * the worst case, where we do many string transformations for no saving
2643  * in full strcoll()-based comparisons. These statistics are used by
2644  * varstr_abbrev_abort().
2645  *
2646  * First, hash the key proper, or at most its first PG_CACHE_LINE_SIZE bytes
2647  * so as to limit the overhead of hashing. For longer keys, mix in the length
2648  * to compensate for cases where differences lie past the hashed prefix.
2649  */
2650  hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2651  Min(len, PG_CACHE_LINE_SIZE)));
2652 
2653  if (len > PG_CACHE_LINE_SIZE)
2654  hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2655 
2656  addHyperLogLog(&sss->full_card, hash);
2657 
2658  /* Hash abbreviated key */
2659 #if SIZEOF_DATUM == 8
2660  {
2661  uint32 lohalf,
2662  hihalf;
2663 
2664  lohalf = (uint32) res;
2665  hihalf = (uint32) (res >> 32);
2666  hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2667  }
2668 #else /* SIZEOF_DATUM != 8 */
2669  hash = DatumGetUInt32(hash_uint32((uint32) res));
2670 #endif
2671 
2672  addHyperLogLog(&sss->abbr_card, hash);
2673 
2674  /* Cache result, perhaps saving an expensive strxfrm() call next time */
2675  sss->cache_blob = true;
2676 done:
2677 
2678  /*
2679  * Byteswap on little-endian machines.
2680  *
2681  * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2682  * comparator) works correctly on all platforms. If we didn't do this,
2683  * the comparator would have to call memcmp() with a pair of pointers to
2684  * the first byte of each abbreviated key, which is slower.
2685  */
2686  res = DatumBigEndianToNative(res);
2687 
2688  /* Don't leak memory here */
2689  if (PointerGetDatum(authoritative) != original)
2690  pfree(authoritative);
2691 
2692  return res;
2693 }
2694 
2695 /*
2696  * Callback for estimating effectiveness of abbreviated key optimization, using
2697  * heuristic rules. Returns value indicating if the abbreviation optimization
2698  * should be aborted, based on its projected effectiveness.
2699  */
2700 static bool
2701 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2702 {
2704  double abbrev_distinct,
2705  key_distinct;
2706 
2707  Assert(ssup->abbreviate);
2708 
2709  /* Have a little patience */
2710  if (memtupcount < 100)
2711  return false;
2712 
2713  abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2714  key_distinct = estimateHyperLogLog(&sss->full_card);
2715 
2716  /*
2717  * Clamp cardinality estimates to at least one distinct value. While
2718  * NULLs are generally disregarded, if only NULL values were seen so far,
2719  * that might misrepresent costs if we failed to clamp.
2720  */
2721  if (abbrev_distinct <= 1.0)
2722  abbrev_distinct = 1.0;
2723 
2724  if (key_distinct <= 1.0)
2725  key_distinct = 1.0;
2726 
2727  /*
2728  * In the worst case all abbreviated keys are identical, while at the same
2729  * time there are differences within full key strings not captured in
2730  * abbreviations.
2731  */
2732 #ifdef TRACE_SORT
2733  if (trace_sort)
2734  {
2735  double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2736 
2737  elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2738  "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2739  memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2740  sss->prop_card);
2741  }
2742 #endif
2743 
2744  /*
2745  * If the number of distinct abbreviated keys approximately matches the
2746  * number of distinct authoritative original keys, that's reason enough to
2747  * proceed. We can win even with a very low cardinality set if most
2748  * tie-breakers only memcmp(). This is by far the most important
2749  * consideration.
2750  *
2751  * While comparisons that are resolved at the abbreviated key level are
2752  * considerably cheaper than tie-breakers resolved with memcmp(), both of
2753  * those two outcomes are so much cheaper than a full strcoll() once
2754  * sorting is underway that it doesn't seem worth it to weigh abbreviated
2755  * cardinality against the overall size of the set in order to more
2756  * accurately model costs. Assume that an abbreviated comparison, and an
2757  * abbreviated comparison with a cheap memcmp()-based authoritative
2758  * resolution are equivalent.
2759  */
2760  if (abbrev_distinct > key_distinct * sss->prop_card)
2761  {
2762  /*
2763  * When we have exceeded 10,000 tuples, decay required cardinality
2764  * aggressively for next call.
2765  *
2766  * This is useful because the number of comparisons required on
2767  * average increases at a linearithmic rate, and at roughly 10,000
2768  * tuples that factor will start to dominate over the linear costs of
2769  * string transformation (this is a conservative estimate). The decay
2770  * rate is chosen to be a little less aggressive than halving -- which
2771  * (since we're called at points at which memtupcount has doubled)
2772  * would never see the cost model actually abort past the first call
2773  * following a decay. This decay rate is mostly a precaution against
2774  * a sudden, violent swing in how well abbreviated cardinality tracks
2775  * full key cardinality. The decay also serves to prevent a marginal
2776  * case from being aborted too late, when too much has already been
2777  * invested in string transformation.
2778  *
2779  * It's possible for sets of several million distinct strings with
2780  * mere tens of thousands of distinct abbreviated keys to still
2781  * benefit very significantly. This will generally occur provided
2782  * each abbreviated key is a proxy for a roughly uniform number of the
2783  * set's full keys. If it isn't so, we hope to catch that early and
2784  * abort. If it isn't caught early, by the time the problem is
2785  * apparent it's probably not worth aborting.
2786  */
2787  if (memtupcount > 10000)
2788  sss->prop_card *= 0.65;
2789 
2790  return false;
2791  }
2792 
2793  /*
2794  * Abort abbreviation strategy.
2795  *
2796  * The worst case, where all abbreviated keys are identical while all
2797  * original strings differ will typically only see a regression of about
2798  * 10% in execution time for small to medium sized lists of strings.
2799  * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2800  * often expect very large improvements, particularly with sets of strings
2801  * of moderately high to high abbreviated cardinality. There is little to
2802  * lose but much to gain, which our strategy reflects.
2803  */
2804 #ifdef TRACE_SORT
2805  if (trace_sort)
2806  elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2807  "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2808  memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2809 #endif
2810 
2811  return true;
2812 }
2813 
2814 /*
2815  * Generic equalimage support function for character type's operator classes.
2816  * Disables the use of deduplication with nondeterministic collations.
2817  */
2818 Datum
2820 {
2821  /* Oid opcintype = PG_GETARG_OID(0); */
2822  Oid collid = PG_GET_COLLATION();
2823 
2824  check_collation_set(collid);
2825 
2826  if (lc_collate_is_c(collid) ||
2827  collid == DEFAULT_COLLATION_OID ||
2829  PG_RETURN_BOOL(true);
2830  else
2831  PG_RETURN_BOOL(false);
2832 }
2833 
2834 Datum
2836 {
2837  text *arg1 = PG_GETARG_TEXT_PP(0);
2838  text *arg2 = PG_GETARG_TEXT_PP(1);
2839  text *result;
2840 
2841  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2842 
2843  PG_RETURN_TEXT_P(result);
2844 }
2845 
2846 Datum
2848 {
2849  text *arg1 = PG_GETARG_TEXT_PP(0);
2850  text *arg2 = PG_GETARG_TEXT_PP(1);
2851  text *result;
2852 
2853  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2854 
2855  PG_RETURN_TEXT_P(result);
2856 }
2857 
2858 
2859 /*
2860  * Cross-type comparison functions for types text and name.
2861  */
2862 
2863 Datum
2865 {
2866  Name arg1 = PG_GETARG_NAME(0);
2867  text *arg2 = PG_GETARG_TEXT_PP(1);
2868  size_t len1 = strlen(NameStr(*arg1));
2869  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2870  Oid collid = PG_GET_COLLATION();
2871  bool result;
2872 
2873  check_collation_set(collid);
2874 
2875  if (collid == C_COLLATION_OID)
2876  result = (len1 == len2 &&
2877  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2878  else
2879  result = (varstr_cmp(NameStr(*arg1), len1,
2880  VARDATA_ANY(arg2), len2,
2881  collid) == 0);
2882 
2883  PG_FREE_IF_COPY(arg2, 1);
2884 
2885  PG_RETURN_BOOL(result);
2886 }
2887 
2888 Datum
2890 {
2891  text *arg1 = PG_GETARG_TEXT_PP(0);
2892  Name arg2 = PG_GETARG_NAME(1);
2893  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2894  size_t len2 = strlen(NameStr(*arg2));
2895  Oid collid = PG_GET_COLLATION();
2896  bool result;
2897 
2898  check_collation_set(collid);
2899 
2900  if (collid == C_COLLATION_OID)
2901  result = (len1 == len2 &&
2902  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2903  else
2904  result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2905  NameStr(*arg2), len2,
2906  collid) == 0);
2907 
2908  PG_FREE_IF_COPY(arg1, 0);
2909 
2910  PG_RETURN_BOOL(result);
2911 }
2912 
2913 Datum
2915 {
2916  Name arg1 = PG_GETARG_NAME(0);
2917  text *arg2 = PG_GETARG_TEXT_PP(1);
2918  size_t len1 = strlen(NameStr(*arg1));
2919  size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2920  Oid collid = PG_GET_COLLATION();
2921  bool result;
2922 
2923  check_collation_set(collid);
2924 
2925  if (collid == C_COLLATION_OID)
2926  result = !(len1 == len2 &&
2927  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2928  else
2929  result = !(varstr_cmp(NameStr(*arg1), len1,
2930  VARDATA_ANY(arg2), len2,
2931  collid) == 0);
2932 
2933  PG_FREE_IF_COPY(arg2, 1);
2934 
2935  PG_RETURN_BOOL(result);
2936 }
2937 
2938 Datum
2940 {
2941  text *arg1 = PG_GETARG_TEXT_PP(0);
2942  Name arg2 = PG_GETARG_NAME(1);
2943  size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2944  size_t len2 = strlen(NameStr(*arg2));
2945  Oid collid = PG_GET_COLLATION();
2946  bool result;
2947 
2948  check_collation_set(collid);
2949 
2950  if (collid == C_COLLATION_OID)
2951  result = !(len1 == len2 &&
2952  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2953  else
2954  result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2955  NameStr(*arg2), len2,
2956  collid) == 0);
2957 
2958  PG_FREE_IF_COPY(arg1, 0);
2959 
2960  PG_RETURN_BOOL(result);
2961 }
2962 
2963 Datum
2965 {
2966  Name arg1 = PG_GETARG_NAME(0);
2967  text *arg2 = PG_GETARG_TEXT_PP(1);
2968  int32 result;
2969 
2970  result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2971  VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2972  PG_GET_COLLATION());
2973 
2974  PG_FREE_IF_COPY(arg2, 1);
2975 
2976  PG_RETURN_INT32(result);
2977 }
2978 
2979 Datum
2981 {
2982  text *arg1 = PG_GETARG_TEXT_PP(0);
2983  Name arg2 = PG_GETARG_NAME(1);
2984  int32 result;
2985 
2986  result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2987  NameStr(*arg2), strlen(NameStr(*arg2)),
2988  PG_GET_COLLATION());
2989 
2990  PG_FREE_IF_COPY(arg1, 0);
2991 
2992  PG_RETURN_INT32(result);
2993 }
2994 
/*
 * CmpCall(cmpfunc): invoke the comparison function cmpfunc via
 * DirectFunctionCall2Coll(), forwarding the current call's collation and its
 * first two argument datums, and convert the returned Datum to an int32.
 * It expands PG_GET_COLLATION() and PG_GETARG_DATUM(), so it is presumably
 * only usable inside a function-manager-callable function body.  The macro
 * is #undef'd once the wrappers that use it have been defined.
 */
2995 #define CmpCall(cmpfunc) \
2996  DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2997  PG_GET_COLLATION(), \
2998  PG_GETARG_DATUM(0), \
2999  PG_GETARG_DATUM(1)))
3000 
3001 Datum
3003 {
3005 }
3006 
3007 Datum
3009 {
3011 }
3012 
3013 Datum
3015 {
3017 }
3018 
3019 Datum
3021 {
3023 }
3024 
3025 Datum
3027 {
3029 }
3030 
3031 Datum
3033 {
3035 }
3036 
3037 Datum
3039 {
3041 }
3042 
3043 Datum
3045 {
3047 }
3048 
3049 #undef CmpCall
3050 
3051 
3052 /*
3053  * The following operators support character-by-character comparison
3054  * of text datums, to allow building indexes suitable for LIKE clauses.
3055  * Note that the regular texteq/textne comparison operators, and regular
3056  * support functions 1 and 2 with "C" collation are assumed to be
3057  * compatible with these!
3058  */
3059 
3060 static int
3062 {
3063  int result;
3064  int len1,
3065  len2;
3066 
3067  len1 = VARSIZE_ANY_EXHDR(arg1);
3068  len2 = VARSIZE_ANY_EXHDR(arg2);
3069 
3070  result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3071  if (result != 0)
3072  return result;
3073  else if (len1 < len2)
3074  return -1;
3075  else if (len1 > len2)
3076  return 1;
3077  else
3078  return 0;
3079 }
3080 
3081 
3082 Datum
3084 {
3085  text *arg1 = PG_GETARG_TEXT_PP(0);
3086  text *arg2 = PG_GETARG_TEXT_PP(1);
3087  int result;
3088 
3089  result = internal_text_pattern_compare(arg1, arg2);
3090 
3091  PG_FREE_IF_COPY(arg1, 0);
3092  PG_FREE_IF_COPY(arg2, 1);
3093 
3094  PG_RETURN_BOOL(result < 0);
3095 }
3096 
3097 
3098 Datum
3100 {
3101  text *arg1 = PG_GETARG_TEXT_PP(0);
3102  text *arg2 = PG_GETARG_TEXT_PP(1);
3103  int result;
3104 
3105  result = internal_text_pattern_compare(arg1, arg2);
3106 
3107  PG_FREE_IF_COPY(arg1, 0);
3108  PG_FREE_IF_COPY(arg2, 1);
3109 
3110  PG_RETURN_BOOL(result <= 0);
3111 }
3112 
3113 
3114 Datum
3116 {
3117  text *arg1 = PG_GETARG_TEXT_PP(0);
3118  text *arg2 = PG_GETARG_TEXT_PP(1);
3119  int result;
3120 
3121  result = internal_text_pattern_compare(arg1, arg2);
3122 
3123  PG_FREE_IF_COPY(arg1, 0);
3124  PG_FREE_IF_COPY(arg2, 1);
3125 
3126  PG_RETURN_BOOL(result >= 0);
3127 }
3128 
3129 
3130 Datum
3132 {
3133  text *arg1 = PG_GETARG_TEXT_PP(0);
3134  text *arg2 = PG_GETARG_TEXT_PP(1);
3135  int result;
3136 
3137  result = internal_text_pattern_compare(arg1, arg2);
3138 
3139  PG_FREE_IF_COPY(arg1, 0);
3140  PG_FREE_IF_COPY(arg2, 1);
3141 
3142  PG_RETURN_BOOL(result > 0);
3143 }
3144 
3145 
3146 Datum
3148 {
3149  text *arg1 = PG_GETARG_TEXT_PP(0);
3150  text *arg2 = PG_GETARG_TEXT_PP(1);
3151  int result;
3152 
3153  result = internal_text_pattern_compare(arg1, arg2);
3154 
3155  PG_FREE_IF_COPY(arg1, 0);
3156  PG_FREE_IF_COPY(arg2, 1);
3157 
3158  PG_RETURN_INT32(result);
3159 }
3160 
3161 
3162 Datum
3164 {
3166  MemoryContext oldcontext;
3167 
3168  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3169 
3170  /* Use generic string SortSupport, forcing "C" collation */
3171  varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3172 
3173  MemoryContextSwitchTo(oldcontext);
3174 
3175  PG_RETURN_VOID();
3176 }
3177 
3178 
3179 /*-------------------------------------------------------------
3180  * byteaoctetlen
3181  *
3182  * get the number of bytes contained in an instance of type 'bytea'
3183  *-------------------------------------------------------------
3184  */
3185 Datum
3187 {
3188  Datum str = PG_GETARG_DATUM(0);
3189 
3190  /* We need not detoast the input at all */
3192 }
3193 
3194 /*
3195  * byteacat -
3196  * takes two bytea* and returns a bytea* that is the concatenation of
3197  * the two.
3198  *
3199  * Cloned from textcat and modified as required.
3200  */
3201 Datum
3203 {
3204  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3205  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3206 
3208 }
3209 
3210 /*
3211  * bytea_catenate
3212  * Guts of byteacat(), broken out so it can be used by other functions
3213  *
3214  * Arguments can be in short-header form, but not compressed or out-of-line
3215  */
3216 static bytea *
3218 {
3219  bytea *result;
3220  int len1,
3221  len2,
3222  len;
3223  char *ptr;
3224 
3225  len1 = VARSIZE_ANY_EXHDR(t1);
3226  len2 = VARSIZE_ANY_EXHDR(t2);
3227 
3228  /* paranoia ... probably should throw error instead? */
3229  if (len1 < 0)
3230  len1 = 0;
3231  if (len2 < 0)
3232  len2 = 0;
3233 
3234  len = len1 + len2 + VARHDRSZ;
3235  result = (bytea *) palloc(len);
3236 
3237  /* Set size of result string... */
3238  SET_VARSIZE(result, len);
3239 
3240  /* Fill data field of result string... */
3241  ptr = VARDATA(result);
3242  if (len1 > 0)
3243  memcpy(ptr, VARDATA_ANY(t1), len1);
3244  if (len2 > 0)
3245  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3246 
3247  return result;
3248 }
3249 
/*
 * PG_STR_GET_BYTEA(str_): build a bytea from the C string str_ by passing it
 * through the byteain input function and fetching the result as a bytea
 * pointer.
 */
3250 #define PG_STR_GET_BYTEA(str_) \
3251  DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3252 
3253 /*
3254  * bytea_substr()
3255  * Return a substring starting at the specified position.
3256  * Cloned from text_substr and modified as required.
3257  *
3258  * Input:
3259  * - string
3260  * - starting position (is one-based)
3261  * - string length (optional)
3262  *
3263  * If the starting position is zero or less, then return from the start of the string
3264  * adjusting the length to be consistent with the "negative start" per SQL.
3265  * If the length is less than zero, an ERROR is thrown. If no third argument
3266  * (length) is provided, the length to the end of the string is assumed.
3267  */
3268 Datum
3270 {
3272  PG_GETARG_INT32(1),
3273  PG_GETARG_INT32(2),
3274  false));
3275 }
3276 
3277 /*
3278  * bytea_substr_no_len -
3279  * Wrapper to avoid opr_sanity failure due to
3280  * one function accepting a different number of args.
3281  */
3282 Datum
3284 {
3286  PG_GETARG_INT32(1),
3287  -1,
3288  true));
3289 }
3290 
3291 static bytea *
3293  int S,
3294  int L,
3295  bool length_not_specified)
3296 {
3297  int S1; /* adjusted start position */
3298  int L1; /* adjusted substring length */
3299 
3300  S1 = Max(S, 1);
3301 
3302  if (length_not_specified)
3303  {
3304  /*
3305  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3306  * end of the string if we pass it a negative value for length.
3307  */
3308  L1 = -1;
3309  }
3310  else
3311  {
3312  /* end position */
3313  int E = S + L;
3314 
3315  /*
3316  * A negative value for L is the only way for the end position to be
3317  * before the start. SQL99 says to throw an error.
3318  */
3319  if (E < S)
3320  ereport(ERROR,
3321  (errcode(ERRCODE_SUBSTRING_ERROR),
3322  errmsg("negative substring length not allowed")));
3323 
3324  /*
3325  * A zero or negative value for the end position can happen if the
3326  * start was negative or one. SQL99 says to return a zero-length
3327  * string.
3328  */
3329  if (E < 1)
3330  return PG_STR_GET_BYTEA("");
3331 
3332  L1 = E - S1;
3333  }
3334 
3335  /*
3336  * If the start position is past the end of the string, SQL99 says to
3337  * return a zero-length string -- DatumGetByteaPSlice() will do that for
3338  * us. Convert to zero-based starting position
3339  */
3340  return DatumGetByteaPSlice(str, S1 - 1, L1);
3341 }
3342 
3343 /*
3344  * byteaoverlay
3345  * Replace specified substring of first string with second
3346  *
3347  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3348  * This code is a direct implementation of what the standard says.
3349  */
3350 Datum
3352 {
3353  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3354  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3355  int sp = PG_GETARG_INT32(2); /* substring start position */
3356  int sl = PG_GETARG_INT32(3); /* substring length */
3357 
3358  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3359 }
3360 
3361 Datum
3363 {
3364  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3365  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3366  int sp = PG_GETARG_INT32(2); /* substring start position */
3367  int sl;
3368 
3369  sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3370  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3371 }
3372 
3373 static bytea *
3374 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3375 {
3376  bytea *result;
3377  bytea *s1;
3378  bytea *s2;
3379  int sp_pl_sl;
3380 
3381  /*
3382  * Check for possible integer-overflow cases. For negative sp, throw a
3383  * "substring length" error because that's what should be expected
3384  * according to the spec's definition of OVERLAY().
3385  */
3386  if (sp <= 0)
3387  ereport(ERROR,
3388  (errcode(ERRCODE_SUBSTRING_ERROR),
3389  errmsg("negative substring length not allowed")));
3390  if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3391  ereport(ERROR,
3392  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3393  errmsg("integer out of range")));
3394 
3395  s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3396  s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3397  result = bytea_catenate(s1, t2);
3398  result = bytea_catenate(result, s2);
3399 
3400  return result;
3401 }
3402 
3403 /*
3404  * byteapos -
3405  * Return the position of the specified substring.
3406  * Implements the SQL POSITION() function.
3407  * Cloned from textpos and modified as required.
3408  */
3409 Datum
3411 {
3412  bytea *t1 = PG_GETARG_BYTEA_PP(0);
3413  bytea *t2 = PG_GETARG_BYTEA_PP(1);
3414  int pos;
3415  int px,
3416  p;
3417  int len1,
3418  len2;
3419  char *p1,
3420  *p2;
3421 
3422  len1 = VARSIZE_ANY_EXHDR(t1);
3423  len2 = VARSIZE_ANY_EXHDR(t2);
3424 
3425  if (len2 <= 0)
3426  PG_RETURN_INT32(1); /* result for empty pattern */
3427 
3428  p1 = VARDATA_ANY(t1);
3429  p2 = VARDATA_ANY(t2);
3430 
3431  pos = 0;
3432  px = (len1 - len2);
3433  for (p = 0; p <= px; p++)
3434  {
3435  if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3436  {
3437  pos = p + 1;
3438  break;
3439  };
3440  p1++;
3441  };
3442 
3443  PG_RETURN_INT32(pos);
3444 }
3445 
3446 /*-------------------------------------------------------------
3447  * byteaGetByte
3448  *
3449  * this routine treats "bytea" as an array of bytes.
3450  * It returns the Nth byte (a number between 0 and 255).
3451  *-------------------------------------------------------------
3452  */
3453 Datum
3455 {
3456  bytea *v = PG_GETARG_BYTEA_PP(0);
3457  int32 n = PG_GETARG_INT32(1);
3458  int len;
3459  int byte;
3460 
3461  len = VARSIZE_ANY_EXHDR(v);
3462 
3463  if (n < 0 || n >= len)
3464  ereport(ERROR,
3465  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3466  errmsg("index %d out of valid range, 0..%d",
3467  n, len - 1)));
3468 
3469  byte = ((unsigned char *) VARDATA_ANY(v))[n];
3470 
3471  PG_RETURN_INT32(byte);
3472 }
3473 
3474 /*-------------------------------------------------------------
3475  * byteaGetBit
3476  *
3477  * This routine treats a "bytea" type like an array of bits.
3478  * It returns the value of the Nth bit (0 or 1).
3479  *
3480  *-------------------------------------------------------------
3481  */
3482 Datum
3484 {
3485  bytea *v = PG_GETARG_BYTEA_PP(0);
3486  int64 n = PG_GETARG_INT64(1);
3487  int byteNo,
3488  bitNo;
3489  int len;
3490  int byte;
3491 
3492  len = VARSIZE_ANY_EXHDR(v);
3493 
3494  if (n < 0 || n >= (int64) len * 8)
3495  ereport(ERROR,
3496  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3497  errmsg("index %lld out of valid range, 0..%lld",
3498  (long long) n, (long long) len * 8 - 1)));
3499 
3500  /* n/8 is now known < len, so safe to cast to int */
3501  byteNo = (int) (n / 8);
3502  bitNo = (int) (n % 8);
3503 
3504  byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3505 
3506  if (byte & (1 << bitNo))
3507  PG_RETURN_INT32(1);
3508  else
3509  PG_RETURN_INT32(0);
3510 }
3511 
3512 /*-------------------------------------------------------------
3513  * byteaSetByte
3514  *
3515  * Given an instance of type 'bytea' creates a new one with
3516  * the Nth byte set to the given value.
3517  *
3518  *-------------------------------------------------------------
3519  */
3520 Datum
3522 {
3523  bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3524  int32 n = PG_GETARG_INT32(1);
3525  int32 newByte = PG_GETARG_INT32(2);
3526  int len;
3527 
3528  len = VARSIZE(res) - VARHDRSZ;
3529 
3530  if (n < 0 || n >= len)
3531  ereport(ERROR,
3532  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3533  errmsg("index %d out of valid range, 0..%d",
3534  n, len - 1)));
3535 
3536  /*
3537  * Now set the byte.
3538  */
3539  ((unsigned char *) VARDATA(res))[n] = newByte;
3540 
3541  PG_RETURN_BYTEA_P(res);
3542 }
3543 
3544 /*-------------------------------------------------------------
3545  * byteaSetBit
3546  *
3547  * Given an instance of type 'bytea' creates a new one with
3548  * the Nth bit set to the given value.
3549  *
3550  *-------------------------------------------------------------
3551  */
3552 Datum
3554 {
3555  bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3556  int64 n = PG_GETARG_INT64(1);
3557  int32 newBit = PG_GETARG_INT32(2);
3558  int len;
3559  int oldByte,
3560  newByte;
3561  int byteNo,
3562  bitNo;
3563 
3564  len = VARSIZE(res) - VARHDRSZ;
3565 
3566  if (n < 0 || n >= (int64) len * 8)
3567  ereport(ERROR,
3568  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3569  errmsg("index %lld out of valid range, 0..%lld",
3570  (long long) n, (long long) len * 8 - 1)));
3571 
3572  /* n/8 is now known < len, so safe to cast to int */
3573  byteNo = (int) (n / 8);
3574  bitNo = (int) (n % 8);
3575 
3576  /*
3577  * sanity check!
3578  */
3579  if (newBit != 0 && newBit != 1)
3580  ereport(ERROR,
3581  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3582  errmsg("new bit must be 0 or 1")));
3583 
3584  /*
3585  * Update the byte.
3586  */
3587  oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3588 
3589  if (newBit == 0)
3590  newByte = oldByte & (~(1 << bitNo));
3591  else
3592  newByte = oldByte | (1 << bitNo);
3593 
3594  ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3595 
3596  PG_RETURN_BYTEA_P(res);
3597 }
3598 
3599 
3600 /* text_name()
3601  * Converts a text type to a Name type.
3602  */
3603 Datum
3605 {
3606  text *s = PG_GETARG_TEXT_PP(0);
3607  Name result;
3608  int len;
3609 
3610  len = VARSIZE_ANY_EXHDR(s);
3611 
3612  /* Truncate oversize input */
3613  if (len >= NAMEDATALEN)
3614  len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3615 
3616  /* We use palloc0 here to ensure result is zero-padded */
3617  result = (Name) palloc0(NAMEDATALEN);
3618  memcpy(NameStr(*result), VARDATA_ANY(s), len);
3619 
3620  PG_RETURN_NAME(result);
3621 }
3622 
3623 /* name_text()
3624  * Converts a Name type to a text type.
3625  */
3626 Datum
3628 {
3629  Name s = PG_GETARG_NAME(0);
3630 
3632 }
3633 
3634 
3635 /*
3636  * textToQualifiedNameList - convert a text object to list of names
3637  *
3638  * This implements the input parsing needed by nextval() and other
3639  * functions that take a text parameter representing a qualified name.
3640  * We split the name at dots, downcase if not double-quoted, and
3641  * truncate names if they're too long.
3642  */
3643 List *
3645 {
3646  char *rawname;
3647  List *result = NIL;
3648  List *namelist;
3649  ListCell *l;
3650 
3651  /* Convert to C string (handles possible detoasting). */
3652  /* Note we rely on being able to modify rawname below. */
3653  rawname = text_to_cstring(textval);
3654 
3655  if (!SplitIdentifierString(rawname, '.', &namelist))
3656  ereport(ERROR,
3657  (errcode(ERRCODE_INVALID_NAME),
3658  errmsg("invalid name syntax")));
3659 
3660  if (namelist == NIL)
3661  ereport(ERROR,
3662  (errcode(ERRCODE_INVALID_NAME),
3663  errmsg("invalid name syntax")));
3664 
3665  foreach(l, namelist)
3666  {
3667  char *curname = (char *) lfirst(l);
3668 
3669  result = lappend(result, makeString(pstrdup(curname)));
3670  }
3671 
3672  pfree(rawname);
3673  list_free(namelist);
3674 
3675  return result;
3676 }
3677 
3678 /*
3679  * SplitIdentifierString --- parse a string containing identifiers
3680  *
3681  * This is the guts of textToQualifiedNameList, and is exported for use in
3682  * other situations such as parsing GUC variables. In the GUC case, it's
3683  * important to avoid memory leaks, so the API is designed to minimize the
3684  * amount of stuff that needs to be allocated and freed.
3685  *
3686  * Inputs:
3687  * rawstring: the input string; must be overwritable! On return, it's
3688  * been modified to contain the separated identifiers.
3689  * separator: the separator punctuation expected between identifiers
3690  * (typically '.' or ','). Whitespace may also appear around
3691  * identifiers.
3692  * Outputs:
3693  * namelist: filled with a palloc'd list of pointers to identifiers within
3694  * rawstring. Caller should list_free() this even on error return.
3695  *
3696  * Returns true if okay, false if there is a syntax error in the string.
3697  *
3698  * Note that an empty string is considered okay here, though not in
3699  * textToQualifiedNameList.
3700  */
3701 bool
3702 SplitIdentifierString(char *rawstring, char separator,
3703  List **namelist)
3704 {
3705  char *nextp = rawstring;
3706  bool done = false;
3707 
3708  *namelist = NIL;
3709 
3710  while (scanner_isspace(*nextp))
3711  nextp++; /* skip leading whitespace */
3712 
3713  if (*nextp == '\0')
3714  return true; /* allow empty string */
3715 
3716  /* At the top of the loop, we are at start of a new identifier. */
3717  do
3718  {
3719  char *curname;
3720  char *endp;
3721 
3722  if (*nextp == '"')
3723  {
3724  /* Quoted name --- collapse quote-quote pairs, no downcasing */
3725  curname = nextp + 1;
3726  for (;;)
3727  {
3728  endp = strchr(nextp + 1, '"');
3729  if (endp == NULL)
3730  return false; /* mismatched quotes */
3731  if (endp[1] != '"')
3732  break; /* found end of quoted name */
3733  /* Collapse adjacent quotes into one quote, and look again */
3734  memmove(endp, endp + 1, strlen(endp));
3735  nextp = endp;
3736  }
3737  /* endp now points at the terminating quote */
3738  nextp = endp + 1;
3739  }
3740  else
3741  {
3742  /* Unquoted name --- extends to separator or whitespace */
3743  char *downname;
3744  int len;
3745 
3746  curname = nextp;
3747  while (*nextp && *nextp != separator &&
3748  !scanner_isspace(*nextp))
3749  nextp++;
3750  endp = nextp;
3751  if (curname == nextp)
3752  return false; /* empty unquoted name not allowed */
3753 
3754  /*
3755  * Downcase the identifier, using same code as main lexer does.
3756  *
3757  * XXX because we want to overwrite the input in-place, we cannot
3758  * support a downcasing transformation that increases the string
3759  * length. This is not a problem given the current implementation
3760  * of downcase_truncate_identifier, but we'll probably have to do
3761  * something about this someday.
3762  */
3763  len = endp - curname;
3764  downname = downcase_truncate_identifier(curname, len, false);
3765  Assert(strlen(downname) <= len);
3766  strncpy(curname, downname, len); /* strncpy is required here */
3767  pfree(downname);
3768  }
3769 
3770  while (scanner_isspace(*nextp))
3771  nextp++; /* skip trailing whitespace */
3772 
3773  if (*nextp == separator)
3774  {
3775  nextp++;
3776  while (scanner_isspace(*nextp))
3777  nextp++; /* skip leading whitespace for next */
3778  /* we expect another name, so done remains false */
3779  }
3780  else if (*nextp == '\0')
3781  done = true;
3782  else
3783  return false; /* invalid syntax */
3784 
3785  /* Now safe to overwrite separator with a null */
3786  *endp = '\0';
3787 
3788  /* Truncate name if it's overlength */
3789  truncate_identifier(curname, strlen(curname), false);
3790 
3791  /*
3792  * Finished isolating current name --- add it to list
3793  */
3794  *namelist = lappend(*namelist, curname);
3795 
3796  /* Loop back if we didn't reach end of string */
3797  } while (!done);
3798 
3799  return true;
3800 }
3801 
3802 
3803 /*
3804  * SplitDirectoriesString --- parse a string containing file/directory names
3805  *
3806  * This works fine on file names too; the function name is historical.
3807  *
3808  * This is similar to SplitIdentifierString, except that the parsing
3809  * rules are meant to handle pathnames instead of identifiers: there is
3810  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3811  * and we apply canonicalize_path() to each extracted string. Because of the
3812  * last, the returned strings are separately palloc'd rather than being
3813  * pointers into rawstring --- but we still scribble on rawstring.
3814  *
3815  * Inputs:
3816  * rawstring: the input string; must be modifiable!
3817  * separator: the separator punctuation expected between directories
3818  * (typically ',' or ';'). Whitespace may also appear around
3819  * directories.
3820  * Outputs:
3821  * namelist: filled with a palloc'd list of directory names.
3822  * Caller should list_free_deep() this even on error return.
3823  *
3824  * Returns true if okay, false if there is a syntax error in the string.
3825  *
3826  * Note that an empty string is considered okay here.
3827  */
3828 bool
3829 SplitDirectoriesString(char *rawstring, char separator,
3830  List **namelist)
3831 {
3832  char *nextp = rawstring;
3833  bool done = false;
3834 
3835  *namelist = NIL;
3836 
3837  while (scanner_isspace(*nextp))
3838  nextp++; /* skip leading whitespace */
3839 
3840  if (*nextp == '\0')
3841  return true; /* allow empty string */
3842 
3843  /* At the top of the loop, we are at start of a new directory. */
3844  do
3845  {
3846  char *curname;
3847  char *endp;
3848 
3849  if (*nextp == '"')
3850  {
3851  /* Quoted name --- collapse quote-quote pairs */
3852  curname = nextp + 1;
3853  for (;;)
3854  {
3855  endp = strchr(nextp + 1, '"');
3856  if (endp == NULL)
3857  return false; /* mismatched quotes */
3858  if (endp[1] != '"')
3859  break; /* found end of quoted name */
3860  /* Collapse adjacent quotes into one quote, and look again */
3861  memmove(endp, endp + 1, strlen(endp));
3862  nextp = endp;
3863  }
3864  /* endp now points at the terminating quote */
3865  nextp = endp + 1;
3866  }
3867  else
3868  {
3869  /* Unquoted name --- extends to separator or end of string */
3870  curname = endp = nextp;
3871  while (*nextp && *nextp != separator)
3872  {
3873  /* trailing whitespace should not be included in name */
3874  if (!scanner_isspace(*nextp))
3875  endp = nextp + 1;
3876  nextp++;
3877  }
3878  if (curname == endp)
3879  return false; /* empty unquoted name not allowed */
3880  }
3881 
3882  while (scanner_isspace(*nextp))
3883  nextp++; /* skip trailing whitespace */
3884 
3885  if (*nextp == separator)
3886  {
3887  nextp++;
3888  while (scanner_isspace(*nextp))
3889  nextp++; /* skip leading whitespace for next */
3890  /* we expect another name, so done remains false */
3891  }
3892  else if (*nextp == '\0')
3893  done = true;
3894  else
3895  return false; /* invalid syntax */
3896 
3897  /* Now safe to overwrite separator with a null */
3898  *endp = '\0';
3899 
3900  /* Truncate path if it's overlength */
3901  if (strlen(curname) >= MAXPGPATH)
3902  curname[MAXPGPATH - 1] = '\0';
3903 
3904  /*
3905  * Finished isolating current name --- add it to list
3906  */
3907  curname = pstrdup(curname);
3908  canonicalize_path(curname);
3909  *namelist = lappend(*namelist, curname);
3910 
3911  /* Loop back if we didn't reach end of string */
3912  } while (!done);
3913 
3914  return true;
3915 }
3916 
3917 
3918 /*
3919  * SplitGUCList --- parse a string containing identifiers or file names
3920  *
3921  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3922  * presuming whether the elements will be taken as identifiers or file names.
3923  * We assume the input has already been through flatten_set_variable_args(),
3924  * so that we need never downcase (if appropriate, that was done already).
3925  * Nor do we ever truncate, since we don't know the correct max length.
3926  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3927  * because any embedded whitespace should have led to double-quoting).
3928  * Otherwise the API is identical to SplitIdentifierString.
3929  *
3930  * XXX it's annoying to have so many copies of this string-splitting logic.
3931  * However, it's not clear that having one function with a bunch of option
3932  * flags would be much better.
3933  *
3934  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3935  * Be sure to update that if you have to change this.
3936  *
3937  * Inputs:
3938  * rawstring: the input string; must be overwritable! On return, it's
3939  * been modified to contain the separated identifiers.
3940  * separator: the separator punctuation expected between identifiers
3941  * (typically '.' or ','). Whitespace may also appear around
3942  * identifiers.
3943  * Outputs:
3944  * namelist: filled with a palloc'd list of pointers to identifiers within
3945  * rawstring. Caller should list_free() this even on error return.
3946  *
3947  * Returns true if okay, false if there is a syntax error in the string.
3948  */
3949 bool
3950 SplitGUCList(char *rawstring, char separator,
3951  List **namelist)
3952 {
3953  char *nextp = rawstring;
3954  bool done = false;
3955 
3956  *namelist = NIL;
3957 
3958  while (scanner_isspace(*nextp))
3959  nextp++; /* skip leading whitespace */
3960 
3961  if (*nextp == '\0')
3962  return true; /* allow empty string */
3963 
3964  /* At the top of the loop, we are at start of a new identifier. */
3965  do
3966  {
3967  char *curname;
3968  char *endp;
3969 
3970  if (*nextp == '"')
3971  {
3972  /* Quoted name --- collapse quote-quote pairs */
3973  curname = nextp + 1;
3974  for (;;)
3975  {
3976  endp = strchr(nextp + 1, '"');
3977  if (endp == NULL)
3978  return false; /* mismatched quotes */
3979  if (endp[1] != '"')
3980  break; /* found end of quoted name */
3981  /* Collapse adjacent quotes into one quote, and look again */
3982  memmove(endp, endp + 1, strlen(endp));
3983  nextp = endp;
3984  }
3985  /* endp now points at the terminating quote */
3986  nextp = endp + 1;
3987  }
3988  else
3989  {
3990  /* Unquoted name --- extends to separator or whitespace */
3991  curname = nextp;
3992  while (*nextp && *nextp != separator &&
3993  !scanner_isspace(*nextp))
3994  nextp++;
3995  endp = nextp;
3996  if (curname == nextp)
3997  return false; /* empty unquoted name not allowed */
3998  }
3999 
4000  while (scanner_isspace(*nextp))
4001  nextp++; /* skip trailing whitespace */
4002 
4003  if (*nextp == separator)
4004  {
4005  nextp++;
4006  while (scanner_isspace(*nextp))
4007  nextp++; /* skip leading whitespace for next */
4008  /* we expect another name, so done remains false */
4009  }
4010  else if (*nextp == '\0')
4011  done = true;
4012  else
4013  return false; /* invalid syntax */
4014 
4015  /* Now safe to overwrite separator with a null */
4016  *endp = '\0';
4017 
4018  /*
4019  * Finished isolating current name --- add it to list
4020  */
4021  *namelist = lappend(*namelist, curname);
4022 
4023  /* Loop back if we didn't reach end of string */
4024  } while (!done);
4025 
4026  return true;
4027 }
4028 
4029 
4030 /*****************************************************************************
4031  * Comparison Functions used for bytea
4032  *
4033  * Note: btree indexes need these routines not to leak memory; therefore,
4034  * be careful to free working copies of toasted datums. Most places don't
4035  * need to be so careful.
4036  *****************************************************************************/
4037 
4038 Datum
4040 {
4041  Datum arg1 = PG_GETARG_DATUM(0);
4042  Datum arg2 = PG_GETARG_DATUM(1);
4043  bool result;
4044  Size len1,
4045  len2;
4046 
4047  /*
4048  * We can use a fast path for unequal lengths, which might save us from
4049  * having to detoast one or both values.
4050  */
4051  len1 = toast_raw_datum_size(arg1);
4052  len2 = toast_raw_datum_size(arg2);
4053  if (len1 != len2)
4054  result = false;
4055  else
4056  {
4057  bytea *barg1 = DatumGetByteaPP(arg1);
4058  bytea *barg2 = DatumGetByteaPP(arg2);
4059 
4060  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4061  len1 - VARHDRSZ) == 0);
4062 
4063  PG_FREE_IF_COPY(barg1, 0);
4064  PG_FREE_IF_COPY(barg2, 1);
4065  }
4066 
4067  PG_RETURN_BOOL(result);
4068 }
4069 
4070 Datum
4072 {
4073  Datum arg1 = PG_GETARG_DATUM(0);
4074  Datum arg2 = PG_GETARG_DATUM(1);
4075  bool result;
4076  Size len1,
4077  len2;
4078 
4079  /*
4080  * We can use a fast path for unequal lengths, which might save us from
4081  * having to detoast one or both values.
4082  */
4083  len1 = toast_raw_datum_size(arg1);
4084  len2 = toast_raw_datum_size(arg2);
4085  if (len1 != len2)
4086  result = true;
4087  else
4088  {
4089  bytea *barg1 = DatumGetByteaPP(arg1);
4090  bytea *barg2 = DatumGetByteaPP(arg2);
4091 
4092  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4093  len1 - VARHDRSZ) != 0);
4094 
4095  PG_FREE_IF_COPY(barg1, 0);
4096  PG_FREE_IF_COPY(barg2, 1);
4097  }
4098 
4099  PG_RETURN_BOOL(result);
4100 }
4101 
4102 Datum
4104 {
4105  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4106  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4107  int len1,
4108  len2;
4109  int cmp;
4110 
4111  len1 = VARSIZE_ANY_EXHDR(arg1);
4112  len2 = VARSIZE_ANY_EXHDR(arg2);
4113 
4114  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4115 
4116  PG_FREE_IF_COPY(arg1, 0);
4117  PG_FREE_IF_COPY(arg2, 1);
4118 
4119  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4120 }
4121 
4122 Datum
4124 {
4125  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4126  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4127  int len1,
4128  len2;
4129  int cmp;
4130 
4131  len1 = VARSIZE_ANY_EXHDR(arg1);
4132  len2 = VARSIZE_ANY_EXHDR(arg2);
4133 
4134  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4135 
4136  PG_FREE_IF_COPY(arg1, 0);
4137  PG_FREE_IF_COPY(arg2, 1);
4138 
4139  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4140 }
4141 
4142 Datum
4144 {
4145  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4146  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4147  int len1,
4148  len2;
4149  int cmp;
4150 
4151  len1 = VARSIZE_ANY_EXHDR(arg1);
4152  len2 = VARSIZE_ANY_EXHDR(arg2);
4153 
4154  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4155 
4156  PG_FREE_IF_COPY(arg1, 0);
4157  PG_FREE_IF_COPY(arg2, 1);
4158 
4159  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4160 }
4161 
4162 Datum
4164 {
4165  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4166  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4167  int len1,
4168  len2;
4169  int cmp;
4170 
4171  len1 = VARSIZE_ANY_EXHDR(arg1);
4172  len2 = VARSIZE_ANY_EXHDR(arg2);
4173 
4174  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4175 
4176  PG_FREE_IF_COPY(arg1, 0);
4177  PG_FREE_IF_COPY(arg2, 1);
4178 
4179  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4180 }
4181 
4182 Datum
4184 {
4185  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4186  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4187  int len1,
4188  len2;
4189  int cmp;
4190 
4191  len1 = VARSIZE_ANY_EXHDR(arg1);
4192  len2 = VARSIZE_ANY_EXHDR(arg2);
4193 
4194  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4195  if ((cmp == 0) && (len1 != len2))
4196  cmp = (len1 < len2) ? -1 : 1;
4197 
4198  PG_FREE_IF_COPY(arg1, 0);
4199  PG_FREE_IF_COPY(arg2, 1);
4200 
4201  PG_RETURN_INT32(cmp);
4202 }
4203 
4204 Datum
4206 {
4208  MemoryContext oldcontext;
4209 
4210  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4211 
4212  /* Use generic string SortSupport, forcing "C" collation */
4213  varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4214 
4215  MemoryContextSwitchTo(oldcontext);
4216 
4217  PG_RETURN_VOID();
4218 }
4219 
4220 /*
4221  * appendStringInfoText
4222  *
4223  * Append a text to str.
4224  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4225  */
4226 static void
4228 {
4230 }
4231 
4232 /*
4233  * replace_text
4234  * replace all occurrences of 'old_sub_str' in 'orig_str'
4235  * with 'new_sub_str' to form 'new_str'
4236  *
4237  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4238  * otherwise returns 'new_str'
4239  */
4240 Datum
4242 {
4243  text *src_text = PG_GETARG_TEXT_PP(0);
4244  text *from_sub_text = PG_GETARG_TEXT_PP(1);
4245  text *to_sub_text = PG_GETARG_TEXT_PP(2);
4246  int src_text_len;
4247  int from_sub_text_len;
4249  text *ret_text;
4250  int chunk_len;
4251  char *curr_ptr;
4252  char *start_ptr;
4254  bool found;
4255 
4256  src_text_len = VARSIZE_ANY_EXHDR(src_text);
4257  from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4258 
4259  /* Return unmodified source string if empty source or pattern */
4260  if (src_text_len < 1 || from_sub_text_len < 1)
4261  {
4262  PG_RETURN_TEXT_P(src_text);
4263  }
4264 
4265  text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4266 
4267  found = text_position_next(&state);
4268 
4269  /* When the from_sub_text is not found, there is nothing to do. */
4270  if (!found)
4271  {
4272  text_position_cleanup(&state);
4273  PG_RETURN_TEXT_P(src_text);
4274  }
4275  curr_ptr = text_position_get_match_ptr(&state);
4276  start_ptr = VARDATA_ANY(src_text);
4277 
4278  initStringInfo(&str);
4279 
4280  do
4281  {
4283 
4284  /* copy the data skipped over by last text_position_next() */
4285  chunk_len = curr_ptr - start_ptr;
4286  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4287 
4288  appendStringInfoText(&str, to_sub_text);
4289 
4290  start_ptr = curr_ptr + from_sub_text_len;
4291 
4292  found = text_position_next(&state);
4293  if (found)
4294  curr_ptr = text_position_get_match_ptr(&state);
4295  }
4296  while (found);
4297 
4298  /* copy trailing data */
4299  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4300  appendBinaryStringInfo(&str, start_ptr, chunk_len);
4301 
4302  text_position_cleanup(&state);
4303 
4304  ret_text = cstring_to_text_with_len(str.data, str.len);
4305  pfree(str.data);
4306 
4307  PG_RETURN_TEXT_P(ret_text);
4308 }
4309 
4310 /*
4311  * check_replace_text_has_escape_char
4312  *
4313  * check whether replace_text contains escape char.
4314  */
4315 static bool
4317 {
4318  const char *p = VARDATA_ANY(replace_text);
4319  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4320 
4322  {
4323  for (; p < p_end; p++)
4324  {
4325  if (*p == '\\')
4326  return true;
4327  }
4328  }
4329  else
4330  {
4331  for (; p < p_end; p += pg_mblen(p))
4332  {
4333  if (*p == '\\')
4334  return true;
4335  }
4336  }
4337 
4338  return false;
4339 }
4340 
4341 /*
4342  * appendStringInfoRegexpSubstr
4343  *
4344  * Append replace_text to str, substituting regexp back references for
4345  * \n escapes. start_ptr is the start of the match in the source string,
4346  * at logical character position data_pos.
4347  */
4348 static void
4350  regmatch_t *pmatch,
4351  char *start_ptr, int data_pos)
4352 {
4353  const char *p = VARDATA_ANY(replace_text);
4354  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4355  int eml = pg_database_encoding_max_length();
4356 
4357  for (;;)
4358  {
4359  const char *chunk_start = p;
4360  int so;
4361  int eo;
4362 
4363  /* Find next escape char. */
4364  if (eml == 1)
4365  {
4366  for (; p < p_end && *p != '\\'; p++)
4367  /* nothing */ ;
4368  }
4369  else
4370  {
4371  for (; p < p_end && *p != '\\'; p += pg_mblen(p))
4372  /* nothing */ ;
4373  }
4374 
4375  /* Copy the text we just scanned over, if any. */
4376  if (p > chunk_start)
4377  appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4378 
4379  /* Done if at end of string, else advance over escape char. */
4380  if (p >= p_end)
4381  break;
4382  p++;
4383 
4384  if (p >= p_end)
4385  {
4386  /* Escape at very end of input. Treat same as unexpected char */
4387  appendStringInfoChar(str, '\\');
4388  break;
4389  }
4390 
4391  if (*p >= '1' && *p <= '9')
4392  {
4393  /* Use the back reference of regexp. */
4394  int idx = *p - '0';
4395 
4396  so = pmatch[idx].rm_so;
4397  eo = pmatch[idx].rm_eo;
4398  p++;
4399  }
4400  else if (*p == '&')
4401  {
4402  /* Use the entire matched string. */
4403  so = pmatch[0].rm_so;
4404  eo = pmatch[0].rm_eo;
4405  p++;
4406  }
4407  else if (*p == '\\')
4408  {
4409  /* \\ means transfer one \ to output. */
4410  appendStringInfoChar(str, '\\');
4411  p++;
4412  continue;
4413  }
4414  else
4415  {
4416  /*
4417  * If escape char is not followed by any expected char, just treat
4418  * it as ordinary data to copy. (XXX would it be better to throw
4419  * an error?)
4420  */
4421  appendStringInfoChar(str, '\\');
4422  continue;
4423  }
4424 
4425  if (so != -1 && eo != -1)
4426  {
4427  /*
4428  * Copy the text that is back reference of regexp. Note so and eo
4429  * are counted in characters not bytes.
4430  */
4431  char *chunk_start;
4432  int chunk_len;
4433 
4434  Assert(so >= data_pos);
4435  chunk_start = start_ptr;
4436  chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4437  chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4438  appendBinaryStringInfo(str, chunk_start, chunk_len);
4439  }
4440  }
4441 }
4442 
4443 #define REGEXP_REPLACE_BACKREF_CNT 10
4444 
4445 /*
4446  * replace_text_regexp
4447  *
4448  * replace text that matches to regexp in src_text to replace_text.
4449  *
4450  * Note: to avoid having to include regex.h in builtins.h, we declare
4451  * the regexp argument as void *, but really it's regex_t *.
4452  */
4453 text *
4454 replace_text_regexp(text *src_text, void *regexp,
4455  text *replace_text, bool glob)
4456 {
4457  text *ret_text;
4458  regex_t *re = (regex_t *) regexp;
4459  int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4462  pg_wchar *data;
4463  size_t data_len;
4464  int search_start;
4465  int data_pos;
4466  char *start_ptr;
4467  bool have_escape;
4468 
4469  initStringInfo(&buf);
4470 
4471  /* Convert data string to wide characters. */
4472  data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4473  data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4474 
4475  /* Check whether replace_text has escape char. */
4476  have_escape = check_replace_text_has_escape_char(replace_text);
4477 
4478  /* start_ptr points to the data_pos'th character of src_text */
4479  start_ptr = (char *) VARDATA_ANY(src_text);
4480  data_pos = 0;
4481 
4482  search_start = 0;
4483  while (search_start <= data_len)
4484  {
4485  int regexec_result;
4486 
4488 
4489  regexec_result = pg_regexec(re,
4490  data,
4491  data_len,
4492  search_start,
4493  NULL, /* no details */
4495  pmatch,
4496  0);
4497 
4498  if (regexec_result == REG_NOMATCH)
4499  break;
4500 
4501  if (regexec_result != REG_OKAY)
4502  {
4503  char errMsg[100];
4504 
4506  pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4507  ereport(ERROR,
4508  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4509  errmsg("regular expression failed: %s", errMsg)));
4510  }
4511 
4512  /*
4513  * Copy the text to the left of the match position. Note we are given
4514  * character not byte indexes.
4515  */
4516  if (pmatch[0].rm_so - data_pos > 0)
4517  {
4518  int chunk_len;
4519 
4520  chunk_len = charlen_to_bytelen(start_ptr,
4521  pmatch[0].rm_so - data_pos);
4522  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4523 
4524  /*
4525  * Advance start_ptr over that text, to avoid multiple rescans of
4526  * it if the replace_text contains multiple back-references.
4527  */
4528  start_ptr += chunk_len;
4529  data_pos = pmatch[0].rm_so;
4530  }
4531 
4532  /*
4533  * Copy the replace_text. Process back references when the
4534  * replace_text has escape characters.
4535  */
4536  if (have_escape)
4537  appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4538  start_ptr, data_pos);
4539  else
4540  appendStringInfoText(&buf, replace_text);
4541 
4542  /* Advance start_ptr and data_pos over the matched text. */
4543  start_ptr += charlen_to_bytelen(start_ptr,
4544  pmatch[0].rm_eo - data_pos);
4545  data_pos = pmatch[0].rm_eo;
4546 
4547  /*
4548  * When global option is off, replace the first instance only.
4549  */
4550  if (!glob)
4551  break;
4552 
4553  /*
4554  * Advance search position. Normally we start the next search at the
4555  * end of the previous match; but if the match was of zero length, we
4556  * have to advance by one character, or we'd just find the same match
4557  * again.
4558  */
4559  search_start = data_pos;
4560  if (pmatch[0].rm_so == pmatch[0].rm_eo)
4561  search_start++;
4562  }
4563 
4564  /*
4565  * Copy the text to the right of the last match.
4566  */
4567  if (data_pos < data_len)
4568  {
4569  int chunk_len;
4570 
4571  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4572  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4573  }
4574 
4575  ret_text = cstring_to_text_with_len(buf.data, buf.len);
4576  pfree(buf.data);
4577  pfree(data);
4578 
4579  return ret_text;
4580 }
4581 
4582 /*
4583  * split_part
4584  * parse input string
4585  * return ord item (1 based)
4586  * based on provided field separator
      *
      * Argument 0 is the input string, argument 1 the field separator and
      * argument 2 the 1-based field number.  A field number below 1 is
      * rejected with ERRCODE_INVALID_PARAMETER_VALUE.  When the separator is
      * empty or does not occur at all, field 1 is the whole input string.
      * The text following the final separator counts as the last field.
      *
      * Separator occurrences are located with the text_position_* routines,
      * set up with the collation of this call.
4587  */
4588 Datum
4590 {
4591  text *inputstring = PG_GETARG_TEXT_PP(0);
4592  text *fldsep = PG_GETARG_TEXT_PP(1);
4593  int fldnum = PG_GETARG_INT32(2);
4594  int inputstring_len;
4595  int fldsep_len;
4597  char *start_ptr;
4598  char *end_ptr;
4599  text *result_text;
4600  bool found;
4601 
4602  /* field number is 1 based */
4603  if (fldnum < 1)
4604  ereport(ERROR,
4605  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4606  errmsg("field position must be greater than zero")));
4607 
4608  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4609  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4610 
4611  /* return empty string for empty input string */
4612  if (inputstring_len < 1)
4614 
4615  /* empty field separator */
4616  if (fldsep_len < 1)
4617  {
4618  /* if first field, return input string, else empty string */
4619  if (fldnum == 1)
4620  PG_RETURN_TEXT_P(inputstring);
4621  else
4623  }
4624 
4625  text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4626 
4627  /* identify bounds of first field */
4628  start_ptr = VARDATA_ANY(inputstring);
4629  found = text_position_next(&state);
4630 
4631  /* special case if fldsep not found at all */
4632  if (!found)
4633  {
4634  text_position_cleanup(&state);
4635  /* if field 1 requested, return input string, else empty string */
4636  if (fldnum == 1)
4637  PG_RETURN_TEXT_P(inputstring);
4638  else
4640  }
4641  end_ptr = text_position_get_match_ptr(&state);
4642 
      /*
       * Step from separator to separator while counting fldnum down.  The
       * loop ends with fldnum == 0 once the requested field is bracketed by
       * start_ptr and end_ptr, or with fldnum > 0 when the separators run out
       * first (start_ptr then points just past the last separator found).
       */
4643  while (found && --fldnum > 0)
4644  {
4645  /* identify bounds of next field */
4646  start_ptr = end_ptr + fldsep_len;
4647  found = text_position_next(&state);
4648  if (found)
4649  end_ptr = text_position_get_match_ptr(&state);
4650  }
4651 
4652  text_position_cleanup(&state);
4653 
4654  if (fldnum > 0)
4655  {
4656  /* N'th field separator not found */
4657  /* if last field requested, return it, else empty string */
4658  if (fldnum == 1)
4659  {
      /* number of input bytes that precede the last field */
4660  int last_len = start_ptr - VARDATA_ANY(inputstring);
4661 
4662  result_text = cstring_to_text_with_len(start_ptr,
4663  inputstring_len - last_len);
4664  }
4665  else
4666  result_text = cstring_to_text("");
4667  }
4668  else
4669  {
4670  /* non-last field requested */
4671  result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4672  }
4673 
4674  PG_RETURN_TEXT_P(result_text);
4675 }
4676 
4677 /*
4678  * Convenience function to return true when two text params are equal.
      *
      * txt1 and txt2 are handed to the underlying comparison as Datums,
      * together with the collation OID collid.
4679  */
4680 static bool
4681 text_isequal(text *txt1, text *txt2, Oid collid)
4682 {
4684  collid,
4685  PointerGetDatum(txt1),
4686  PointerGetDatum(txt2)));
4687 }
4688 
4689 /*
4690  * text_to_array
4691  * parse input string and return text array of elements,
4692  * based on provided field separator
      *
      * The splitting itself is done by split_text(); SQL NULL is returned when
      * split_text() reports that the result is null.  If tstate.astate is
      * still NULL after a successful split, no elements were accumulated.
4693  */
4694 Datum
4696 {
4697  SplitTextOutputData tstate;
4698 
4699  /* For array output, tstate should start as all zeroes */
4700  memset(&tstate, 0, sizeof(tstate));
4701 
4702  if (!split_text(fcinfo, &tstate))
4703  PG_RETURN_NULL();
4704 
      /* valid but empty result: split_text() left tstate untouched */
4705  if (tstate.astate == NULL)
4707 
4710 }
4711 
4712 /*
4713  * text_to_array_null
4714  * parse input string and return text array of elements,
4715  * based on provided field separator and null string
4716  *
4717  * This is a separate entry point only to prevent the regression tests from
4718  * complaining about different argument sets for the same internal function.
      *
      * All of the work is delegated to text_to_array(); the null-string
      * argument is picked up by split_text(), which looks at PG_NARGS().
4719  */
4720 Datum
4722 {
4723  return text_to_array(fcinfo);
4724 }
4725 
4726 /*
4727  * text_to_table
4728  * parse input string and return table of elements,
4729  * based on provided field separator
      *
      * The rows are collected in a tuplestore and handed back through the
      * caller's ReturnSetInfo (setResult/setDesc); the function's own return
      * value is a zero Datum.  An error is raised unless the caller supplied
      * a ReturnSetInfo whose allowedModes include SFRM_Materialize.
4730  */
4731 Datum
4733 {
4734  ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4735  SplitTextOutputData tstate;
4736  MemoryContext old_cxt;
4737 
4738  /* check to see if caller supports us returning a tuplestore */
4739  if (rsi == NULL || !IsA(rsi, ReturnSetInfo))
4740  ereport(ERROR,
4741  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4742  errmsg("set-valued function called in context that cannot accept a set")));
4743  if (!(rsi->allowedModes & SFRM_Materialize))
4744  ereport(ERROR,
4745  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4746  errmsg("materialize mode required, but it is not allowed in this context")));
4747 
4748  /* OK, prepare tuplestore in per-query memory */
4750 
      /* a non-NULL tupstore makes split_text_accum_result() emit rows */
4751  tstate.astate = NULL;
4752  tstate.tupdesc = CreateTupleDescCopy(rsi->expectedDesc);
4753  tstate.tupstore = tuplestore_begin_heap(true, false, work_mem);
4754 
4755  MemoryContextSwitchTo(old_cxt);
4756 
      /* the boolean result of split_text() is deliberately discarded here */
4757  (void) split_text(fcinfo, &tstate);
4758 
4759  tuplestore_donestoring(tstate.tupstore);
4760 
4762  rsi->setResult = tstate.tupstore;
4763  rsi->setDesc = tstate.tupdesc;
4764 
4765  return (Datum) 0;
4766 }
4767 
4768 /*
4769  * text_to_table_null
4770  * parse input string and return table of elements,
4771  * based on provided field separator and null string
4772  *
4773  * This is a separate entry point only to prevent the regression tests from
4774  * complaining about different argument sets for the same internal function.
      *
      * All of the work is delegated to text_to_table(); the null-string
      * argument is picked up by split_text(), which looks at PG_NARGS().
4775  */
4776 Datum
4778 {
4779  return text_to_table(fcinfo);
4780 }
4781 
4782 /*
4783  * Common code for text_to_array, text_to_array_null, text_to_table
4784  * and text_to_table_null functions.
4785  *
4786  * These are not strict so we have to test for null inputs explicitly.
4787  * Returns false if result is to be null, else returns true.
4788  *
4789  * Note that if the result is valid but empty (zero elements), we return
4790  * without changing *tstate --- caller must handle that case, too.
      *
      * Argument 0 is the input string, argument 1 the field separator (may be
      * NULL) and argument 2, if present, the null string.  Every field found
      * is handed to split_text_accum_result(), which stores it in *tstate.
4791  */
4792 static bool
4794 {
4795  text *inputstring;
4796  text *fldsep;
4797  text *null_string;
4798  Oid collation = PG_GET_COLLATION();
4799  int inputstring_len;
4800  int fldsep_len;
4801  char *start_ptr;
4802  text *result_text;
4803 
4804  /* when input string is NULL, then result is NULL too */
4805  if (PG_ARGISNULL(0))
4806  return false;
4807 
4808  inputstring = PG_GETARG_TEXT_PP(0);
4809 
4810  /* fldsep can be NULL */
4811  if (!PG_ARGISNULL(1))
4812  fldsep = PG_GETARG_TEXT_PP(1);
4813  else
4814  fldsep = NULL;
4815 
4816  /* null_string can be NULL or omitted */
4817  if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4818  null_string = PG_GETARG_TEXT_PP(2);
4819  else
4820  null_string = NULL;
4821 
4822  if (fldsep != NULL)
4823  {
4824  /*
4825  * Normal case with non-null fldsep. Use the text_position machinery
4826  * to search for occurrences of fldsep.
4827  */
4829 
4830  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4831  fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4832 
4833  /* return empty set for empty input string */
4834  if (inputstring_len < 1)
4835  return true;
4836 
4837  /* empty field separator: return input string as a one-element set */
4838  if (fldsep_len < 1)
4839  {
4840  split_text_accum_result(tstate, inputstring,
4841  null_string, collation);
4842  return true;
4843  }
4844 
4845  text_position_setup(inputstring, fldsep, collation, &state);
4846 
4847  start_ptr = VARDATA_ANY(inputstring);
4848 
      /* one iteration per field; the field after the last separator ends it */
4849  for (;;)
4850  {
4851  bool found;
4852  char *end_ptr;
4853  int chunk_len;
4854 
4856 
4857  found = text_position_next(&state);
4858  if (!found)
4859  {
4860  /* fetch last field */
4861  chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4862  end_ptr = NULL; /* not used, but some compilers complain */
4863  }
4864  else
4865  {
4866  /* fetch non-last field */
4867  end_ptr = text_position_get_match_ptr(&state);
4868  chunk_len = end_ptr - start_ptr;
4869  }
4870 
4871  /* build a temp text datum to pass to split_text_accum_result */
4872  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4873 
4874  /* stash away this field */
4875  split_text_accum_result(tstate, result_text,
4876  null_string, collation);
4877 
4878  pfree(result_text);
4879 
4880  if (!found)
4881  break;
4882 
      /* the next field begins right after the separator just matched */
4883  start_ptr = end_ptr + fldsep_len;
4884  }
4885 
4886  text_position_cleanup(&state);
4887  }
4888  else
4889  {
4890  /*
4891  * When fldsep is NULL, each character in the input string becomes a
4892  * separate element in the result set. The separator is effectively
4893  * the space between characters.
4894  */
4895  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4896 
4897  start_ptr = VARDATA_ANY(inputstring);
4898 
      /* inputstring_len counts the bytes that remain to be emitted */
4899  while (inputstring_len > 0)
4900  {
4901  int chunk_len = pg_mblen(start_ptr);
4902 
4904 
4905  /* build a temp text datum to pass to split_text_accum_result */
4906  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4907 
4908  /* stash away this field */
4909  split_text_accum_result(tstate, result_text,
4910  null_string, collation);
4911 
4912  pfree(result_text);
4913 
4914  start_ptr += chunk_len;
4915  inputstring_len -= chunk_len;
4916  }
4917  }
4918 
4919  return true;
4920 }
4921 
4922 /*
4923  * Add text item to result set (table or array).
4924  *
4925  * This is also responsible for checking to see if the item matches
4926  * the null_string, in which case we should emit NULL instead.
      *
      * A non-NULL tstate->tupstore selects table output (one single-column
      * row per item, described by tstate->tupdesc); otherwise the item is
      * appended to the array build state kept in tstate->astate.  With a NULL
      * null_string no item is ever turned into NULL.
4927  */
4928 static void
4930  text *field_value,
4931  text *null_string,
4932  Oid collation)
4933 {
4934  bool is_null = false;
4935 
4936  if (null_string && text_isequal(field_value, null_string, collation))
4937  is_null = true;
4938 
4939  if (tstate->tupstore)
4940  {
4941  Datum values[1];
4942  bool nulls[1];
4943 
4944  values[0] = PointerGetDatum(field_value);
4945  nulls[0] = is_null;
4946 
4948  tstate->tupdesc,
4949  values,
4950  nulls);
4951  }
4952  else
4953  {
4954  tstate->astate = accumArrayResult(tstate->astate,
4955  PointerGetDatum(field_value),
4956  is_null,
4957  TEXTOID,
4959  }
4960 }
4961 
4962 /*
4963  * array_to_text
4964  * concatenate Cstring representation of input array elements
4965  * using provided field separator
      *
      * Passes NULL as the null string to array_to_text_internal(), which makes
      * it skip null array elements.
4966  */
4967 Datum
4969 {
4971  char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4972 
4973  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4974 }
4975 
4976 /*
4977  * array_to_text_null
4978  * concatenate Cstring representation of input array elements
4979  * using provided field separator and null string
4980  *
4981  * This version is not strict so we have to test for null inputs explicitly.
      *
      * Argument 0 is the array, argument 1 the field separator and argument 2
      * the string to print for null elements.  When argument 2 is NULL, null
      * elements are skipped by array_to_text_internal().
4982  */
4983 Datum
4985 {
4986  ArrayType *v;
4987  char *fldsep;
4988  char *null_string;
4989 
4990  /* returns NULL when first or second parameter is NULL */
4991  if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4992  PG_RETURN_NULL();
4993 
4994  v = PG_GETARG_ARRAYTYPE_P(0);
4995  fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4996 
4997  /* NULL null string is passed through as a null pointer */
4998  if (!PG_ARGISNULL(2))
4999  null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5000  else
5001  null_string = NULL;
5002 
5003  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5004 }
5005 
5006 /*
5007  * common code for array_to_text and array_to_text_null functions
      *
      * Callers pass (fcinfo, v, fldsep, null_string): v is the array to print,
      * fldsep is written between consecutive printed items, and null_string is
      * printed in place of null elements (null elements are skipped entirely
      * when null_string is NULL).  Each non-null element is converted with its
      * type's output function.  An array without elements yields an empty
      * string.
      *
      * Element-type I/O information is cached in fcinfo->flinfo->fn_extra and
      * refreshed whenever the element type differs from the cached one.
5008  */
5009 static text *
5011  const char *fldsep, const char *null_string)
5012 {
5013  text *result;
5014  int nitems,
5015  *dims,
5016  ndims;
5017  Oid element_type;
5018  int typlen;
5019  bool typbyval;
5020  char typalign;
5022  bool printed = false;
5023  char *p;
5024  bits8 *bitmap;
5025  int bitmask;
5026  int i;
5027  ArrayMetaState *my_extra;
5028 
5029  ndims = ARR_NDIM(v);
5030  dims = ARR_DIMS(v);
5031  nitems = ArrayGetNItems(ndims, dims);
5032 
5033  /* if there are no elements, return an empty string */
5034  if (nitems == 0)
5035  return cstring_to_text_with_len("", 0);
5036 
5037  element_type = ARR_ELEMTYPE(v);
5038  initStringInfo(&buf);
5039 
5040  /*
5041  * We arrange to look up info about element type, including its output
5042  * conversion proc, only once per series of calls, assuming the element
5043  * type doesn't change underneath us.
5044  */
5045  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5046  if (my_extra == NULL)
5047  {
5048  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5049  sizeof(ArrayMetaState));
5050  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
      /* guaranteed to differ from element_type, forcing the lookup below */
5051  my_extra->element_type = ~element_type;
5052  }
5053 
5054  if (my_extra->element_type != element_type)
5055  {
5056  /*
5057  * Get info about element type, including its output conversion proc
5058  */
5059  get_type_io_data(element_type, IOFunc_output,
5060  &my_extra->typlen, &my_extra->typbyval,
5061  &my_extra->typalign, &my_extra->typdelim,
5062  &my_extra->typioparam, &my_extra->typiofunc);
5063  fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5064  fcinfo->flinfo->fn_mcxt);
5065  my_extra->element_type = element_type;
5066  }
5067  typlen = my_extra->typlen;
5068  typbyval = my_extra->typbyval;
5069  typalign = my_extra->typalign;
5070 
5071  p = ARR_DATA_PTR(v);
5072  bitmap = ARR_NULLBITMAP(v);
5073  bitmask = 1;
5074 
5075  for (i = 0; i < nitems; i++)
5076  {
5077  Datum itemvalue;
5078  char *value;
5079 
5080  /* Get source element, checking for NULL */
      /* (a clear bit in the null bitmap marks the element as NULL) */
5081  if (bitmap && (*bitmap & bitmask) == 0)
5082  {
5083  /* if null_string is NULL, we just ignore null elements */
5084  if (null_string != NULL)
5085  {
5086  if (printed)
5087  appendStringInfo(&buf, "%s%s", fldsep, null_string);
5088  else
5089  appendStringInfoString(&buf, null_string);
5090  printed = true;
5091  }
5092  }
5093  else
5094  {
5095  itemvalue = fetch_att(p, typbyval, typlen);
5096 
5097  value = OutputFunctionCall(&my_extra->proc, itemvalue);
5098 
      /* the separator goes before every item except the first printed */
5099  if (printed)
5100  appendStringInfo(&buf, "%s%s", fldsep, value);
5101  else
5102  appendStringInfoString(&buf, value);
5103  printed = true;
5104 
      /* step p past this element and realign it for the next one */
5105  p = att_addlength_pointer(p, typlen, p);
5106  p = (char *) att_align_nominal(p, typalign);
5107  }
5108 
5109  /* advance bitmap pointer if any */
5110  if (bitmap)
5111  {
5112  bitmask <<= 1;
      /* all eight bits of this bitmap byte used: move on to the next */
5113  if (bitmask == 0x100)
5114  {
5115  bitmap++;
5116  bitmask = 1;
5117  }
5118  }
5119  }
5120 
5121  result = cstring_to_text_with_len(buf.data, buf.len);
5122  pfree(buf.data);
5123 
5124  return result;
5125 }
5126 
5127 #define HEXBASE 16
5128 /*
5129  * Convert an int32 to a string containing a base 16 (hex) representation of
5130  * the number.
      *
      * Digits are produced from least to most significant and written
      * backwards from the end of buf, so ptr finishes on the first digit of
      * the NUL-terminated result.  The do/while form guarantees at least one
      * digit, so zero comes out as "0".  Lowercase digits are used.
5131  */
5132 Datum
5134 {
5136  char *ptr;
5137  const char *digits = "0123456789abcdef";
5138  char buf[32]; /* bigger than needed, but reasonable */
5139 
5140  ptr = buf + sizeof(buf) - 1;
5141  *ptr = '\0';
5142 
5143  do
5144  {
5145  *--ptr = digits[value % HEXBASE];
5146  value /= HEXBASE;
5147  } while (ptr > buf && value);
5148 
5150 }
5151 
5152 /*
5153  * Convert an int64 to a string containing a base 16 (hex) representation of
5154  * the number.
      *
      * The argument is cast to uint64 before conversion, so the digits are
      * those of its unsigned bit pattern.  Digits are produced from least to
      * most significant and written backwards from the end of buf, so ptr
      * finishes on the first digit of the NUL-terminated result.  The
      * do/while form guarantees at least one digit, so zero comes out as "0".
5155  */
5156 Datum
5158 {
5159  uint64 value = (uint64) PG_GETARG_INT64(0);
5160  char *ptr;
5161  const char *digits = "0123456789abcdef";
5162  char buf[32]; /* bigger than needed, but reasonable */
5163 
5164  ptr = buf + sizeof(buf) - 1;
5165  *ptr = '\0';
5166 
5167  do
5168  {
5169  *--ptr = digits[value % HEXBASE];
5170  value /= HEXBASE;
5171  } while (ptr > buf && value);
5172 
5174 }
5175 
5176 /*
5177  * Return the size of a datum, possibly compressed
5178  *
5179  * Works on any data type
      *
      * The argument type's typlen is looked up on the first call and cached in
      * fn_extra.  A typlen of -1 (varlena) is measured with
      * toast_datum_size(), a typlen of -2 (cstring) is reported as the string
      * length plus one for the terminating NUL, and any other typlen is
      * returned unchanged as the fixed width of the type.
5180  */
5181 Datum
5183 {
5185  int32 result;
5186  int typlen;
5187 
5188  /* On first call, get the input type's typlen, and save at *fn_extra */
5189  if (fcinfo->flinfo->fn_extra == NULL)
5190  {
5191  /* Lookup the datatype of the supplied argument */
5192  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5193 
5194  typlen = get_typlen(argtypeid);
5195  if (typlen == 0) /* should not happen */
5196  elog(ERROR, "cache lookup failed for type %u", argtypeid);
5197 
5198  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5199  sizeof(int));
5200  *((int *) fcinfo->flinfo->fn_extra) = typlen;
5201  }
5202  else
5203  typlen = *((int *) fcinfo->flinfo->fn_extra);
5204 
5205  if (typlen == -1)
5206  {
5207  /* varlena type, possibly toasted */
5208  result = toast_datum_size(value);
5209  }
5210  else if (typlen == -2)
5211  {
5212  /* cstring */
5213  result = strlen(DatumGetCString(value)) + 1;
5214  }
5215  else
5216  {
5217  /* ordinary fixed-width type */
5218  result = typlen;
5219  }
5220 
5221  PG_RETURN_INT32(result);
5222 }
5223 
5224 /*
5225  * string_agg - Concatenates values and returns string.
5226  *
5227  * Syntax: string_agg(value text, delimiter text) RETURNS text
5228  *
5229  * Note: Any NULL values are ignored. The first-call delimiter isn't
5230  * actually used at all, and on subsequent calls the delimiter precedes
5231  * the associated value.
5232  */
5233 
5234 /* subroutine to initialize state */
/*
 * Creates an empty StringInfo in the aggregate memory context, so that it
 * stays valid across the transition calls of one aggregation, and returns
 * it.  Raises an error when AggCheckCallContext() reports that the call is
 * not being made in an aggregate context.
 */
5235 static StringInfo
5237 {
5238  StringInfo state;
5239  MemoryContext aggcontext;
5240  MemoryContext oldcontext;
5241 
5242  if (!AggCheckCallContext(fcinfo, &aggcontext))
5243  {
5244  /* cannot be called directly because of internal-type argument */
5245  elog(ERROR, "string_agg_transfn called in non-aggregate context");
5246  }
5247 
5248  /*
5249  * Create state in aggregate context. It'll stay there across subsequent
5250  * calls.
5251  */
5252  oldcontext = MemoryContextSwitchTo(aggcontext);
5253  state = makeStringInfo();
5254  MemoryContextSwitchTo(oldcontext);
5255 
5256  return state;
5257 }
5258 
/*
 * string_agg transition function.
 *
 * Argument 0 is the running state (a StringInfo, or null before the first
 * non-null value has been seen), argument 1 the value and argument 2 the
 * delimiter.  A null value leaves the state untouched; a null delimiter is
 * simply not appended.  The (possibly still NULL) state pointer is returned.
 */
5259 Datum
5261 {
5262  StringInfo state;
5263 
5264  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5265 
5266  /* Append the value unless null. */
5267  if (!PG_ARGISNULL(1))
5268  {
5269  /* On the first time through, we ignore the delimiter. */
5270  if (state == NULL)
5271  state = makeStringAggState(fcinfo);
5272  else if (!PG_ARGISNULL(2))
5273  appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
5274 
5275  appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
5276  }
5277 
5278  /*
5279  * The transition type for string_agg() is declared to be "internal",
5280  * which is a pass-by-value type the same size as a pointer.
5281  */
5282  PG_RETURN_POINTER(state);
5283 }
5284 
/*
 * string_agg final function.
 *
 * Argument 0 is the transition state.  SQL NULL is returned when no state
 * was ever created (the argument is null); otherwise the result is built
 * from the accumulated state.
 */
5285 Datum
5287 {
5288  StringInfo state;
5289 
5290  /* cannot be called directly because of internal-type argument */
5291  Assert(AggCheckCallContext(fcinfo, NULL));
5292 
5293  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5294 
5295  if (state != NULL)
5297  else
5298  PG_RETURN_NULL();
5299 }
5300 
5301 /*
5302  * Prepare cache with fmgr info for the output functions of the datatypes of
5303  * the arguments of a concat-like function, beginning with argument "argidx".
5304  * (Arguments before that will have corresponding slots in the resulting
5305  * FmgrInfo array, but we don't fill those slots.)
      *
      * The array has PG_NARGS() entries and is indexed by argument number.  It
      * is stored in fcinfo->flinfo->fn_extra and also returned.  An error is
      * raised if the data type of an argument cannot be determined.
5306  */
5307 static FmgrInfo *
5309 {
5310  FmgrInfo *foutcache;
5311  int i;
5312 
5313  /* We keep the info in fn_mcxt so it survives across calls */
5314  foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5315  PG_NARGS() * sizeof(FmgrInfo));
5316 
5317  for (i = argidx; i < PG_NARGS(); i++)
5318  {
5319  Oid valtype;
5320  Oid typOutput;
5321  bool typIsVarlena;
5322 
5323  valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5324  if (!OidIsValid(valtype))
5325  elog(ERROR, "could not determine data type of concat() input");
5326 
5327  getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5328  fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5329  }
5330 
5331  fcinfo->flinfo->fn_extra = foutcache;
5332 
5333  return foutcache;
5334 }
5335 
5336 /*
5337  * Implementation of both concat() and concat_ws().
5338  *
5339  * sepstr is the separator string to place between values.
5340  * argidx identifies the first argument to concatenate (counting from zero);
5341  * note that this must be constant across any one series of calls.
5342  *
5343  * Returns NULL if result should be NULL, else text value.
      *
      * NULL is returned only for a VARIADIC call whose array argument is
      * itself null.  In the normal case null arguments are skipped, and the
      * separator is placed only between values that are actually output.
5344  */
5345 static text *
5346 concat_internal(const char *sepstr, int argidx,
5347  FunctionCallInfo fcinfo)
5348 {
5349  text *result;
5351  FmgrInfo *foutcache;
5352  bool first_arg = true;
5353  int i;
5354 
5355  /*
5356  * concat(VARIADIC some-array) is essentially equivalent to
5357  * array_to_text(), ie concat the array elements with the given separator.
5358  * So we just pass the case off to that code.
5359  */
5360  if (get_fn_expr_variadic(fcinfo->flinfo))
5361  {
5362  ArrayType *arr;
5363 
5364  /* Should have just the one argument */
5365  Assert(argidx == PG_NARGS() - 1);
5366 
5367  /* concat(VARIADIC NULL) is defined as NULL */
5368  if (PG_ARGISNULL(argidx))
5369  return NULL;
5370 
5371  /*
5372  * Non-null argument had better be an array. We assume that any call
5373  * context that could let get_fn_expr_variadic return true will have
5374  * checked that a VARIADIC-labeled parameter actually is an array. So
5375  * it should be okay to just Assert that it's an array rather than
5376  * doing a full-fledged error check.
5377  */
5379 
5380  /* OK, safe to fetch the array value */
5381  arr = PG_GETARG_ARRAYTYPE_P(argidx);
5382 
5383  /*
5384  * And serialize the array. We tell array_to_text to ignore null
5385  * elements, which matches the behavior of the loop below.
5386  */
5387  return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5388  }
5389 
5390  /* Normal case without explicit VARIADIC marker */
5391  initStringInfo(&str);
5392 
5393  /* Get output function info, building it if first time through */
5394  foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5395  if (foutcache == NULL)
5396  foutcache = build_concat_foutcache(fcinfo, argidx);
5397 
5398  for (i = argidx; i < PG_NARGS(); i++)
5399  {
5400  if (!PG_ARGISNULL(i))
5401  {
5403 
5404  /* add separator if appropriate */
      /* (first_arg tracks the first non-null argument, not index argidx) */
5405  if (first_arg)
5406  first_arg = false;
5407  else
5408  appendStringInfoString(&str, sepstr);
5409 
5410  /* call the appropriate type output function, append the result */
5412  OutputFunctionCall(&foutcache[i], value));
5413  }
5414  }
5415 
5416  result = cstring_to_text_with_len(str.data, str.len);
5417  pfree(str.data);
5418 
5419  return result;
5420 }
5421 
5422 /*
5423  * Concatenate all arguments. NULL arguments are ignored.
      *
      * Calls concat_internal() with an empty separator, starting at
      * argument 0, and returns SQL NULL when that reports a NULL result.
5424  */
5425 Datum
5427 {
5428  text *result;
5429 
5430  result = concat_internal("", 0, fcinfo);
5431  if (result == NULL)
5432  PG_RETURN_NULL();
5433  PG_RETURN_TEXT_P(result);
5434 }
5435 
5436 /*
5437  * Concatenate all but first argument value with separators. The first
5438  * parameter is used as the separator. NULL arguments are ignored.
      *
      * A null separator yields SQL NULL.  Otherwise concat_internal() is
      * called starting at argument 1, and SQL NULL is returned when that
      * reports a NULL result.
5439  */
5440 Datum
5442 {
5443  char *sep;
5444  text *result;
5445 
5446  /* return NULL when separator is NULL */
5447  if (PG_ARGISNULL(0))
5448  PG_RETURN_NULL();
5450 
5451  result = concat_internal(sep, 1, fcinfo);
5452  if (result == NULL)
5453  PG_RETURN_NULL();
5454  PG_RETURN_TEXT_P(result);
5455 }
5456 
5457 /*
5458  * Return first n characters in the string. When n is negative,
5459  * return all but last |n| characters.
      *
      * Argument 0 is the string and argument 1 is n.  For negative n the
      * character count of the string is computed first and n is added to it;
      * pg_mbcharcliplen() then converts that character count into the number
      * of leading bytes to keep.
5460  */
5461 Datum
5463 {
5464  int n = PG_GETARG_INT32(1);
5465 
5466  if (n < 0)
5467  {
5468  text *str = PG_GETARG_TEXT_PP(0);
5469  const char *p = VARDATA_ANY(str);
5470  int len = VARSIZE_ANY_EXHDR(str);
5471  int rlen;
5472 
      /* n becomes (character count - |n|), the number of characters kept */
5473  n = pg_mbstrlen_with_len(p, len) + n;
5474  rlen = pg_mbcharcliplen(p, len, n);
5476  }
5477  else
5479 }
5480 
5481 /*
5482  * Return last n characters in the string. When n is negative,
5483  * return all but first |n| characters.
      *
      * n is first turned into the number of leading characters to drop: |n|
      * when n is negative, else the character count minus n.  'off' is the
      * byte length of that prefix as computed by pg_mbcharcliplen(), and the
      * result is the rest of the string starting at byte offset 'off'.
5484  */
5485 Datum
5487 {
5488  text *str = PG_GETARG_TEXT_PP(0);
5489  const char *p = VARDATA_ANY(str);
5490  int len = VARSIZE_ANY_EXHDR(str);
5491  int n = PG_GETARG_INT32(1);
5492  int off;
5493 
5494  if (n < 0)
5495  n = -n;
5496  else
5497  n = pg_mbstrlen_with_len(p, len) - n;
5498  off = pg_mbcharcliplen(p, len, n);
5499 
5500  PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5501 }
5502 
5503 /*
5504  * Return reversed string
      *
      * The result has the same byte length as the input.  The source is read
      * front to back while dst fills the result from its end towards its
      * start.  In the multibyte branch each character's bytes are copied as a
      * unit (length taken from pg_mblen), so characters rather than
      * individual bytes are reversed.
5505  */
5506 Datum
5508 {
5509  text *str = PG_GETARG_TEXT_PP(0);
5510  const char *p = VARDATA_ANY(str);
5511  int len = VARSIZE_ANY_EXHDR(str);
5512  const char *endp = p + len;
5513  text *result;
5514  char *dst;
5515 
5516  result = palloc(len + VARHDRSZ);
      /* dst starts one past the last data byte and is moved backwards */
5517  dst = (char *) VARDATA(result) + len;
5518  SET_VARSIZE(result, len + VARHDRSZ);
5519 
5521  {
5522  /* multibyte version */
5523  while (p < endp)
5524  {
5525  int sz;
5526 
5527  sz = pg_mblen(p);
5528  dst -= sz;
5529  memcpy(dst, p, sz);
5530  p += sz;
5531  }
5532  }
5533  else
5534  {
5535  /* single byte version */
5536  while (p < endp)
5537  *(--dst) = *p++;
5538  }
5539 
5540  PG_RETURN_TEXT_P(result);
5541 }
5542 
5543 
5544 /*
5545  * Support macros for text_format()
      *
      * ADVANCE_PARSE_POINTER increments ptr in place by one byte and raises an
      * "unterminated format() type specifier" error if that reaches end_ptr,
      * so on normal exit at least one more byte is available at ptr.
5546  */
5547 #define TEXT_FORMAT_FLAG_MINUS 0x0001 /* is minus flag present? */
5548 
5549 #define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
5550  do { \
5551  if (++(ptr) >= (end_ptr)) \
5552  ereport(ERROR, \
5553  (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
5554  errmsg("unterminated format() type specifier"), \
5555  errhint("For a single \"%%\" use \"%%%%\"."))); \
5556  } while (0)
5557 
5558 /*
5559  * Returns a formatted string
      *
      * Argument 0 is the format string; a null format yields SQL NULL.  The
      * values to format are either the remaining arguments or, when the call
      * is marked VARIADIC, the elements of the single array argument (a null
      * array is treated as having no elements).  Only the conversion types
      * 's', 'I' and 'L' are accepted, and "%%" emits a literal percent sign.
      *
      * 'arg' is the 1-based position of the next value to consume; an
      * explicit position in a specifier overrides it, and it advances past
      * each value that is fetched (including values used as a width).
5560  */
5561 Datum
5563 {
5564  text *fmt;
5566  const char *cp;
5567  const char *start_ptr;
5568  const char *end_ptr;
5569  text *result;
5570  int arg;
5571  bool funcvariadic;
5572  int nargs;
5573  Datum *elements = NULL;
5574  bool *nulls = NULL;
5575  Oid element_type = InvalidOid;
5576  Oid prev_type = InvalidOid;
5577  Oid prev_width_type = InvalidOid;
5578  FmgrInfo typoutputfinfo;
5579  FmgrInfo typoutputinfo_width;
5580 
5581  /* When format string is null, immediately return null */
5582  if (PG_ARGISNULL(0))
5583  PG_RETURN_NULL();
5584 
5585  /* If argument is marked VARIADIC, expand array into elements */
5586  if (get_fn_expr_variadic(fcinfo->flinfo))
5587  {
5588  ArrayType *arr;
5589  int16 elmlen;
5590  bool elmbyval;
5591  char elmalign;
5592  int nitems;
5593 
5594  /* Should have just the one argument */
5595  Assert(PG_NARGS() == 2);
5596 
5597  /* If argument is NULL, we treat it as zero-length array */
5598  if (PG_ARGISNULL(1))
5599  nitems = 0;
5600  else
5601  {
5602  /*
5603  * Non-null argument had better be an array. We assume that any
5604  * call context that could let get_fn_expr_variadic return true
5605  * will have checked that a VARIADIC-labeled parameter actually is
5606  * an array. So it should be okay to just Assert that it's an
5607  * array rather than doing a full-fledged error check.
5608  */
5610 
5611  /* OK, safe to fetch the array value */
5612  arr = PG_GETARG_ARRAYTYPE_P(1);
5613 
5614  /* Get info about array element type */
5615  element_type = ARR_ELEMTYPE(arr);
5616  get_typlenbyvalalign(element_type,
5617  &elmlen, &elmbyval, &elmalign);
5618 
5619  /* Extract all array elements */
5620  deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5621  &elements, &nulls, &nitems);
5622  }
5623 
      /* count the format string too, so positions compare like PG_NARGS() */
5624  nargs = nitems + 1;
5625  funcvariadic = true;
5626  }
5627  else
5628  {
5629  /* Non-variadic case, we'll process the arguments individually */
5630  nargs = PG_NARGS();
5631  funcvariadic = false;
5632  }
5633 
5634  /* Setup for main loop. */
5635  fmt = PG_GETARG_TEXT_PP(0);
5636  start_ptr = VARDATA_ANY(fmt);
5637  end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5638  initStringInfo(&str);
5639  arg = 1; /* next argument position to print */
5640 
5641  /* Scan format string, looking for conversion specifiers. */
5642  for (cp = start_ptr; cp < end_ptr; cp++)
5643  {
5644  int argpos;
5645  int widthpos;
5646  int flags;
5647  int width;
5648  Datum value;
5649  bool isNull;
5650  Oid typid;
5651 
5652  /*
5653  * If it's not the start of a conversion specifier, just copy it to
5654  * the output buffer.
5655  */
5656  if (*cp != '%')
5657  {
5658  appendStringInfoCharMacro(&str, *cp);
5659  continue;
5660  }
5661 
5662  ADVANCE_PARSE_POINTER(cp, end_ptr);
5663 
5664  /* Easy case: %% outputs a single % */
5665  if (*cp == '%')
5666  {
5667  appendStringInfoCharMacro(&str, *cp);
5668  continue;
5669  }
5670 
5671  /* Parse the optional portions of the format specifier */
5672  cp = text_format_parse_format(cp, end_ptr,
5673  &argpos, &widthpos,
5674  &flags, &width);
5675 
5676  /*
5677  * Next we should see the main conversion specifier. Whether or not
5678  * an argument position was present, it's known that at least one
5679  * character remains in the string at this point. Experience suggests
5680  * that it's worth checking that that character is one of the expected
5681  * ones before we try to fetch arguments, so as to produce the least
5682  * confusing response to a mis-formatted specifier.
5683  */
5684  if (strchr("sIL", *cp) == NULL)
5685  ereport(ERROR,
5686  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5687  errmsg("unrecognized format() type specifier \"%.*s\"",
5688  pg_mblen(cp), cp),
5689  errhint("For a single \"%%\" use \"%%%%\".")));
5690 
5691  /* If indirect width was specified, get its value */
5692  if (widthpos >= 0)
5693  {
5694  /* Collect the specified or next argument position */
5695  if (widthpos > 0)
5696  arg = widthpos;
5697  if (arg >= nargs)
5698  ereport(ERROR,
5699  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5700  errmsg("too few arguments for format()")));
5701 
5702  /* Get the value and type of the selected argument */
5703  if (!funcvariadic)
5704  {
5705  value = PG_GETARG_DATUM(arg);
5706  isNull = PG_ARGISNULL(arg);
5707  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5708  }
5709  else
5710  {
      /* position 1 corresponds to array element 0 */
5711  value = elements[arg - 1];
5712  isNull = nulls[arg - 1];
5713  typid = element_type;
5714  }
5715  if (!OidIsValid(typid))
5716  elog(ERROR, "could not determine data type of format() input");
5717 
5718  arg++;
5719 
5720  /* We can treat NULL width the same as zero */
5721  if (isNull)
5722  width = 0;
5723  else if (typid == INT4OID)
5724  width = DatumGetInt32(value);
5725  else if (typid == INT2OID)
5726  width = DatumGetInt16(value);
5727  else
5728  {
5729  /* For less-usual datatypes, convert to text then to int */
      /* NOTE(review): this local shadows the function-level buffer "str" */
5730  char *str;
5731 
5732  if (typid != prev_width_type)
5733  {
5734  Oid typoutputfunc;
5735  bool typIsVarlena;
5736 
5737  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5738  fmgr_info(typoutputfunc, &typoutputinfo_width);
5739  prev_width_type = typid;
5740  }
5741 
5742  str = OutputFunctionCall(&typoutputinfo_width, value);
5743 
5744  /* pg_strtoint32 will complain about bad data or overflow */
5745  width = pg_strtoint32(str);
5746 
5747  pfree(str);
5748  }
5749  }
5750 
5751  /* Collect the specified or next argument position */
5752  if (argpos > 0)
5753  arg = argpos;
5754  if (arg >= nargs)
5755  ereport(ERROR,
5756  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5757  errmsg("too few arguments for format()")));
5758 
5759  /* Get the value and type of the selected argument */
5760  if (!funcvariadic)
5761  {
5762  value = PG_GETARG_DATUM(arg);
5763  isNull = PG_ARGISNULL(arg);
5764  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5765  }
5766  else
5767  {
      /* position 1 corresponds to array element 0 */
5768  value = elements[arg - 1];
5769  isNull = nulls[arg - 1];
5770  typid = element_type;
5771  }
5772  if (!OidIsValid(typid))
5773  elog(ERROR, "could not determine data type of format() input");
5774 
5775  arg++;
5776 
5777  /*
5778  * Get the appropriate typOutput function, reusing previous one if
5779  * same type as previous argument. That's particularly useful in the
5780  * variadic-array case, but often saves work even for ordinary calls.
5781  */
5782  if (typid != prev_type)
5783  {
5784  Oid typoutputfunc;
5785  bool typIsVarlena;
5786 
5787  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5788  fmgr_info(typoutputfunc, &typoutputfinfo);
5789  prev_type = typid;
5790  }
5791 
5792  /*
5793  * And now we can format the value.
5794  */
5795  switch (*cp)
5796  {
5797  case 's':
5798  case 'I':
5799  case 'L':
5800  text_format_string_conversion(&str, *cp, &typoutputfinfo,
5801  value, isNull,
5802  flags, width);
5803  break;
5804  default:
5805  /* should not get here, because of previous check */
5806  ereport(ERROR,
5807  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5808  errmsg("unrecognized format() type specifier \"%.*s\"",
5809  pg_mblen(cp), cp),
5810  errhint("For a single \"%%\" use \"%%%%\".")));
5811  break;
5812  }
5813  }
5814 
5815  /* Don't need deconstruct_array results anymore. */
5816  if (elements != NULL)
5817  pfree(elements);
5818  if (nulls != NULL)
5819  pfree(nulls);
5820 
5821  /* Generate results. */
5822  result = cstring_to_text_with_len(str.data, str.len);
5823  pfree(str.data);
5824 
5825  PG_RETURN_TEXT_P(result);
5826 }
5827 
5828 /*
5829  * Parse contiguous digits as a decimal number.
5830  *
5831  * Returns true if some digits could be parsed.
5832  * The value is returned into *value, and *ptr is advanced to the next
5833  * character to be parsed.
5834  *
5835  * Note parsing invariant: at least one character is known available before
5836  * string end (end_ptr) at entry, and this is still true at exit.
5837  */
5838 static bool
5839 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5840 {
5841  bool found = false;
5842  const char *cp = *ptr;
5843  int val = 0;
5844 
5845  while (*cp >= '0' && *cp <= '9')
5846  {
5847  int8 digit = (*cp - '0');
5848 
5849  if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5850  unlikely(pg_add_s32_overflow(val, digit, &val)))
5851  ereport(ERROR,
5852  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5853  errmsg("number is out of range")));
5854  ADVANCE_PARSE_POINTER(cp, end_ptr);
5855  found = true;
5856  }
5857 
5858  *ptr = cp;
5859  *value = val;
5860 
5861  return found;
5862 }
5863 
5864 /*
5865  * Parse a format specifier (generally following the SUS printf spec).
5866  *
5867  * We have already advanced over the initial '%', and we are looking for
5868  * [argpos][flags][width]type (but the type character is not consumed here).
5869  *
5870  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5871  * Output parameters:
5872  * argpos: argument position for value to be printed. -1 means unspecified.
5873  * widthpos: argument position for width. Zero means the argument position
5874  * was unspecified (ie, take the next arg) and -1 means no width
5875  * argument (width was omitted or specified as a constant).
5876  * flags: bitmask of flags.
5877  * width: directly-specified width value. Zero means the width was omitted
5878  * (note it's not necessary to distinguish this case from an explicit
5879  * zero width value).
5880  *
5881  * The function result is the next character position to be parsed, ie, the
5882  * location where the type character is/should be.
5883  *
5884  * Note parsing invariant: at least one character is known available before
5885  * string end (end_ptr) at entry, and this is still true at exit.
5886  */
5887 static const char *
5888 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5889  int *argpos, int *widthpos,
5890  int *flags, int *width)
5891 {
5892  const char *cp = start_ptr;
5893  int n;
5894 
5895  /* set defaults for output parameters */
5896  *argpos = -1;
5897  *widthpos = -1;
5898  *flags = 0;
5899  *width = 0;
5900 
5901  /* try to identify first number */
5902  if (text_format_parse_digits(&cp, end_ptr, &n))
5903  {
5904  if (*cp != '$')
5905  {
5906  /* Must be just a width and a type, so we're done */
5907  *width = n;
5908  return cp;
5909  }
5910  /* The number was argument position */
5911  *argpos = n;
5912  /* Explicit 0 for argument index is immediately refused */
5913  if (n == 0)
5914  ereport(ERROR,
5915  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5916  errmsg("format specifies argument 0, but arguments are numbered from 1")));
5917  ADVANCE_PARSE_POINTER(cp, end_ptr);
5918  }
5919 
5920  /* Handle flags (only minus is supported now) */
5921  while (*cp == '-')
5922  {
5923  *flags |= TEXT_FORMAT_FLAG_MINUS;
5924  ADVANCE_PARSE_POINTER(cp, end_ptr);
5925  }
5926 
5927  if (*cp == '*')
5928  {
5929  /* Handle indirect width */
5930  ADVANCE_PARSE_POINTER(cp, end_ptr);
5931  if (text_format_parse_digits(&cp, end_ptr, &n))
5932  {
5933  /* number in this position must be closed by $ */
5934  if (*cp != '$')
5935  ereport(ERROR,
5936  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5937  errmsg("width argument position must be ended by \"$\"")));
5938  /* The number was width argument position */
5939  *widthpos = n;
5940  /* Explicit 0 for argument index is immediately refused */
5941  if (n == 0)
5942  ereport(ERROR,
5943  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5944  errmsg("format specifies argument 0, but arguments are numbered from 1")));
5945  ADVANCE_PARSE_POINTER(cp, end_ptr);
5946  }
5947  else
5948  *widthpos = 0; /* width's argument position is unspecified */
5949  }
5950  else
5951  {
5952  /* Check for direct width specification */
5953  if (text_format_parse_digits(&cp, end_ptr, &n))
5954  *width = n;
5955  }
5956 
5957  /* cp should now be pointing at type character */
5958  return cp;
5959 }
5960 
5961 /*
5962  * Format a %s, %I, or %L conversion
5963  */
5964 static void
5966  FmgrInfo *typOutputInfo,
5967  Datum value, bool isNull,
5968  int flags, int width)
5969 {
5970  char *str;
5971 
5972  /* Handle NULL arguments before trying to stringify the value. */
5973  if (isNull)
5974  {
5975  if (conversion == 's')
5976  text_format_append_string(buf, "", flags, width);
5977  else if (conversion == 'L')
5978  text_format_append_string(buf, "NULL", flags, width);
5979  else if (conversion == 'I')
5980  ereport(ERROR,
5981  (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5982  errmsg("null values cannot be formatted as an SQL identifier")));
5983  return;
5984  }
5985 
5986  /* Stringify. */
5987  str = OutputFunctionCall(typOutputInfo, value);
5988 
5989  /* Escape. */
5990  if (conversion == 'I')
5991  {
5992  /* quote_identifier may or may not allocate a new string. */
5993  text_format_append_string(buf, quote_identifier(str), flags, width);
5994  }
5995  else if (conversion == 'L')
5996  {
5997  char *qstr = quote_literal_cstr(str);
5998 
5999  text_format_append_string(buf, qstr, flags, width);
6000  /* quote_literal_cstr() always allocates a new string */
6001  pfree(qstr);
6002  }
6003  else
6004  text_format_append_string(buf, str, flags, width);
6005 
6006  /* Cleanup. */
6007  pfree(str);
6008 }
6009 
6010 /*
6011  * Append str to buf, padding as directed by flags/width
6012  */
6013 static void
6015  int flags, int width)
6016 {
6017  bool align_to_left = false;
6018  int len;
6019 
6020  /* fast path for typical easy case */
6021  if (width == 0)
6022  {
6023  appendStringInfoString(buf, str);
6024  return;
6025  }
6026 
6027  if (width < 0)
6028  {
6029  /* Negative width: implicit '-' flag, then take absolute value */
6030  align_to_left = true;
6031  /* -INT_MIN is undefined */
6032  if (width <= INT_MIN)
6033  ereport(ERROR,
6034  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6035  errmsg("number is out of range")));
6036  width = -width;
6037  }
6038  else if (flags & TEXT_FORMAT_FLAG_MINUS)
6039  align_to_left = true;
6040 
6041  len = pg_mbstrlen(str);
6042  if (align_to_left)
6043  {
6044  /* left justify */
6045  appendStringInfoString(buf, str);
6046  if (len < width)
6047  appendStringInfoSpaces(buf, width - len);
6048  }
6049  else
6050  {
6051  /* right justify */
6052  if (len < width)
6053  appendStringInfoSpaces(buf, width - len);
6054  appendStringInfoString(buf, str);
6055  }
6056 }
6057 
6058 /*
6059  * text_format_nv - nonvariadic wrapper for text_format function.
6060  *
6061  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6062  * which checks that all built-in functions that share the implementing C
6063  * function take the same number of arguments.
6064  */
6065 Datum
6067 {
6068  return text_format(fcinfo);
6069 }
6070 
/*
 * Helper for the Levenshtein distance functions: report whether the first
 * "len" bytes of s1 and s2 are identical.  Bytes are compared starting from
 * the last one and working backwards.  A len of zero or less compares
 * nothing and yields true.
 *
 * Faster than memcmp(), for this use case.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	for (int remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
6086 
6087 /* Expand each Levenshtein distance variant */
6088 #include "levenshtein.c"
6089 #define LEVENSHTEIN_LESS_EQUAL
6090 #include "levenshtein.c"
6091 
6092 
6093 /*
6094  * Unicode support
6095  */
6096 
6098 unicode_norm_form_from_string(const char *formstr)
6099 {
6100  UnicodeNormalizationForm form = -1;
6101 
6102  /*
6103  * Might as well check this while we're here.
6104  */
6105  if (GetDatabaseEncoding() != PG_UTF8)
6106  ereport(ERROR,
6107  (errcode(ERRCODE_SYNTAX_ERROR),
6108  errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6109 
6110  if (pg_strcasecmp(formstr, "NFC") == 0)
6111  form = UNICODE_NFC;
6112  else if (pg_strcasecmp(formstr, "NFD") == 0)
6113  form = UNICODE_NFD;
6114  else if (pg_strcasecmp(formstr, "NFKC") == 0)
6115  form = UNICODE_NFKC;
6116  else if (pg_strcasecmp(formstr, "NFKD") == 0)
6117  form = UNICODE_NFKD;
6118  else
6119  ereport(ERROR,
6120  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6121  errmsg("invalid normalization form: %s", formstr)));
6122 
6123  return form;
6124 }
6125 
6126 Datum
6128 {
6129  text *input = PG_GETARG_TEXT_PP(0);
6130  char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6132  int size;
6133  pg_wchar *input_chars;
6134  pg_wchar *output_chars;
6135  unsigned char *p;
6136  text *result;
6137  int i;
6138 
6139  form = unicode_norm_form_from_string(formstr);
6140 
6141  /* convert to pg_wchar */
6142  size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6143  input_chars = palloc((size + 1) * sizeof(pg_wchar));
6144  p = (unsigned char *) VARDATA_ANY(input);
6145  for (i = 0; i < size; i++)
6146  {
6147  input_chars[i] = utf8_to_unicode(p);
6148  p += pg_utf_mblen(p);
6149  }
6150  input_chars[i] = (pg_wchar) '\0';
6151  Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6152 
6153  /* action */
6154  output_chars = unicode_normalize(form, input_chars);
6155 
6156  /* convert back to UTF-8 string */
6157  size = 0;
6158  for (pg_wchar *wp = output_chars; *wp; wp++)
6159  {
6160  unsigned char buf[4];
6161 
6162  unicode_to_utf8(*wp, buf);
6163  size += pg_utf_mblen(buf);
6164  }
6165 
6166  result = palloc(size + VARHDRSZ);
6167  SET_VARSIZE(result, size + VARHDRSZ);
6168 
6169  p = (unsigned char *) VARDATA_ANY(result);
6170  for (pg_wchar *wp = output_chars; *wp; wp++)
6171  {
6172  unicode_to_utf8(*wp, p);
6173  p += pg_utf_mblen(p);
6174  }
6175  Assert((char *) p == (char *) result + size + VARHDRSZ);
6176 
6177  PG_RETURN_TEXT_P(result);
6178 }
6179 
6180 /*
6181  * Check whether the string is in the specified Unicode normalization form.
6182  *
6183  * This is done by converting the string to the specified normal form and then
6184  * comparing that to the original string. To speed that up, we also apply the
6185  * "quick check" algorithm specified in UAX #15, which can give a yes or no
6186  * answer for many strings by just scanning the string once.
6187  *
6188  * This function should generally be optimized for the case where the string
6189  * is in fact normalized. In that case, we'll end up looking at the entire
6190  * string, so it's probably not worth doing any incremental conversion etc.
6191  */
6192 Datum
6194 {
6195  text *input = PG_GETARG_TEXT_PP(0);
6196  char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6198  int size;
6199  pg_wchar *input_chars;
6200  pg_wchar *output_chars;
6201  unsigned char *p;
6202  int i;
6203  UnicodeNormalizationQC quickcheck;
6204  int output_size;
6205  bool result;
6206 
6207  form = unicode_norm_form_from_string(formstr);
6208 
6209  /* convert to pg_wchar */
6210  size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6211  input_chars = palloc((size + 1) * sizeof(pg_wchar));
6212  p = (unsigned char *) VARDATA_ANY(input);
6213  for (i = 0; i < size; i++)
6214  {
6215  input_chars[i] = utf8_to_unicode(p);
6216  p += pg_utf_mblen(p);
6217  }
6218  input_chars[i] = (pg_wchar) '\0';
6219  Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6220 
6221  /* quick check (see UAX #15) */
6222  quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6223  if (quickcheck == UNICODE_NORM_QC_YES)
6224  PG_RETURN_BOOL(true);
6225  else if (quickcheck == UNICODE_NORM_QC_NO)
6226  PG_RETURN_BOOL(false);
6227 
6228  /* normalize and compare with original */
6229  output_chars = unicode_normalize(form, input_chars);
6230 
6231  output_size = 0;
6232  for (pg_wchar *wp = output_chars; *wp; wp++)
6233  output_size++;
6234 
6235  result = (size == output_size) &&
6236  (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6237 
6238  PG_RETURN_BOOL(result);
<