PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
varlena.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  * Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/hash.h"
21 #include "access/tuptoaster.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "lib/hyperloglog.h"
25 #include "libpq/md5.h"
26 #include "libpq/pqformat.h"
27 #include "miscadmin.h"
28 #include "parser/scansup.h"
29 #include "regex/regex.h"
30 #include "utils/builtins.h"
31 #include "utils/bytea.h"
32 #include "utils/lsyscache.h"
33 #include "utils/memutils.h"
34 #include "utils/pg_locale.h"
35 #include "utils/sortsupport.h"
36 
37 
38 /* GUC variable */
40 
41 typedef struct varlena unknown;
42 
43 typedef struct
44 {
45  bool use_wchar; /* T if multibyte encoding */
46  char *str1; /* use these if not use_wchar */
47  char *str2; /* note: these point to original texts */
48  pg_wchar *wstr1; /* use these if use_wchar */
49  pg_wchar *wstr2; /* note: these are palloc'd */
50  int len1; /* string lengths in logical characters */
51  int len2;
52  /* Skip table for Boyer-Moore-Horspool search algorithm: */
53  int skiptablemask; /* mask for ANDing with skiptable subscripts */
54  int skiptable[256]; /* skip distance for given mismatched char */
56 
57 typedef struct
58 {
59  char *buf1; /* 1st string, or abbreviation original string buf */
60  char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
61  int buflen1;
62  int buflen2;
63  bool collate_c;
64  hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
65  hyperLogLogState full_card; /* Full key cardinality state */
66  double prop_card; /* Required cardinality proportion */
67 #ifdef HAVE_LOCALE_T
69 #endif
71 
72 /*
73  * This should be large enough that most strings will fit, but small enough
74  * that we feel comfortable putting it on the stack
75  */
76 #define TEXTBUFLEN 1024
77 
78 #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X))
79 #define DatumGetUnknownPCopy(X) ((unknown *) PG_DETOAST_DATUM_COPY(X))
80 #define PG_GETARG_UNKNOWN_P(n) DatumGetUnknownP(PG_GETARG_DATUM(n))
81 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
82 #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x)
83 
84 static void btsortsupport_worker(SortSupport ssup, Oid collid);
85 static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup);
86 static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup);
87 static int bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup);
88 static Datum bttext_abbrev_convert(Datum original, SortSupport ssup);
89 static bool bttext_abbrev_abort(int memtupcount, SortSupport ssup);
90 static int32 text_length(Datum str);
91 static text *text_catenate(text *t1, text *t2);
92 static text *text_substring(Datum str,
93  int32 start,
94  int32 length,
95  bool length_not_specified);
96 static text *text_overlay(text *t1, text *t2, int sp, int sl);
97 static int text_position(text *t1, text *t2);
98 static void text_position_setup(text *t1, text *t2, TextPositionState *state);
99 static int text_position_next(int start_pos, TextPositionState *state);
101 static int text_cmp(text *arg1, text *arg2, Oid collid);
102 static bytea *bytea_catenate(bytea *t1, bytea *t2);
103 static bytea *bytea_substring(Datum str,
104  int S,
105  int L,
106  bool length_not_specified);
107 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
108 static void appendStringInfoText(StringInfo str, const text *t);
111  const char *fldsep, const char *null_string);
113 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
114  int *value);
115 static const char *text_format_parse_format(const char *start_ptr,
116  const char *end_ptr,
117  int *argpos, int *widthpos,
118  int *flags, int *width);
119 static void text_format_string_conversion(StringInfo buf, char conversion,
120  FmgrInfo *typOutputInfo,
121  Datum value, bool isNull,
122  int flags, int width);
123 static void text_format_append_string(StringInfo buf, const char *str,
124  int flags, int width);
125 
126 
127 /*****************************************************************************
128  * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
129  *****************************************************************************/
130 
131 /*
132  * cstring_to_text
133  *
134  * Create a text value from a null-terminated C string.
135  *
136  * The new text value is freshly palloc'd with a full-size VARHDR.
137  */
138 text *
139 cstring_to_text(const char *s)
140 {
141  return cstring_to_text_with_len(s, strlen(s));
142 }
143 
144 /*
145  * cstring_to_text_with_len
146  *
147  * Same as cstring_to_text except the caller specifies the string length;
148  * the string need not be null_terminated.
149  */
150 text *
151 cstring_to_text_with_len(const char *s, int len)
152 {
153  text *result = (text *) palloc(len + VARHDRSZ);
154 
155  SET_VARSIZE(result, len + VARHDRSZ);
156  memcpy(VARDATA(result), s, len);
157 
158  return result;
159 }
160 
161 /*
162  * text_to_cstring
163  *
164  * Create a palloc'd, null-terminated C string from a text value.
165  *
166  * We support being passed a compressed or toasted text value.
167  * This is a bit bogus since such values shouldn't really be referred to as
168  * "text *", but it seems useful for robustness. If we didn't handle that
169  * case here, we'd need another routine that did, anyway.
170  */
171 char *
173 {
174  /* must cast away the const, unfortunately */
175  text *tunpacked = pg_detoast_datum_packed((struct varlena *) t);
176  int len = VARSIZE_ANY_EXHDR(tunpacked);
177  char *result;
178 
179  result = (char *) palloc(len + 1);
180  memcpy(result, VARDATA_ANY(tunpacked), len);
181  result[len] = '\0';
182 
183  if (tunpacked != t)
184  pfree(tunpacked);
185 
186  return result;
187 }
188 
189 /*
190  * text_to_cstring_buffer
191  *
192  * Copy a text value into a caller-supplied buffer of size dst_len.
193  *
194  * The text string is truncated if necessary to fit. The result is
195  * guaranteed null-terminated (unless dst_len == 0).
196  *
197  * We support being passed a compressed or toasted text value.
198  * This is a bit bogus since such values shouldn't really be referred to as
199  * "text *", but it seems useful for robustness. If we didn't handle that
200  * case here, we'd need another routine that did, anyway.
201  */
202 void
203 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
204 {
205  /* must cast away the const, unfortunately */
206  text *srcunpacked = pg_detoast_datum_packed((struct varlena *) src);
207  size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
208 
209  if (dst_len > 0)
210  {
211  dst_len--;
212  if (dst_len >= src_len)
213  dst_len = src_len;
214  else /* ensure truncation is encoding-safe */
215  dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
216  memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
217  dst[dst_len] = '\0';
218  }
219 
220  if (srcunpacked != src)
221  pfree(srcunpacked);
222 }
223 
224 
225 /*****************************************************************************
226  * USER I/O ROUTINES *
227  *****************************************************************************/
228 
229 
230 #define VAL(CH) ((CH) - '0')
231 #define DIG(VAL) ((VAL) + '0')
232 
233 /*
234  * byteain - converts from printable representation of byte array
235  *
236  * Non-printable characters must be passed as '\nnn' (octal) and are
237  * converted to internal form. '\' must be passed as '\\'.
238  * ereport(ERROR, ...) if bad form.
239  *
240  * BUGS:
241  * The input is scanned twice.
242  * The error checking of input is minimal.
243  */
244 Datum
246 {
247  char *inputText = PG_GETARG_CSTRING(0);
248  char *tp;
249  char *rp;
250  int bc;
251  bytea *result;
252 
253  /* Recognize hex input */
254  if (inputText[0] == '\\' && inputText[1] == 'x')
255  {
256  size_t len = strlen(inputText);
257 
258  bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
259  result = palloc(bc);
260  bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
261  SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
262 
263  PG_RETURN_BYTEA_P(result);
264  }
265 
266  /* Else, it's the traditional escaped style */
267  for (bc = 0, tp = inputText; *tp != '\0'; bc++)
268  {
269  if (tp[0] != '\\')
270  tp++;
271  else if ((tp[0] == '\\') &&
272  (tp[1] >= '0' && tp[1] <= '3') &&
273  (tp[2] >= '0' && tp[2] <= '7') &&
274  (tp[3] >= '0' && tp[3] <= '7'))
275  tp += 4;
276  else if ((tp[0] == '\\') &&
277  (tp[1] == '\\'))
278  tp += 2;
279  else
280  {
281  /*
282  * one backslash, not followed by another or ### valid octal
283  */
284  ereport(ERROR,
285  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
286  errmsg("invalid input syntax for type bytea")));
287  }
288  }
289 
290  bc += VARHDRSZ;
291 
292  result = (bytea *) palloc(bc);
293  SET_VARSIZE(result, bc);
294 
295  tp = inputText;
296  rp = VARDATA(result);
297  while (*tp != '\0')
298  {
299  if (tp[0] != '\\')
300  *rp++ = *tp++;
301  else if ((tp[0] == '\\') &&
302  (tp[1] >= '0' && tp[1] <= '3') &&
303  (tp[2] >= '0' && tp[2] <= '7') &&
304  (tp[3] >= '0' && tp[3] <= '7'))
305  {
306  bc = VAL(tp[1]);
307  bc <<= 3;
308  bc += VAL(tp[2]);
309  bc <<= 3;
310  *rp++ = bc + VAL(tp[3]);
311 
312  tp += 4;
313  }
314  else if ((tp[0] == '\\') &&
315  (tp[1] == '\\'))
316  {
317  *rp++ = '\\';
318  tp += 2;
319  }
320  else
321  {
322  /*
323  * We should never get here. The first pass should not allow it.
324  */
325  ereport(ERROR,
326  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
327  errmsg("invalid input syntax for type bytea")));
328  }
329  }
330 
331  PG_RETURN_BYTEA_P(result);
332 }
333 
334 /*
335  * byteaout - converts to printable representation of byte array
336  *
337  * In the traditional escaped format, non-printable characters are
338  * printed as '\nnn' (octal) and '\' as '\\'.
339  */
340 Datum
342 {
343  bytea *vlena = PG_GETARG_BYTEA_PP(0);
344  char *result;
345  char *rp;
346 
348  {
349  /* Print hex format */
350  rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
351  *rp++ = '\\';
352  *rp++ = 'x';
353  rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
354  }
355  else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
356  {
357  /* Print traditional escaped format */
358  char *vp;
359  int len;
360  int i;
361 
362  len = 1; /* empty string has 1 char */
363  vp = VARDATA_ANY(vlena);
364  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
365  {
366  if (*vp == '\\')
367  len += 2;
368  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
369  len += 4;
370  else
371  len++;
372  }
373  rp = result = (char *) palloc(len);
374  vp = VARDATA_ANY(vlena);
375  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
376  {
377  if (*vp == '\\')
378  {
379  *rp++ = '\\';
380  *rp++ = '\\';
381  }
382  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
383  {
384  int val; /* holds unprintable chars */
385 
386  val = *vp;
387  rp[0] = '\\';
388  rp[3] = DIG(val & 07);
389  val >>= 3;
390  rp[2] = DIG(val & 07);
391  val >>= 3;
392  rp[1] = DIG(val & 03);
393  rp += 4;
394  }
395  else
396  *rp++ = *vp;
397  }
398  }
399  else
400  {
401  elog(ERROR, "unrecognized bytea_output setting: %d",
402  bytea_output);
403  rp = result = NULL; /* keep compiler quiet */
404  }
405  *rp = '\0';
406  PG_RETURN_CSTRING(result);
407 }
408 
409 /*
410  * bytearecv - converts external binary format to bytea
411  */
412 Datum
414 {
416  bytea *result;
417  int nbytes;
418 
419  nbytes = buf->len - buf->cursor;
420  result = (bytea *) palloc(nbytes + VARHDRSZ);
421  SET_VARSIZE(result, nbytes + VARHDRSZ);
422  pq_copymsgbytes(buf, VARDATA(result), nbytes);
423  PG_RETURN_BYTEA_P(result);
424 }
425 
426 /*
427  * byteasend - converts bytea to binary format
428  *
429  * This is a special case: just copy the input...
430  */
431 Datum
433 {
434  bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
435 
436  PG_RETURN_BYTEA_P(vlena);
437 }
438 
439 Datum
441 {
443 
444  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
445 
446  /* Append the value unless null. */
447  if (!PG_ARGISNULL(1))
448  {
450 
451  /* On the first time through, we ignore the delimiter. */
452  if (state == NULL)
453  state = makeStringAggState(fcinfo);
454  else if (!PG_ARGISNULL(2))
455  {
456  bytea *delim = PG_GETARG_BYTEA_PP(2);
457 
459  }
460 
462  }
463 
464  /*
465  * The transition type for string_agg() is declared to be "internal",
466  * which is a pass-by-value type the same size as a pointer.
467  */
468  PG_RETURN_POINTER(state);
469 }
470 
471 Datum
473 {
475 
476  /* cannot be called directly because of internal-type argument */
477  Assert(AggCheckCallContext(fcinfo, NULL));
478 
479  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
480 
481  if (state != NULL)
482  {
483  bytea *result;
484 
485  result = (bytea *) palloc(state->len + VARHDRSZ);
486  SET_VARSIZE(result, state->len + VARHDRSZ);
487  memcpy(VARDATA(result), state->data, state->len);
488  PG_RETURN_BYTEA_P(result);
489  }
490  else
491  PG_RETURN_NULL();
492 }
493 
494 /*
495  * textin - converts "..." to internal representation
496  */
497 Datum
499 {
500  char *inputText = PG_GETARG_CSTRING(0);
501 
502  PG_RETURN_TEXT_P(cstring_to_text(inputText));
503 }
504 
505 /*
506  * textout - converts internal representation to "..."
507  */
508 Datum
510 {
511  Datum txt = PG_GETARG_DATUM(0);
512 
514 }
515 
516 /*
517  * textrecv - converts external binary format to text
518  */
519 Datum
521 {
523  text *result;
524  char *str;
525  int nbytes;
526 
527  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
528 
529  result = cstring_to_text_with_len(str, nbytes);
530  pfree(str);
531  PG_RETURN_TEXT_P(result);
532 }
533 
534 /*
535  * textsend - converts text to binary format
536  */
537 Datum
539 {
540  text *t = PG_GETARG_TEXT_PP(0);
542 
543  pq_begintypsend(&buf);
546 }
547 
548 
549 /*
550  * unknownin - converts "..." to internal representation
551  */
552 Datum
554 {
555  char *str = PG_GETARG_CSTRING(0);
556 
557  /* representation is same as cstring */
559 }
560 
561 /*
562  * unknownout - converts internal representation to "..."
563  */
564 Datum
566 {
567  /* representation is same as cstring */
568  char *str = PG_GETARG_CSTRING(0);
569 
571 }
572 
573 /*
574  * unknownrecv - converts external binary format to unknown
575  */
576 Datum
578 {
580  char *str;
581  int nbytes;
582 
583  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
584  /* representation is same as cstring */
585  PG_RETURN_CSTRING(str);
586 }
587 
588 /*
589  * unknownsend - converts unknown to binary format
590  */
591 Datum
593 {
594  /* representation is same as cstring */
595  char *str = PG_GETARG_CSTRING(0);
597 
598  pq_begintypsend(&buf);
599  pq_sendtext(&buf, str, strlen(str));
601 }
602 
603 
604 /* ========== PUBLIC ROUTINES ========== */
605 
606 /*
607  * textlen -
608  * returns the logical length of a text*
609  * (which is less than the VARSIZE of the text*)
610  */
611 Datum
613 {
614  Datum str = PG_GETARG_DATUM(0);
615 
616  /* try to avoid decompressing argument */
618 }
619 
620 /*
621  * text_length -
622  * Does the real work for textlen()
623  *
624  * This is broken out so it can be called directly by other string processing
625  * functions. Note that the argument is passed as a Datum, to indicate that
626  * it may still be in compressed form. We can avoid decompressing it at all
627  * in some cases.
628  */
629 static int32
631 {
632  /* fastpath when max encoding length is one */
635  else
636  {
637  text *t = DatumGetTextPP(str);
638 
640  VARSIZE_ANY_EXHDR(t)));
641  }
642 }
643 
644 /*
645  * textoctetlen -
646  * returns the physical length of a text*
647  * (which is less than the VARSIZE of the text*)
648  */
649 Datum
651 {
652  Datum str = PG_GETARG_DATUM(0);
653 
654  /* We need not detoast the input at all */
656 }
657 
658 /*
659  * textcat -
660  * takes two text* and returns a text* that is the concatenation of
661  * the two.
662  *
663  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
664  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
665  * Allocate space for output in all cases.
666  * XXX - thomas 1997-07-10
667  */
668 Datum
670 {
671  text *t1 = PG_GETARG_TEXT_PP(0);
672  text *t2 = PG_GETARG_TEXT_PP(1);
673 
675 }
676 
677 /*
678  * text_catenate
679  * Guts of textcat(), broken out so it can be used by other functions
680  *
681  * Arguments can be in short-header form, but not compressed or out-of-line
682  */
683 static text *
685 {
686  text *result;
687  int len1,
688  len2,
689  len;
690  char *ptr;
691 
692  len1 = VARSIZE_ANY_EXHDR(t1);
693  len2 = VARSIZE_ANY_EXHDR(t2);
694 
695  /* paranoia ... probably should throw error instead? */
696  if (len1 < 0)
697  len1 = 0;
698  if (len2 < 0)
699  len2 = 0;
700 
701  len = len1 + len2 + VARHDRSZ;
702  result = (text *) palloc(len);
703 
704  /* Set size of result string... */
705  SET_VARSIZE(result, len);
706 
707  /* Fill data field of result string... */
708  ptr = VARDATA(result);
709  if (len1 > 0)
710  memcpy(ptr, VARDATA_ANY(t1), len1);
711  if (len2 > 0)
712  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
713 
714  return result;
715 }
716 
717 /*
718  * charlen_to_bytelen()
719  * Compute the number of bytes occupied by n characters starting at *p
720  *
721  * It is caller's responsibility that there actually are n characters;
722  * the string need not be null-terminated.
723  */
724 static int
725 charlen_to_bytelen(const char *p, int n)
726 {
728  {
729  /* Optimization for single-byte encodings */
730  return n;
731  }
732  else
733  {
734  const char *s;
735 
736  for (s = p; n > 0; n--)
737  s += pg_mblen(s);
738 
739  return s - p;
740  }
741 }
742 
743 /*
744  * text_substr()
745  * Return a substring starting at the specified position.
746  * - thomas 1997-12-31
747  *
748  * Input:
749  * - string
750  * - starting position (is one-based)
751  * - string length
752  *
753  * If the starting position is zero or less, then return from the start of the string
754  * adjusting the length to be consistent with the "negative start" per SQL.
755  * If the length is less than zero, return the remaining string.
756  *
757  * Added multibyte support.
758  * - Tatsuo Ishii 1998-4-21
759  * Changed behavior if starting position is less than one to conform to SQL behavior.
760  * Formerly returned the entire string; now returns a portion.
761  * - Thomas Lockhart 1998-12-10
762  * Now uses faster TOAST-slicing interface
763  * - John Gray 2002-02-22
764  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
765  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
766  * error; if E < 1, return '', not entire string). Fixed MB related bug when
767  * S > LC and < LC + 4 sometimes garbage characters are returned.
768  * - Joe Conway 2002-08-10
769  */
770 Datum
772 {
774  PG_GETARG_INT32(1),
775  PG_GETARG_INT32(2),
776  false));
777 }
778 
779 /*
780  * text_substr_no_len -
781  * Wrapper to avoid opr_sanity failure due to
782  * one function accepting a different number of args.
783  */
784 Datum
786 {
788  PG_GETARG_INT32(1),
789  -1, true));
790 }
791 
792 /*
793  * text_substring -
794  * Does the real work for text_substr() and text_substr_no_len()
795  *
796  * This is broken out so it can be called directly by other string processing
797  * functions. Note that the argument is passed as a Datum, to indicate that
798  * it may still be in compressed/toasted form. We can avoid detoasting all
799  * of it in some cases.
800  *
801  * The result is always a freshly palloc'd datum.
802  */
803 static text *
804 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
805 {
807  int32 S = start; /* start position */
808  int32 S1; /* adjusted start position */
809  int32 L1; /* adjusted substring length */
810 
811  /* life is easy if the encoding max length is 1 */
812  if (eml == 1)
813  {
814  S1 = Max(S, 1);
815 
816  if (length_not_specified) /* special case - get length to end of
817  * string */
818  L1 = -1;
819  else
820  {
821  /* end position */
822  int E = S + length;
823 
824  /*
825  * A negative value for L is the only way for the end position to
826  * be before the start. SQL99 says to throw an error.
827  */
828  if (E < S)
829  ereport(ERROR,
830  (errcode(ERRCODE_SUBSTRING_ERROR),
831  errmsg("negative substring length not allowed")));
832 
833  /*
834  * A zero or negative value for the end position can happen if the
835  * start was negative or one. SQL99 says to return a zero-length
836  * string.
837  */
838  if (E < 1)
839  return cstring_to_text("");
840 
841  L1 = E - S1;
842  }
843 
844  /*
845  * If the start position is past the end of the string, SQL99 says to
846  * return a zero-length string -- PG_GETARG_TEXT_P_SLICE() will do
847  * that for us. Convert to zero-based starting position
848  */
849  return DatumGetTextPSlice(str, S1 - 1, L1);
850  }
851  else if (eml > 1)
852  {
853  /*
854  * When encoding max length is > 1, we can't get LC without
855  * detoasting, so we'll grab a conservatively large slice now and go
856  * back later to do the right thing
857  */
858  int32 slice_start;
859  int32 slice_size;
860  int32 slice_strlen;
861  text *slice;
862  int32 E1;
863  int32 i;
864  char *p;
865  char *s;
866  text *ret;
867 
868  /*
869  * if S is past the end of the string, the tuple toaster will return a
870  * zero-length string to us
871  */
872  S1 = Max(S, 1);
873 
874  /*
875  * We need to start at position zero because there is no way to know
876  * in advance which byte offset corresponds to the supplied start
877  * position.
878  */
879  slice_start = 0;
880 
881  if (length_not_specified) /* special case - get length to end of
882  * string */
883  slice_size = L1 = -1;
884  else
885  {
886  int E = S + length;
887 
888  /*
889  * A negative value for L is the only way for the end position to
890  * be before the start. SQL99 says to throw an error.
891  */
892  if (E < S)
893  ereport(ERROR,
894  (errcode(ERRCODE_SUBSTRING_ERROR),
895  errmsg("negative substring length not allowed")));
896 
897  /*
898  * A zero or negative value for the end position can happen if the
899  * start was negative or one. SQL99 says to return a zero-length
900  * string.
901  */
902  if (E < 1)
903  return cstring_to_text("");
904 
905  /*
906  * if E is past the end of the string, the tuple toaster will
907  * truncate the length for us
908  */
909  L1 = E - S1;
910 
911  /*
912  * Total slice size in bytes can't be any longer than the start
913  * position plus substring length times the encoding max length.
914  */
915  slice_size = (S1 + L1) * eml;
916  }
917 
918  /*
919  * If we're working with an untoasted source, no need to do an extra
920  * copying step.
921  */
924  slice = DatumGetTextPSlice(str, slice_start, slice_size);
925  else
926  slice = (text *) DatumGetPointer(str);
927 
928  /* see if we got back an empty string */
929  if (VARSIZE_ANY_EXHDR(slice) == 0)
930  {
931  if (slice != (text *) DatumGetPointer(str))
932  pfree(slice);
933  return cstring_to_text("");
934  }
935 
936  /* Now we can get the actual length of the slice in MB characters */
937  slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
938  VARSIZE_ANY_EXHDR(slice));
939 
940  /*
941  * Check that the start position wasn't > slice_strlen. If so, SQL99
942  * says to return a zero-length string.
943  */
944  if (S1 > slice_strlen)
945  {
946  if (slice != (text *) DatumGetPointer(str))
947  pfree(slice);
948  return cstring_to_text("");
949  }
950 
951  /*
952  * Adjust L1 and E1 now that we know the slice string length. Again
953  * remember that S1 is one based, and slice_start is zero based.
954  */
955  if (L1 > -1)
956  E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
957  else
958  E1 = slice_start + 1 + slice_strlen;
959 
960  /*
961  * Find the start position in the slice; remember S1 is not zero based
962  */
963  p = VARDATA_ANY(slice);
964  for (i = 0; i < S1 - 1; i++)
965  p += pg_mblen(p);
966 
967  /* hang onto a pointer to our start position */
968  s = p;
969 
970  /*
971  * Count the actual bytes used by the substring of the requested
972  * length.
973  */
974  for (i = S1; i < E1; i++)
975  p += pg_mblen(p);
976 
977  ret = (text *) palloc(VARHDRSZ + (p - s));
978  SET_VARSIZE(ret, VARHDRSZ + (p - s));
979  memcpy(VARDATA(ret), s, (p - s));
980 
981  if (slice != (text *) DatumGetPointer(str))
982  pfree(slice);
983 
984  return ret;
985  }
986  else
987  elog(ERROR, "invalid backend encoding: encoding max length < 1");
988 
989  /* not reached: suppress compiler warning */
990  return NULL;
991 }
992 
993 /*
994  * textoverlay
995  * Replace specified substring of first string with second
996  *
997  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
998  * This code is a direct implementation of what the standard says.
999  */
1000 Datum
1002 {
1003  text *t1 = PG_GETARG_TEXT_PP(0);
1004  text *t2 = PG_GETARG_TEXT_PP(1);
1005  int sp = PG_GETARG_INT32(2); /* substring start position */
1006  int sl = PG_GETARG_INT32(3); /* substring length */
1007 
1008  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1009 }
1010 
1011 Datum
1013 {
1014  text *t1 = PG_GETARG_TEXT_PP(0);
1015  text *t2 = PG_GETARG_TEXT_PP(1);
1016  int sp = PG_GETARG_INT32(2); /* substring start position */
1017  int sl;
1018 
1019  sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1020  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1021 }
1022 
1023 static text *
1024 text_overlay(text *t1, text *t2, int sp, int sl)
1025 {
1026  text *result;
1027  text *s1;
1028  text *s2;
1029  int sp_pl_sl;
1030 
1031  /*
1032  * Check for possible integer-overflow cases. For negative sp, throw a
1033  * "substring length" error because that's what should be expected
1034  * according to the spec's definition of OVERLAY().
1035  */
1036  if (sp <= 0)
1037  ereport(ERROR,
1038  (errcode(ERRCODE_SUBSTRING_ERROR),
1039  errmsg("negative substring length not allowed")));
1040  sp_pl_sl = sp + sl;
1041  if (sp_pl_sl <= sl)
1042  ereport(ERROR,
1043  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1044  errmsg("integer out of range")));
1045 
1046  s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1047  s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1048  result = text_catenate(s1, t2);
1049  result = text_catenate(result, s2);
1050 
1051  return result;
1052 }
1053 
1054 /*
1055  * textpos -
1056  * Return the position of the specified substring.
1057  * Implements the SQL POSITION() function.
1058  * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1059  * - thomas 1997-07-27
1060  */
1061 Datum
1063 {
1064  text *str = PG_GETARG_TEXT_PP(0);
1065  text *search_str = PG_GETARG_TEXT_PP(1);
1066 
1067  PG_RETURN_INT32((int32) text_position(str, search_str));
1068 }
1069 
1070 /*
1071  * text_position -
1072  * Does the real work for textpos()
1073  *
1074  * Inputs:
1075  * t1 - string to be searched
1076  * t2 - pattern to match within t1
1077  * Result:
1078  * Character index of the first matched char, starting from 1,
1079  * or 0 if no match.
1080  *
1081  * This is broken out so it can be called directly by other string processing
1082  * functions.
1083  */
1084 static int
1086 {
1088  int result;
1089 
1090  text_position_setup(t1, t2, &state);
1091  result = text_position_next(1, &state);
1092  text_position_cleanup(&state);
1093  return result;
1094 }
1095 
1096 
1097 /*
1098  * text_position_setup, text_position_next, text_position_cleanup -
1099  * Component steps of text_position()
1100  *
1101  * These are broken out so that a string can be efficiently searched for
1102  * multiple occurrences of the same pattern. text_position_next may be
1103  * called multiple times with increasing values of start_pos, which is
1104  * the 1-based character position to start the search from. The "state"
1105  * variable is normally just a local variable in the caller.
1106  */
1107 
1108 static void
1110 {
1111  int len1 = VARSIZE_ANY_EXHDR(t1);
1112  int len2 = VARSIZE_ANY_EXHDR(t2);
1113 
1115  {
1116  /* simple case - single byte encoding */
1117  state->use_wchar = false;
1118  state->str1 = VARDATA_ANY(t1);
1119  state->str2 = VARDATA_ANY(t2);
1120  state->len1 = len1;
1121  state->len2 = len2;
1122  }
1123  else
1124  {
1125  /* not as simple - multibyte encoding */
1126  pg_wchar *p1,
1127  *p2;
1128 
1129  p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar));
1130  len1 = pg_mb2wchar_with_len(VARDATA_ANY(t1), p1, len1);
1131  p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar));
1132  len2 = pg_mb2wchar_with_len(VARDATA_ANY(t2), p2, len2);
1133 
1134  state->use_wchar = true;
1135  state->wstr1 = p1;
1136  state->wstr2 = p2;
1137  state->len1 = len1;
1138  state->len2 = len2;
1139  }
1140 
1141  /*
1142  * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1143  * notes we use the terminology that the "haystack" is the string to be
1144  * searched (t1) and the "needle" is the pattern being sought (t2).
1145  *
1146  * If the needle is empty or bigger than the haystack then there is no
1147  * point in wasting cycles initializing the table. We also choose not to
1148  * use B-M-H for needles of length 1, since the skip table can't possibly
1149  * save anything in that case.
1150  */
1151  if (len1 >= len2 && len2 > 1)
1152  {
1153  int searchlength = len1 - len2;
1154  int skiptablemask;
1155  int last;
1156  int i;
1157 
1158  /*
1159  * First we must determine how much of the skip table to use. The
1160  * declaration of TextPositionState allows up to 256 elements, but for
1161  * short search problems we don't really want to have to initialize so
1162  * many elements --- it would take too long in comparison to the
1163  * actual search time. So we choose a useful skip table size based on
1164  * the haystack length minus the needle length. The closer the needle
1165  * length is to the haystack length the less useful skipping becomes.
1166  *
1167  * Note: since we use bit-masking to select table elements, the skip
1168  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1169  */
1170  if (searchlength < 16)
1171  skiptablemask = 3;
1172  else if (searchlength < 64)
1173  skiptablemask = 7;
1174  else if (searchlength < 128)
1175  skiptablemask = 15;
1176  else if (searchlength < 512)
1177  skiptablemask = 31;
1178  else if (searchlength < 2048)
1179  skiptablemask = 63;
1180  else if (searchlength < 4096)
1181  skiptablemask = 127;
1182  else
1183  skiptablemask = 255;
1184  state->skiptablemask = skiptablemask;
1185 
1186  /*
1187  * Initialize the skip table. We set all elements to the needle
1188  * length, since this is the correct skip distance for any character
1189  * not found in the needle.
1190  */
1191  for (i = 0; i <= skiptablemask; i++)
1192  state->skiptable[i] = len2;
1193 
1194  /*
1195  * Now examine the needle. For each character except the last one,
1196  * set the corresponding table element to the appropriate skip
1197  * distance. Note that when two characters share the same skip table
1198  * entry, the one later in the needle must determine the skip
1199  * distance.
1200  */
1201  last = len2 - 1;
1202 
1203  if (!state->use_wchar)
1204  {
1205  const char *str2 = state->str2;
1206 
1207  for (i = 0; i < last; i++)
1208  state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1209  }
1210  else
1211  {
1212  const pg_wchar *wstr2 = state->wstr2;
1213 
1214  for (i = 0; i < last; i++)
1215  state->skiptable[wstr2[i] & skiptablemask] = last - i;
1216  }
1217  }
1218 }
1219 
/*
 * Search for the needle (state->str2 / state->wstr2) in the haystack
 * (state->str1 / state->wstr1), starting at the 1-based position start_pos.
 *
 * Returns the 1-based position of the first match found at or after
 * start_pos, or 0 if there is none.  An empty needle matches immediately,
 * so start_pos itself is returned.  Positions and lengths are in array
 * elements: bytes when !state->use_wchar, pg_wchar elements otherwise.
 *
 * Needles of length 1 are found by a plain linear scan; longer needles use
 * the skip table stored in state, which the caller must have set up already.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 1220 -> 1222); the body only
 * shows that the parameters are start_pos and state.
 */
1220 static int
1222 {
1223  int haystack_len = state->len1;
1224  int needle_len = state->len2;
1225  int skiptablemask = state->skiptablemask;
1226 
1227  Assert(start_pos > 0); /* else caller error */
1228 
1229  if (needle_len <= 0)
1230  return start_pos; /* result for empty pattern */
1231 
1232  start_pos--; /* adjust for zero based arrays */
1233 
1234  /* Done if the needle can't possibly fit */
1235  if (haystack_len < start_pos + needle_len)
1236  return 0;
1237 
1238  if (!state->use_wchar)
1239  {
1240  /* simple case - single byte encoding */
1241  const char *haystack = state->str1;
1242  const char *needle = state->str2;
1243  const char *haystack_end = &haystack[haystack_len];
1244  const char *hptr;
1245 
1246  if (needle_len == 1)
1247  {
1248  /* No point in using B-M-H for a one-character needle */
1249  char nchar = *needle;
1250 
1251  hptr = &haystack[start_pos];
1252  while (hptr < haystack_end)
1253  {
1254  if (*hptr == nchar)
1255  return hptr - haystack + 1;
1256  hptr++;
1257  }
1258  }
1259  else
1260  {
1261  const char *needle_last = &needle[needle_len - 1];
1262 
1263  /* Start at startpos plus the length of the needle */
1264  hptr = &haystack[start_pos + needle_len - 1];
1265  while (hptr < haystack_end)
1266  {
1267  /* Match the needle scanning *backward* */
1268  const char *nptr;
1269  const char *p;
1270 
1271  nptr = needle_last;
1272  p = hptr;
 /*
  * p cannot step before the start of the haystack: hptr is never less
  * than needle_len - 1 elements in, and we return as soon as nptr
  * reaches the first needle element.
  */
1273  while (*nptr == *p)
1274  {
1275  /* Matched it all? If so, return 1-based position */
1276  if (nptr == needle)
1277  return p - haystack + 1;
1278  nptr--, p--;
1279  }
1280 
1281  /*
1282  * No match, so use the haystack char at hptr to decide how
1283  * far to advance. If the needle had any occurrence of that
1284  * character (or more precisely, one sharing the same
1285  * skiptable entry) before its last character, then we advance
1286  * far enough to align the last such needle character with
1287  * that haystack position. Otherwise we can advance by the
1288  * whole needle length.
1289  */
1290  hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1291  }
1292  }
1293  }
1294  else
1295  {
1296  /* The multibyte char version. This works exactly the same way. */
1297  const pg_wchar *haystack = state->wstr1;
1298  const pg_wchar *needle = state->wstr2;
1299  const pg_wchar *haystack_end = &haystack[haystack_len];
1300  const pg_wchar *hptr;
1301 
1302  if (needle_len == 1)
1303  {
1304  /* No point in using B-M-H for a one-character needle */
1305  pg_wchar nchar = *needle;
1306 
1307  hptr = &haystack[start_pos];
1308  while (hptr < haystack_end)
1309  {
1310  if (*hptr == nchar)
1311  return hptr - haystack + 1;
1312  hptr++;
1313  }
1314  }
1315  else
1316  {
1317  const pg_wchar *needle_last = &needle[needle_len - 1];
1318 
1319  /* Start at startpos plus the length of the needle */
1320  hptr = &haystack[start_pos + needle_len - 1];
1321  while (hptr < haystack_end)
1322  {
1323  /* Match the needle scanning *backward* */
1324  const pg_wchar *nptr;
1325  const pg_wchar *p;
1326 
1327  nptr = needle_last;
1328  p = hptr;
1329  while (*nptr == *p)
1330  {
1331  /* Matched it all? If so, return 1-based position */
1332  if (nptr == needle)
1333  return p - haystack + 1;
1334  nptr--, p--;
1335  }
1336 
1337  /*
1338  * No match, so use the haystack char at hptr to decide how
1339  * far to advance. If the needle had any occurrence of that
1340  * character (or more precisely, one sharing the same
1341  * skiptable entry) before its last character, then we advance
1342  * far enough to align the last such needle character with
1343  * that haystack position. Otherwise we can advance by the
1344  * whole needle length.
1345  */
1346  hptr += state->skiptable[*hptr & skiptablemask];
1347  }
1348  }
1349  }
1350 
1351  return 0; /* not found */
1352 }
1353 
/*
 * Release the working storage held by a text-position search state.
 *
 * Only the pg_wchar arrays are freed, and only when use_wchar is set: per
 * the TextPositionState field comments those are palloc'd, whereas
 * str1/str2 point into the original texts and are not owned by the state.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 1354 -> 1356).
 */
1354 static void
1356 {
1357  if (state->use_wchar)
1358  {
1359  pfree(state->wstr1);
1360  pfree(state->wstr2);
1361  }
1362 }
1363 
1364 /* varstr_cmp()
1365  * Comparison function for text strings with given lengths.
1366  * Includes locale support, but must copy strings to temporary memory
1367  * to allow null-termination for inputs to strcoll().
1368  * Returns an integer less than, equal to, or greater than zero, indicating
1369  * whether arg1 is less than, equal to, or greater than arg2.
1370  */
1371 int
1372 varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
1373 {
1374  int result;
1375 
1376  /*
1377  * Unfortunately, there is no strncoll(), so in the non-C locale case we
1378  * have to do some memory copying. This turns out to be significantly
1379  * slower, so we optimize the case where LC_COLLATE is C. We also try to
1380  * optimize relatively-short strings by avoiding palloc/pfree overhead.
1381  */
1382  if (lc_collate_is_c(collid))
1383  {
1384  result = memcmp(arg1, arg2, Min(len1, len2));
1385  if ((result == 0) && (len1 != len2))
1386  result = (len1 < len2) ? -1 : 1;
1387  }
1388  else
1389  {
1390  char a1buf[TEXTBUFLEN];
1391  char a2buf[TEXTBUFLEN];
1392  char *a1p,
1393  *a2p;
1394 
1395 #ifdef HAVE_LOCALE_T
1396  pg_locale_t mylocale = 0;
1397 #endif
1398 
1399  if (collid != DEFAULT_COLLATION_OID)
1400  {
1401  if (!OidIsValid(collid))
1402  {
1403  /*
1404  * This typically means that the parser could not resolve a
1405  * conflict of implicit collations, so report it that way.
1406  */
1407  ereport(ERROR,
1408  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1409  errmsg("could not determine which collation to use for string comparison"),
1410  errhint("Use the COLLATE clause to set the collation explicitly.")));
1411  }
1412 #ifdef HAVE_LOCALE_T
1413  mylocale = pg_newlocale_from_collation(collid);
1414 #endif
1415  }
1416 
1417  /*
1418  * memcmp() can't tell us which of two unequal strings sorts first, but
1419  * it's a cheap way to tell if they're equal. Testing shows that
1420  * memcmp() followed by strcoll() is only trivially slower than
1421  * strcoll() by itself, so we don't lose much if this doesn't work out
1422  * very often, and if it does - for example, because there are many
1423  * equal strings in the input - then we win big by avoiding expensive
1424  * collation-aware comparisons.
1425  */
1426  if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1427  return 0;
1428 
1429 #ifdef WIN32
1430  /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1431  if (GetDatabaseEncoding() == PG_UTF8)
1432  {
1433  int a1len;
1434  int a2len;
1435  int r;
1436 
1437  if (len1 >= TEXTBUFLEN / 2)
1438  {
1439  a1len = len1 * 2 + 2;
1440  a1p = palloc(a1len);
1441  }
1442  else
1443  {
1444  a1len = TEXTBUFLEN;
1445  a1p = a1buf;
1446  }
1447  if (len2 >= TEXTBUFLEN / 2)
1448  {
1449  a2len = len2 * 2 + 2;
1450  a2p = palloc(a2len);
1451  }
1452  else
1453  {
1454  a2len = TEXTBUFLEN;
1455  a2p = a2buf;
1456  }
1457 
1458  /* stupid Microsloth API does not work for zero-length input */
1459  if (len1 == 0)
1460  r = 0;
1461  else
1462  {
1463  r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1464  (LPWSTR) a1p, a1len / 2);
1465  if (!r)
1466  ereport(ERROR,
1467  (errmsg("could not convert string to UTF-16: error code %lu",
1468  GetLastError())));
1469  }
1470  ((LPWSTR) a1p)[r] = 0;
1471 
1472  if (len2 == 0)
1473  r = 0;
1474  else
1475  {
1476  r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1477  (LPWSTR) a2p, a2len / 2);
1478  if (!r)
1479  ereport(ERROR,
1480  (errmsg("could not convert string to UTF-16: error code %lu",
1481  GetLastError())));
1482  }
1483  ((LPWSTR) a2p)[r] = 0;
1484 
1485  errno = 0;
1486 #ifdef HAVE_LOCALE_T
1487  if (mylocale)
1488  result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale);
1489  else
1490 #endif
1491  result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1492  if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1493  * headers */
1494  ereport(ERROR,
1495  (errmsg("could not compare Unicode strings: %m")));
1496 
1497  /*
1498  * In some locales wcscoll() can claim that nonidentical strings
1499  * are equal. Believing that would be bad news for a number of
1500  * reasons, so we follow Perl's lead and sort "equal" strings
1501  * according to strcmp (on the UTF-8 representation).
1502  */
1503  if (result == 0)
1504  {
1505  result = memcmp(arg1, arg2, Min(len1, len2));
1506  if ((result == 0) && (len1 != len2))
1507  result = (len1 < len2) ? -1 : 1;
1508  }
1509 
1510  if (a1p != a1buf)
1511  pfree(a1p);
1512  if (a2p != a2buf)
1513  pfree(a2p);
1514 
1515  return result;
1516  }
1517 #endif /* WIN32 */
1518 
1519  if (len1 >= TEXTBUFLEN)
1520  a1p = (char *) palloc(len1 + 1);
1521  else
1522  a1p = a1buf;
1523  if (len2 >= TEXTBUFLEN)
1524  a2p = (char *) palloc(len2 + 1);
1525  else
1526  a2p = a2buf;
1527 
1528  memcpy(a1p, arg1, len1);
1529  a1p[len1] = '\0';
1530  memcpy(a2p, arg2, len2);
1531  a2p[len2] = '\0';
1532 
1533 #ifdef HAVE_LOCALE_T
1534  if (mylocale)
1535  result = strcoll_l(a1p, a2p, mylocale);
1536  else
1537 #endif
1538  result = strcoll(a1p, a2p);
1539 
1540  /*
1541  * In some locales strcoll() can claim that nonidentical strings are
1542  * equal. Believing that would be bad news for a number of reasons,
1543  * so we follow Perl's lead and sort "equal" strings according to
1544  * strcmp().
1545  */
1546  if (result == 0)
1547  result = strcmp(a1p, a2p);
1548 
1549  if (a1p != a1buf)
1550  pfree(a1p);
1551  if (a2p != a2buf)
1552  pfree(a2p);
1553  }
1554 
1555  return result;
1556 }
1557 
1558 /* text_cmp()
1559  * Internal comparison function for text strings.
1560  * Returns -1, 0 or 1
1561  */
1562 static int
1563 text_cmp(text *arg1, text *arg2, Oid collid)
1564 {
1565  char *a1p,
1566  *a2p;
1567  int len1,
1568  len2;
1569 
1570  a1p = VARDATA_ANY(arg1);
1571  a2p = VARDATA_ANY(arg2);
1572 
1573  len1 = VARSIZE_ANY_EXHDR(arg1);
1574  len2 = VARSIZE_ANY_EXHDR(arg2);
1575 
1576  return varstr_cmp(a1p, len1, a2p, len2, collid);
1577 }
1578 
1579 /*
1580  * Comparison functions for text strings.
1581  *
1582  * Note: btree indexes need these routines not to leak memory; therefore,
1583  * be careful to free working copies of toasted datums. Most places don't
1584  * need to be so careful.
1585  */
1586 
/*
 * Equality of two text arguments (args 0 and 1), decided bytewise.
 *
 * Returns true only when both values have the same raw size and identical
 * payload bytes.  The collation is not consulted.  The values are detoasted
 * only if their raw sizes match, and any detoasted copies are freed.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 1587 -> 1589).
 */
1587 Datum
1589 {
1590  Datum arg1 = PG_GETARG_DATUM(0);
1591  Datum arg2 = PG_GETARG_DATUM(1);
1592  bool result;
1593  Size len1,
1594  len2;
1595 
1596  /*
1597  * Since we only care about equality or not-equality, we can avoid all the
1598  * expense of strcoll() here, and just do bitwise comparison. In fact, we
1599  * don't even have to do a bitwise comparison if we can show the lengths
1600  * of the strings are unequal; which might save us from having to detoast
1601  * one or both values.
1602  */
1603  len1 = toast_raw_datum_size(arg1);
1604  len2 = toast_raw_datum_size(arg2);
1605  if (len1 != len2)
1606  result = false;
1607  else
1608  {
1609  text *targ1 = DatumGetTextPP(arg1);
1610  text *targ2 = DatumGetTextPP(arg2);
1611 
 /* len1 is a raw size including the header, hence the VARHDRSZ subtraction */
1612  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1613  len1 - VARHDRSZ) == 0);
1614 
1615  PG_FREE_IF_COPY(targ1, 0);
1616  PG_FREE_IF_COPY(targ2, 1);
1617  }
1618 
1619  PG_RETURN_BOOL(result);
1620 }
1621 
/*
 * Inequality of two text arguments (args 0 and 1), decided bytewise.
 *
 * Returns true when the raw sizes differ or the payload bytes differ; the
 * collation is not consulted.  The values are detoasted only if their raw
 * sizes match, and any detoasted copies are freed.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 1622 -> 1624).
 */
1622 Datum
1624 {
1625  Datum arg1 = PG_GETARG_DATUM(0);
1626  Datum arg2 = PG_GETARG_DATUM(1);
1627  bool result;
1628  Size len1,
1629  len2;
1630 
1631  /* See comment in texteq() */
1632  len1 = toast_raw_datum_size(arg1);
1633  len2 = toast_raw_datum_size(arg2);
1634  if (len1 != len2)
1635  result = true;
1636  else
1637  {
1638  text *targ1 = DatumGetTextPP(arg1);
1639  text *targ2 = DatumGetTextPP(arg2);
1640 
 /* len1 is a raw size including the header, hence the VARHDRSZ subtraction */
1641  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1642  len1 - VARHDRSZ) != 0);
1643 
1644  PG_FREE_IF_COPY(targ1, 0);
1645  PG_FREE_IF_COPY(targ2, 1);
1646  }
1647 
1648  PG_RETURN_BOOL(result);
1649 }
1650 
/*
 * "Less than" for text: true when text_cmp() of args 0 and 1, under the
 * collation of this call, is negative.  Detoasted copies are freed before
 * returning.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 1651 -> 1653).
 */
1651 Datum
1653 {
1654  text *arg1 = PG_GETARG_TEXT_PP(0);
1655  text *arg2 = PG_GETARG_TEXT_PP(1);
1656  bool result;
1657 
1658  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1659 
1660  PG_FREE_IF_COPY(arg1, 0);
1661  PG_FREE_IF_COPY(arg2, 1);
1662 
1663  PG_RETURN_BOOL(result);
1664 }
1665 
/*
 * "Less than or equal" for text: true when text_cmp() of args 0 and 1,
 * under the collation of this call, is zero or negative.  Detoasted copies
 * are freed before returning.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 1666 -> 1668).
 */
1666 Datum
1668 {
1669  text *arg1 = PG_GETARG_TEXT_PP(0);
1670  text *arg2 = PG_GETARG_TEXT_PP(1);
1671  bool result;
1672 
1673  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1674 
1675  PG_FREE_IF_COPY(arg1, 0);
1676  PG_FREE_IF_COPY(arg2, 1);
1677 
1678  PG_RETURN_BOOL(result);
1679 }
1680 
/*
 * "Greater than" for text: true when text_cmp() of args 0 and 1, under the
 * collation of this call, is positive.  Detoasted copies are freed before
 * returning.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 1681 -> 1683).
 */
1681 Datum
1683 {
1684  text *arg1 = PG_GETARG_TEXT_PP(0);
1685  text *arg2 = PG_GETARG_TEXT_PP(1);
1686  bool result;
1687 
1688  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1689 
1690  PG_FREE_IF_COPY(arg1, 0);
1691  PG_FREE_IF_COPY(arg2, 1);
1692 
1693  PG_RETURN_BOOL(result);
1694 }
1695 
/*
 * "Greater than or equal" for text: true when text_cmp() of args 0 and 1,
 * under the collation of this call, is zero or positive.  Detoasted copies
 * are freed before returning.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 1696 -> 1698).
 */
1696 Datum
1698 {
1699  text *arg1 = PG_GETARG_TEXT_PP(0);
1700  text *arg2 = PG_GETARG_TEXT_PP(1);
1701  bool result;
1702 
1703  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1704 
1705  PG_FREE_IF_COPY(arg1, 0);
1706  PG_FREE_IF_COPY(arg2, 1);
1707 
1708  PG_RETURN_BOOL(result);
1709 }
1710 
/*
 * Three-way comparison for text: returns, as an int32, the result of
 * text_cmp() on args 0 and 1 under the collation of this call (negative,
 * zero, or positive).  Detoasted copies are freed before returning.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 1711 -> 1713).
 */
1711 Datum
1713 {
1714  text *arg1 = PG_GETARG_TEXT_PP(0);
1715  text *arg2 = PG_GETARG_TEXT_PP(1);
1716  int32 result;
1717 
1718  result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1719 
1720  PG_FREE_IF_COPY(arg1, 0);
1721  PG_FREE_IF_COPY(arg2, 1);
1722 
1723  PG_RETURN_INT32(result);
1724 }
1725 
/*
 * Sort-support setup entry point for text.
 *
 * Runs btsortsupport_worker() for ssup with the collation taken from
 * ssup->ssup_collation.  The worker is called with ssup->ssup_cxt as the
 * current memory context, so whatever it allocates lives there; the
 * caller's context is restored afterwards.  Returns void.
 *
 * NOTE(review): two lines are absent from this listing: the one carrying
 * the function name and parameter list (1727) and the one that presumably
 * declares and initializes ssup (1729) -- confirm against the real source.
 */
1726 Datum
1728 {
1730  Oid collid = ssup->ssup_collation;
1731  MemoryContext oldcontext;
1732 
1733  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1734 
1735  btsortsupport_worker(ssup, collid);
1736 
1737  MemoryContextSwitchTo(oldcontext);
1738 
1739  PG_RETURN_VOID();
1740 }
1741 
/*
 * Fill in ssup for sorting text under collation collid.
 *
 * Chooses a direct comparator where one is available, and -- if abbreviated
 * keys were requested or the collation is not C -- allocates a
 * TextSortSupport with two TEXTBUFLEN scratch buffers and stores it in
 * ssup->ssup_extra.  On WIN32 with a UTF-8 database and a non-C collation
 * it returns without changing ssup at all.
 *
 * Raises ERROR if a non-C, non-default collid is not a valid OID.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 1742 -> 1744); the body shows
 * the parameters are ssup and collid.
 */
1742 static void
1744 {
1745  bool abbreviate = ssup->abbreviate;
1746  bool collate_c = false;
1747  TextSortSupport *tss;
1748 
1749 #ifdef HAVE_LOCALE_T
1750  pg_locale_t locale = 0;
1751 #endif
1752 
1753  /*
1754  * If possible, set ssup->comparator to a function which can be used to
1755  * directly compare two datums. If we can do this, we'll avoid the
1756  * overhead of a trip through the fmgr layer for every comparison,
1757  * which can be substantial.
1758  *
1759  * Most typically, we'll set the comparator to bttextfastcmp_locale,
1760  * which uses strcoll() to perform comparisons. However, if LC_COLLATE
1761  * = C, we can make things quite a bit faster with bttextfastcmp_c,
1762  * which uses memcmp() rather than strcoll().
1763  *
1764  * There is a further exception on Windows. When the database encoding
1765  * is UTF-8 and we are not using the C collation, complex hacks are
1766  * required. We don't currently have a comparator that handles that case,
1767  * so we fall back on the slow method of having the sort code invoke
1768  * bttextcmp() via the fmgr trampoline.
1769  */
1770  if (lc_collate_is_c(collid))
1771  {
1772  ssup->comparator = bttextfastcmp_c;
1773  collate_c = true;
1774  }
1775 #ifdef WIN32
1776  else if (GetDatabaseEncoding() == PG_UTF8)
1777  return;
1778 #endif
1779  else
1780  {
 /*
  * NOTE(review): listing line 1781 is absent here.  The comment above
  * implies this branch installs bttextfastcmp_locale as
  * ssup->comparator -- confirm against the real source.
  */
1782 
1783  /*
1784  * We need a collation-sensitive comparison. To make things faster,
1785  * we'll figure out the collation based on the locale id and cache the
1786  * result.
1787  */
1788  if (collid != DEFAULT_COLLATION_OID)
1789  {
1790  if (!OidIsValid(collid))
1791  {
1792  /*
1793  * This typically means that the parser could not resolve a
1794  * conflict of implicit collations, so report it that way.
1795  */
1796  ereport(ERROR,
1797  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1798  errmsg("could not determine which collation to use for string comparison"),
1799  errhint("Use the COLLATE clause to set the collation explicitly.")));
1800  }
1801 #ifdef HAVE_LOCALE_T
1802  locale = pg_newlocale_from_collation(collid);
1803 #endif
1804  }
1805  }
1806 
1807  /*
1808  * It's possible that there are platforms where the use of abbreviated
1809  * keys should be disabled at compile time. Having only 4 byte datums
1810  * could make worst-case performance drastically more likely, for example.
1811  * Moreover, Darwin's strxfrm() implementation is known to not effectively
1812  * concentrate a significant amount of entropy from the original string in
1813  * earlier transformed blobs. It's possible that other supported platforms
1814  * are similarly encumbered. However, even in those cases, the abbreviated
1815  * keys optimization may win, and if it doesn't, the "abort abbreviation"
1816  * code may rescue us. So, for now, we don't disable this anywhere on the
1817  * basis of performance.
1818  */
1819 
1820  /*
1821  * If we're using abbreviated keys, or if we're using a locale-aware
1822  * comparison, we need to initialize a TextSortSupport object. Both cases
1823  * will make use of the temporary buffers we initialize here for scratch
1824  * space, and the abbreviation case requires additional state.
1825  */
1826  if (abbreviate || !collate_c)
1827  {
1828  tss = palloc(sizeof(TextSortSupport));
1829  tss->buf1 = palloc(TEXTBUFLEN);
1830  tss->buflen1 = TEXTBUFLEN;
1831  tss->buf2 = palloc(TEXTBUFLEN);
1832  tss->buflen2 = TEXTBUFLEN;
1833 #ifdef HAVE_LOCALE_T
1834  tss->locale = locale;
1835 #endif
1836  tss->collate_c = collate_c;
1837  ssup->ssup_extra = tss;
1838 
1839  /*
1840  * If possible, plan to use the abbreviated keys optimization. The
1841  * core code may switch back to authoritative comparator should
1842  * abbreviation be aborted.
1843  */
1844  if (abbreviate)
1845  {
 /* Starting threshold for the abort heuristic; two cardinality estimators. */
1846  tss->prop_card = 0.20;
1847  initHyperLogLog(&tss->abbr_card, 10);
1848  initHyperLogLog(&tss->full_card, 10);
 /* Keep the full comparator reachable before replacing it. */
1849  ssup->abbrev_full_comparator = ssup->comparator;
1850  ssup->comparator = bttextcmp_abbrev;
 /*
  * NOTE(review): listing lines 1851-1852 are absent here; presumably
  * they install the abbreviation converter and abort callbacks --
  * confirm against the real source.
  */
1853  }
1854  }
1855 }
1856 
1857 /*
1858  * sortsupport comparison func (for C locale case)
 *
 * Compares the payloads of text datums x and y with memcmp(); when one is
 * a prefix of the other, the shorter one sorts first.  Returns a negative,
 * zero, or positive value.  Any detoasted copy made here is freed before
 * returning.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 1860 -> 1862); the body shows
 * Datum parameters x and y.
1859  */
1860 static int
1862 {
1863  text *arg1 = DatumGetTextPP(x);
1864  text *arg2 = DatumGetTextPP(y);
1865  char *a1p,
1866  *a2p;
1867  int len1,
1868  len2,
1869  result;
1870 
1871  a1p = VARDATA_ANY(arg1);
1872  a2p = VARDATA_ANY(arg2);
1873 
1874  len1 = VARSIZE_ANY_EXHDR(arg1);
1875  len2 = VARSIZE_ANY_EXHDR(arg2);
1876 
1877  result = memcmp(a1p, a2p, Min(len1, len2));
1878  if ((result == 0) && (len1 != len2))
1879  result = (len1 < len2) ? -1 : 1;
1880 
1881  /* We can't afford to leak memory here. */
 /* A pointer that differs from the input datum means a copy was made. */
1882  if (PointerGetDatum(arg1) != x)
1883  pfree(arg1);
1884  if (PointerGetDatum(arg2) != y)
1885  pfree(arg2);
1886 
1887  return result;
1888 }
1889 
1890 /*
1891  * sortsupport comparison func (for locale case)
 *
 * Compares text datums x and y with strcoll() (or strcoll_l() when a
 * locale is stored in the TextSortSupport), breaking strcoll() ties with
 * strcmp().  Byte-identical inputs are detected up front and reported as
 * equal without calling strcoll().  Returns a negative, zero, or positive
 * value.
 *
 * The NUL-terminated working copies live in tss->buf1/buf2, which are
 * reused across calls and regrown in ssup->ssup_cxt when too small.  Any
 * detoasted copy made here is freed before returning.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 1893 -> 1895); the body shows
 * parameters x, y and ssup.
1892  */
1893 static int
1895 {
1896  text *arg1 = DatumGetTextPP(x);
1897  text *arg2 = DatumGetTextPP(y);
1898  TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra;
1899 
1900  /* working state */
1901  char *a1p,
1902  *a2p;
1903  int len1,
1904  len2,
1905  result;
1906 
1907  a1p = VARDATA_ANY(arg1);
1908  a2p = VARDATA_ANY(arg2);
1909 
1910  len1 = VARSIZE_ANY_EXHDR(arg1);
1911  len2 = VARSIZE_ANY_EXHDR(arg2);
1912 
1913  /* Fast pre-check for equality, as discussed in varstr_cmp() */
1914  if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
1915  {
1916  result = 0;
1917  goto done;
1918  }
1919 
 /* ">=" rather than ">": the buffer must also hold the trailing NUL. */
1920  if (len1 >= tss->buflen1)
1921  {
1922  pfree(tss->buf1);
1923  tss->buflen1 = Max(len1 + 1, Min(tss->buflen1 * 2, MaxAllocSize));
1924  tss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen1);
1925  }
1926  if (len2 >= tss->buflen2)
1927  {
1928  pfree(tss->buf2);
1929  tss->buflen2 = Max(len2 + 1, Min(tss->buflen2 * 2, MaxAllocSize));
1930  tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2);
1931  }
1932 
1933  memcpy(tss->buf1, a1p, len1);
1934  tss->buf1[len1] = '\0';
1935  memcpy(tss->buf2, a2p, len2);
1936  tss->buf2[len2] = '\0';
1937 
1938 #ifdef HAVE_LOCALE_T
1939  if (tss->locale)
1940  result = strcoll_l(tss->buf1, tss->buf2, tss->locale);
1941  else
1942 #endif
1943  result = strcoll(tss->buf1, tss->buf2);
1944 
1945  /*
1946  * In some locales strcoll() can claim that nonidentical strings are equal.
1947  * Believing that would be bad news for a number of reasons, so we follow
1948  * Perl's lead and sort "equal" strings according to strcmp().
1949  */
1950  if (result == 0)
1951  result = strcmp(tss->buf1, tss->buf2);
1952 
1953 done:
1954  /* We can't afford to leak memory here. */
1955  if (PointerGetDatum(arg1) != x)
1956  pfree(arg1);
1957  if (PointerGetDatum(arg2) != y)
1958  pfree(arg2);
1959 
1960  return result;
1961 }
1962 
1963 /*
1964  * Abbreviated key comparison func
 *
 * Treats the two pass-by-value datums x and y as arrays of sizeof(Datum)
 * bytes and returns the memcmp() result over them (negative, zero, or
 * positive).  A zero result is not authoritative; see the comment in the
 * body.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 1966 -> 1968); the body shows
 * Datum parameters x and y.
1965  */
1966 static int
1968 {
1969  char *a = (char *) &x;
1970  char *b = (char *) &y;
1971  int result;
1972 
1973  result = memcmp(a, b, sizeof(Datum));
1974 
1975  /*
1976  * When result = 0, the core system will call bttextfastcmp_c() or
1977  * bttextfastcmp_locale(). Even a strcmp() on two non-truncated strxfrm()
1978  * blobs cannot indicate *equality* authoritatively, for the same reason
1979  * that there is a strcoll() tie-breaker call to strcmp() in varstr_cmp().
1980  */
1981  return result;
1982 }
1983 
1984 /*
1985  * Conversion routine for sortsupport. Converts original text to abbreviated
1986  * key representation. Our encoding strategy is simple -- pack the first 8
1987  * bytes of a strxfrm() blob into a Datum.
1988  */
1989 static Datum
1991 {
1992  TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra;
1993  text *authoritative = DatumGetTextPP(original);
1994  char *authoritative_data = VARDATA_ANY(authoritative);
1995 
1996  /* working state */
1997  Datum res;
1998  char *pres;
1999  int len;
2000  uint32 hash;
2001 
2002  /*
2003  * Abbreviated key representation is a pass-by-value Datum that is treated
2004  * as a char array by the specialized comparator bttextcmp_abbrev().
2005  */
2006  pres = (char *) &res;
2007  /* memset(), so any non-overwritten bytes are NUL */
2008  memset(pres, 0, sizeof(Datum));
2009  len = VARSIZE_ANY_EXHDR(authoritative);
2010 
2011  /*
2012  * If we're using the C collation, use memcmp(), rather than strxfrm(),
2013  * to abbreviate keys. The full comparator for the C locale is always
2014  * memcmp(), and we can't risk having this give a different answer.
2015  * Besides, this should be faster, too.
2016  */
2017  if (tss->collate_c)
2018  memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2019  else
2020  {
2021  Size bsize;
2022 
2023  /*
2024  * We're not using the C collation, so fall back on strxfrm.
2025  */
2026 
2027  /* By convention, we use buffer 1 to store and NUL-terminate text */
2028  if (len >= tss->buflen1)
2029  {
2030  pfree(tss->buf1);
2031  tss->buflen1 = Max(len + 1, Min(tss->buflen1 * 2, MaxAllocSize));
2032  tss->buf1 = palloc(tss->buflen1);
2033  }
2034 
2035  /* Just like strcoll(), strxfrm() expects a NUL-terminated string */
2036  memcpy(tss->buf1, VARDATA_ANY(authoritative), len);
2037  tss->buf1[len] = '\0';
2038 
2039  /* Don't leak memory here */
2040  if (PointerGetDatum(authoritative) != original)
2041  pfree(authoritative);
2042 
2043  for (;;)
2044  {
2045 #ifdef HAVE_LOCALE_T
2046  if (tss->locale)
2047  bsize = strxfrm_l(tss->buf2, tss->buf1,
2048  tss->buflen2, tss->locale);
2049  else
2050 #endif
2051  bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2);
2052 
2053  if (bsize < tss->buflen2)
2054  break;
2055 
2056  /*
2057  * The C standard states that the contents of the buffer is now
2058  * unspecified. Grow buffer, and retry.
2059  */
2060  pfree(tss->buf2);
2061  tss->buflen2 = Max(bsize + 1,
2062  Min(tss->buflen2 * 2, MaxAllocSize));
2063  tss->buf2 = palloc(tss->buflen2);
2064  }
2065 
2066  /*
2067  * Every Datum byte is always compared. This is safe because the
2068  * strxfrm() blob is itself NUL terminated, leaving no danger of
2069  * misinterpreting any NUL bytes not intended to be interpreted as
2070  * logically representing termination.
2071  */
2072  memcpy(pres, tss->buf2, Min(sizeof(Datum), bsize));
2073  }
2074 
2075  /*
2076  * Maintain approximate cardinality of both abbreviated keys and original,
2077  * authoritative keys using HyperLogLog. Used as cheap insurance against
2078  * the worst case, where we do many string transformations for no saving in
2079  * full strcoll()-based comparisons. These statistics are used by
2080  * bttext_abbrev_abort().
2081  *
2082  * First, Hash key proper, or a significant fraction of it. Mix in length
2083  * in order to compensate for cases where differences are past
2084  * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2085  */
2086  hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2087  Min(len, PG_CACHE_LINE_SIZE)));
2088 
2089  if (len > PG_CACHE_LINE_SIZE)
2090  hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2091 
2092  addHyperLogLog(&tss->full_card, hash);
2093 
2094  /* Hash abbreviated key */
2095 #if SIZEOF_DATUM == 8
2096  {
2097  uint32 lohalf,
2098  hihalf;
2099 
2100  lohalf = (uint32) res;
2101  hihalf = (uint32) (res >> 32);
2102  hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2103  }
2104 #else /* SIZEOF_DATUM != 8 */
2105  hash = DatumGetUInt32(hash_uint32((uint32) res));
2106 #endif
2107 
2108  addHyperLogLog(&tss->abbr_card, hash);
2109 
2110  return res;
2111 }
2112 
2113 /*
2114  * Callback for estimating effectiveness of abbreviated key optimization, using
2115  * heuristic rules. Returns value indicating if the abbreviation optimization
2116  * should be aborted, based on its projected effectiveness.
2117  */
2118 static bool
2119 bttext_abbrev_abort(int memtupcount, SortSupport ssup)
2120 {
2121  TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra;
2122  double abbrev_distinct, key_distinct;
2123 
2124  Assert(ssup->abbreviate);
2125 
2126  /* Have a little patience */
2127  if (memtupcount < 100)
2128  return false;
2129 
2130  abbrev_distinct = estimateHyperLogLog(&tss->abbr_card);
2131  key_distinct = estimateHyperLogLog(&tss->full_card);
2132 
2133  /*
2134  * Clamp cardinality estimates to at least one distinct value. While NULLs
2135  * are generally disregarded, if only NULL values were seen so far, that
2136  * might misrepresent costs if we failed to clamp.
2137  */
2138  if (abbrev_distinct <= 1.0)
2139  abbrev_distinct = 1.0;
2140 
2141  if (key_distinct <= 1.0)
2142  key_distinct = 1.0;
2143 
2144  /*
2145  * In the worst case all abbreviated keys are identical, while at the same
2146  * time there are differences within full key strings not captured in
2147  * abbreviations.
2148  */
2149 #ifdef TRACE_SORT
2150  if (trace_sort)
2151  {
2152  double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2153 
2154  elog(LOG, "bttext_abbrev: abbrev_distinct after %d: %f "
2155  "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2156  memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2157  tss->prop_card);
2158  }
2159 #endif
2160 
2161  /*
2162  * If the number of distinct abbreviated keys approximately matches the
2163  * number of distinct authoritative original keys, that's reason enough to
2164  * proceed. We can win even with a very low cardinality set if most
2165  * tie-breakers only memcmp(). This is by far the most important
2166  * consideration.
2167  *
2168  * While comparisons that are resolved at the abbreviated key level are
2169  * considerably cheaper than tie-breakers resolved with memcmp(), both of
2170  * those two outcomes are so much cheaper than a full strcoll() once
2171  * sorting is underway that it doesn't seem worth it to weigh abbreviated
2172  * cardinality against the overall size of the set in order to more
2173  * accurately model costs. Assume that an abbreviated comparison, and an
2174  * abbreviated comparison with a cheap memcmp()-based authoritative
2175  * resolution are equivalent.
2176  */
2177  if (abbrev_distinct > key_distinct * tss->prop_card)
2178  {
2179  /*
2180  * When we have exceeded 10,000 tuples, decay required cardinality
2181  * aggressively for next call.
2182  *
2183  * This is useful because the number of comparisons required on average
2184  * increases at a linearithmic rate, and at roughly 10,000 tuples that
2185  * factor will start to dominate over the linear costs of string
2186  * transformation (this is a conservative estimate). The decay rate is
2187  * chosen to be a little less aggressive than halving -- which (since
2188  * we're called at points at which memtupcount has doubled) would never
2189  * see the cost model actually abort past the first call following a
2190  * decay. This decay rate is mostly a precaution against a sudden,
2191  * violent swing in how well abbreviated cardinality tracks full key
2192  * cardinality. The decay also serves to prevent a marginal case from
2193  * being aborted too late, when too much has already been invested in
2194  * string transformation.
2195  *
2196  * It's possible for sets of several million distinct strings with mere
2197  * tens of thousands of distinct abbreviated keys to still benefit very
2198  * significantly. This will generally occur provided each abbreviated
2199  * key is a proxy for a roughly uniform number of the set's full keys.
2200  * If it isn't so, we hope to catch that early and abort. If it isn't
2201  * caught early, by the time the problem is apparent it's probably not
2202  * worth aborting.
2203  */
2204  if (memtupcount > 10000)
2205  tss->prop_card *= 0.65;
2206 
2207  return false;
2208  }
2209 
2210  /*
2211  * Abort abbreviation strategy.
2212  *
2213  * The worst case, where all abbreviated keys are identical while all
2214  * original strings differ will typically only see a regression of about
2215  * 10% in execution time for small to medium sized lists of strings.
2216  * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2217  * often expect very large improvements, particularly with sets of strings
2218  * of moderately high to high abbreviated cardinality. There is little to
2219  * lose but much to gain, which our strategy reflects.
2220  */
2221 #ifdef TRACE_SORT
2222  if (trace_sort)
2223  elog(LOG, "bttext_abbrev: aborted abbreviation at %d "
2224  "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2225  memtupcount, abbrev_distinct, key_distinct, tss->prop_card);
2226 #endif
2227 
2228  return true;
2229 }
2230 
/*
 * Return whichever of the two text arguments compares greater under the
 * collation of this call; arg 1 is returned when they compare equal.
 *
 * The inputs are not freed here, since the result is one of them.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 2231 -> 2233).
 */
2231 Datum
2233 {
2234  text *arg1 = PG_GETARG_TEXT_PP(0);
2235  text *arg2 = PG_GETARG_TEXT_PP(1);
2236  text *result;
2237 
2238  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2239 
2240  PG_RETURN_TEXT_P(result);
2241 }
2242 
/*
 * Return whichever of the two text arguments compares smaller under the
 * collation of this call; arg 1 is returned when they compare equal.
 *
 * The inputs are not freed here, since the result is one of them.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 2243 -> 2245).
 */
2243 Datum
2245 {
2246  text *arg1 = PG_GETARG_TEXT_PP(0);
2247  text *arg2 = PG_GETARG_TEXT_PP(1);
2248  text *result;
2249 
2250  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2251 
2252  PG_RETURN_TEXT_P(result);
2253 }
2254 
2255 
2256 /*
2257  * The following operators support character-by-character comparison
2258  * of text datums, to allow building indexes suitable for LIKE clauses.
2259  * Note that the regular texteq/textne comparison operators are assumed
2260  * to be compatible with these!
2261  */
2262 
/*
 * Bytewise three-way comparison of two text values, ignoring collation.
 *
 * Compares the common-length prefix with memcmp().  If the prefixes differ,
 * the memcmp() result is returned as is (any negative or positive value);
 * otherwise the shorter value sorts first and the result is exactly -1, 0
 * or 1.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 2263 -> 2265); the body shows
 * parameters arg1 and arg2.
 */
2263 static int
2265 {
2266  int result;
2267  int len1,
2268  len2;
2269 
2270  len1 = VARSIZE_ANY_EXHDR(arg1);
2271  len2 = VARSIZE_ANY_EXHDR(arg2);
2272 
2273  result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2274  if (result != 0)
2275  return result;
2276  else if (len1 < len2)
2277  return -1;
2278  else if (len1 > len2)
2279  return 1;
2280  else
2281  return 0;
2282 }
2283 
2284 
/*
 * Bytewise "less than" for text: true when
 * internal_text_pattern_compare() of args 0 and 1 is negative.  Detoasted
 * copies are freed before returning.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (numbering jumps 2285 -> 2287).
 */
2285 Datum
2287 {
2288  text *arg1 = PG_GETARG_TEXT_PP(0);
2289  text *arg2 = PG_GETARG_TEXT_PP(1);
2290  int result;
2291 
2292  result = internal_text_pattern_compare(arg1, arg2);
2293 
2294  PG_FREE_IF_COPY(arg1, 0);
2295  PG_FREE_IF_COPY(arg2, 1);
2296 
2297  PG_RETURN_BOOL(result < 0);
2298 }
2299 
2300 
2301 Datum
2303 {
2304  text *arg1 = PG_GETARG_TEXT_PP(0);
2305  text *arg2 = PG_GETARG_TEXT_PP(1);
2306  int result;
2307 
2308  result = internal_text_pattern_compare(arg1, arg2);
2309 
2310  PG_FREE_IF_COPY(arg1, 0);
2311  PG_FREE_IF_COPY(arg2, 1);
2312 
2313  PG_RETURN_BOOL(result <= 0);
2314 }
2315 
2316 
2317 Datum
2319 {
2320  text *arg1 = PG_GETARG_TEXT_PP(0);
2321  text *arg2 = PG_GETARG_TEXT_PP(1);
2322  int result;
2323 
2324  result = internal_text_pattern_compare(arg1, arg2);
2325 
2326  PG_FREE_IF_COPY(arg1, 0);
2327  PG_FREE_IF_COPY(arg2, 1);
2328 
2329  PG_RETURN_BOOL(result >= 0);
2330 }
2331 
2332 
2333 Datum
2335 {
2336  text *arg1 = PG_GETARG_TEXT_PP(0);
2337  text *arg2 = PG_GETARG_TEXT_PP(1);
2338  int result;
2339 
2340  result = internal_text_pattern_compare(arg1, arg2);
2341 
2342  PG_FREE_IF_COPY(arg1, 0);
2343  PG_FREE_IF_COPY(arg2, 1);
2344 
2345  PG_RETURN_BOOL(result > 0);
2346 }
2347 
2348 
2349 Datum
2351 {
2352  text *arg1 = PG_GETARG_TEXT_PP(0);
2353  text *arg2 = PG_GETARG_TEXT_PP(1);
2354  int result;
2355 
2356  result = internal_text_pattern_compare(arg1, arg2);
2357 
2358  PG_FREE_IF_COPY(arg1, 0);
2359  PG_FREE_IF_COPY(arg2, 1);
2360 
2361  PG_RETURN_INT32(result);
2362 }
2363 
2364 
2365 /*-------------------------------------------------------------
2366  * byteaoctetlen
2367  *
2368  * get the number of bytes contained in an instance of type 'bytea'
2369  *-------------------------------------------------------------
2370  */
2371 Datum
2373 {
2374  Datum str = PG_GETARG_DATUM(0);
2375 
2376  /* We need not detoast the input at all */
2378 }
2379 
2380 /*
2381  * byteacat -
2382  * takes two bytea* and returns a bytea* that is the concatenation of
2383  * the two.
2384  *
2385  * Cloned from textcat and modified as required.
2386  */
2387 Datum
2389 {
2390  bytea *t1 = PG_GETARG_BYTEA_PP(0);
2391  bytea *t2 = PG_GETARG_BYTEA_PP(1);
2392 
2394 }
2395 
2396 /*
2397  * bytea_catenate
2398  * Guts of byteacat(), broken out so it can be used by other functions
2399  *
2400  * Arguments can be in short-header form, but not compressed or out-of-line
2401  */
2402 static bytea *
2404 {
2405  bytea *result;
2406  int len1,
2407  len2,
2408  len;
2409  char *ptr;
2410 
2411  len1 = VARSIZE_ANY_EXHDR(t1);
2412  len2 = VARSIZE_ANY_EXHDR(t2);
2413 
2414  /* paranoia ... probably should throw error instead? */
2415  if (len1 < 0)
2416  len1 = 0;
2417  if (len2 < 0)
2418  len2 = 0;
2419 
2420  len = len1 + len2 + VARHDRSZ;
2421  result = (bytea *) palloc(len);
2422 
2423  /* Set size of result string... */
2424  SET_VARSIZE(result, len);
2425 
2426  /* Fill data field of result string... */
2427  ptr = VARDATA(result);
2428  if (len1 > 0)
2429  memcpy(ptr, VARDATA_ANY(t1), len1);
2430  if (len2 > 0)
2431  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
2432 
2433  return result;
2434 }
2435 
2436 #define PG_STR_GET_BYTEA(str_) \
2437  DatumGetByteaP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
2438 
2439 /*
2440  * bytea_substr()
2441  * Return a substring starting at the specified position.
2442  * Cloned from text_substr and modified as required.
2443  *
2444  * Input:
2445  * - string
2446  * - starting position (is one-based)
2447  * - string length (optional)
2448  *
2449  * If the starting position is zero or less, then return from the start of the string
2450  * adjusting the length to be consistent with the "negative start" per SQL.
2451  * If the length is less than zero, an ERROR is thrown. If no third argument
2452  * (length) is provided, the length to the end of the string is assumed.
2453  */
2454 Datum
2456 {
2458  PG_GETARG_INT32(1),
2459  PG_GETARG_INT32(2),
2460  false));
2461 }
2462 
2463 /*
2464  * bytea_substr_no_len -
2465  * Wrapper to avoid opr_sanity failure due to
2466  * one function accepting a different number of args.
2467  */
2468 Datum
2470 {
2472  PG_GETARG_INT32(1),
2473  -1,
2474  true));
2475 }
2476 
2477 static bytea *
2479  int S,
2480  int L,
2481  bool length_not_specified)
2482 {
2483  int S1; /* adjusted start position */
2484  int L1; /* adjusted substring length */
2485 
2486  S1 = Max(S, 1);
2487 
2488  if (length_not_specified)
2489  {
2490  /*
2491  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
2492  * end of the string if we pass it a negative value for length.
2493  */
2494  L1 = -1;
2495  }
2496  else
2497  {
2498  /* end position */
2499  int E = S + L;
2500 
2501  /*
2502  * A negative value for L is the only way for the end position to be
2503  * before the start. SQL99 says to throw an error.
2504  */
2505  if (E < S)
2506  ereport(ERROR,
2507  (errcode(ERRCODE_SUBSTRING_ERROR),
2508  errmsg("negative substring length not allowed")));
2509 
2510  /*
2511  * A zero or negative value for the end position can happen if the
2512  * start was negative or one. SQL99 says to return a zero-length
2513  * string.
2514  */
2515  if (E < 1)
2516  return PG_STR_GET_BYTEA("");
2517 
2518  L1 = E - S1;
2519  }
2520 
2521  /*
2522  * If the start position is past the end of the string, SQL99 says to
2523  * return a zero-length string -- DatumGetByteaPSlice() will do that for
2524  * us. Convert to zero-based starting position
2525  */
2526  return DatumGetByteaPSlice(str, S1 - 1, L1);
2527 }
2528 
2529 /*
2530  * byteaoverlay
2531  * Replace specified substring of first string with second
2532  *
2533  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
2534  * This code is a direct implementation of what the standard says.
2535  */
2536 Datum
2538 {
2539  bytea *t1 = PG_GETARG_BYTEA_PP(0);
2540  bytea *t2 = PG_GETARG_BYTEA_PP(1);
2541  int sp = PG_GETARG_INT32(2); /* substring start position */
2542  int sl = PG_GETARG_INT32(3); /* substring length */
2543 
2544  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2545 }
2546 
2547 Datum
2549 {
2550  bytea *t1 = PG_GETARG_BYTEA_PP(0);
2551  bytea *t2 = PG_GETARG_BYTEA_PP(1);
2552  int sp = PG_GETARG_INT32(2); /* substring start position */
2553  int sl;
2554 
2555  sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
2556  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2557 }
2558 
2559 static bytea *
2560 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
2561 {
2562  bytea *result;
2563  bytea *s1;
2564  bytea *s2;
2565  int sp_pl_sl;
2566 
2567  /*
2568  * Check for possible integer-overflow cases. For negative sp, throw a
2569  * "substring length" error because that's what should be expected
2570  * according to the spec's definition of OVERLAY().
2571  */
2572  if (sp <= 0)
2573  ereport(ERROR,
2574  (errcode(ERRCODE_SUBSTRING_ERROR),
2575  errmsg("negative substring length not allowed")));
2576  sp_pl_sl = sp + sl;
2577  if (sp_pl_sl <= sl)
2578  ereport(ERROR,
2579  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
2580  errmsg("integer out of range")));
2581 
2582  s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
2583  s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
2584  result = bytea_catenate(s1, t2);
2585  result = bytea_catenate(result, s2);
2586 
2587  return result;
2588 }
2589 
2590 /*
2591  * byteapos -
2592  * Return the position of the specified substring.
2593  * Implements the SQL POSITION() function.
2594  * Cloned from textpos and modified as required.
2595  */
2596 Datum
2598 {
2599  bytea *t1 = PG_GETARG_BYTEA_PP(0);
2600  bytea *t2 = PG_GETARG_BYTEA_PP(1);
2601  int pos;
2602  int px,
2603  p;
2604  int len1,
2605  len2;
2606  char *p1,
2607  *p2;
2608 
2609  len1 = VARSIZE_ANY_EXHDR(t1);
2610  len2 = VARSIZE_ANY_EXHDR(t2);
2611 
2612  if (len2 <= 0)
2613  PG_RETURN_INT32(1); /* result for empty pattern */
2614 
2615  p1 = VARDATA_ANY(t1);
2616  p2 = VARDATA_ANY(t2);
2617 
2618  pos = 0;
2619  px = (len1 - len2);
2620  for (p = 0; p <= px; p++)
2621  {
2622  if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
2623  {
2624  pos = p + 1;
2625  break;
2626  };
2627  p1++;
2628  };
2629 
2630  PG_RETURN_INT32(pos);
2631 }
2632 
2633 /*-------------------------------------------------------------
2634  * byteaGetByte
2635  *
2636  * this routine treats "bytea" as an array of bytes.
2637  * It returns the Nth byte (a number between 0 and 255).
2638  *-------------------------------------------------------------
2639  */
2640 Datum
2642 {
2643  bytea *v = PG_GETARG_BYTEA_PP(0);
2644  int32 n = PG_GETARG_INT32(1);
2645  int len;
2646  int byte;
2647 
2648  len = VARSIZE_ANY_EXHDR(v);
2649 
2650  if (n < 0 || n >= len)
2651  ereport(ERROR,
2652  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2653  errmsg("index %d out of valid range, 0..%d",
2654  n, len - 1)));
2655 
2656  byte = ((unsigned char *) VARDATA_ANY(v))[n];
2657 
2658  PG_RETURN_INT32(byte);
2659 }
2660 
2661 /*-------------------------------------------------------------
2662  * byteaGetBit
2663  *
2664  * This routine treats a "bytea" type like an array of bits.
2665  * It returns the value of the Nth bit (0 or 1).
2666  *
2667  *-------------------------------------------------------------
2668  */
2669 Datum
2671 {
2672  bytea *v = PG_GETARG_BYTEA_PP(0);
2673  int32 n = PG_GETARG_INT32(1);
2674  int byteNo,
2675  bitNo;
2676  int len;
2677  int byte;
2678 
2679  len = VARSIZE_ANY_EXHDR(v);
2680 
2681  if (n < 0 || n >= len * 8)
2682  ereport(ERROR,
2683  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2684  errmsg("index %d out of valid range, 0..%d",
2685  n, len * 8 - 1)));
2686 
2687  byteNo = n / 8;
2688  bitNo = n % 8;
2689 
2690  byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
2691 
2692  if (byte & (1 << bitNo))
2693  PG_RETURN_INT32(1);
2694  else
2695  PG_RETURN_INT32(0);
2696 }
2697 
2698 /*-------------------------------------------------------------
2699  * byteaSetByte
2700  *
2701  * Given an instance of type 'bytea' creates a new one with
2702  * the Nth byte set to the given value.
2703  *
2704  *-------------------------------------------------------------
2705  */
2706 Datum
2708 {
2709  bytea *v = PG_GETARG_BYTEA_P(0);
2710  int32 n = PG_GETARG_INT32(1);
2711  int32 newByte = PG_GETARG_INT32(2);
2712  int len;
2713  bytea *res;
2714 
2715  len = VARSIZE(v) - VARHDRSZ;
2716 
2717  if (n < 0 || n >= len)
2718  ereport(ERROR,
2719  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2720  errmsg("index %d out of valid range, 0..%d",
2721  n, len - 1)));
2722 
2723  /*
2724  * Make a copy of the original varlena.
2725  */
2726  res = (bytea *) palloc(VARSIZE(v));
2727  memcpy((char *) res, (char *) v, VARSIZE(v));
2728 
2729  /*
2730  * Now set the byte.
2731  */
2732  ((unsigned char *) VARDATA(res))[n] = newByte;
2733 
2734  PG_RETURN_BYTEA_P(res);
2735 }
2736 
2737 /*-------------------------------------------------------------
2738  * byteaSetBit
2739  *
2740  * Given an instance of type 'bytea' creates a new one with
2741  * the Nth bit set to the given value.
2742  *
2743  *-------------------------------------------------------------
2744  */
2745 Datum
2747 {
2748  bytea *v = PG_GETARG_BYTEA_P(0);
2749  int32 n = PG_GETARG_INT32(1);
2750  int32 newBit = PG_GETARG_INT32(2);
2751  bytea *res;
2752  int len;
2753  int oldByte,
2754  newByte;
2755  int byteNo,
2756  bitNo;
2757 
2758  len = VARSIZE(v) - VARHDRSZ;
2759 
2760  if (n < 0 || n >= len * 8)
2761  ereport(ERROR,
2762  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2763  errmsg("index %d out of valid range, 0..%d",
2764  n, len * 8 - 1)));
2765 
2766  byteNo = n / 8;
2767  bitNo = n % 8;
2768 
2769  /*
2770  * sanity check!
2771  */
2772  if (newBit != 0 && newBit != 1)
2773  ereport(ERROR,
2774  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2775  errmsg("new bit must be 0 or 1")));
2776 
2777  /*
2778  * Make a copy of the original varlena.
2779  */
2780  res = (bytea *) palloc(VARSIZE(v));
2781  memcpy((char *) res, (char *) v, VARSIZE(v));
2782 
2783  /*
2784  * Update the byte.
2785  */
2786  oldByte = ((unsigned char *) VARDATA(res))[byteNo];
2787 
2788  if (newBit == 0)
2789  newByte = oldByte & (~(1 << bitNo));
2790  else
2791  newByte = oldByte | (1 << bitNo);
2792 
2793  ((unsigned char *) VARDATA(res))[byteNo] = newByte;
2794 
2795  PG_RETURN_BYTEA_P(res);
2796 }
2797 
2798 
2799 /* text_name()
2800  * Converts a text type to a Name type.
2801  */
2802 Datum
2804 {
2805  text *s = PG_GETARG_TEXT_PP(0);
2806  Name result;
2807  int len;
2808 
2809  len = VARSIZE_ANY_EXHDR(s);
2810 
2811  /* Truncate oversize input */
2812  if (len >= NAMEDATALEN)
2813  len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
2814 
2815  /* We use palloc0 here to ensure result is zero-padded */
2816  result = (Name) palloc0(NAMEDATALEN);
2817  memcpy(NameStr(*result), VARDATA_ANY(s), len);
2818 
2819  PG_RETURN_NAME(result);
2820 }
2821 
2822 /* name_text()
2823  * Converts a Name type to a text type.
2824  */
2825 Datum
2827 {
2828  Name s = PG_GETARG_NAME(0);
2829 
2831 }
2832 
2833 
2834 /*
2835  * textToQualifiedNameList - convert a text object to list of names
2836  *
2837  * This implements the input parsing needed by nextval() and other
2838  * functions that take a text parameter representing a qualified name.
2839  * We split the name at dots, downcase if not double-quoted, and
2840  * truncate names if they're too long.
2841  */
2842 List *
2844 {
2845  char *rawname;
2846  List *result = NIL;
2847  List *namelist;
2848  ListCell *l;
2849 
2850  /* Convert to C string (handles possible detoasting). */
2851  /* Note we rely on being able to modify rawname below. */
2852  rawname = text_to_cstring(textval);
2853 
2854  if (!SplitIdentifierString(rawname, '.', &namelist))
2855  ereport(ERROR,
2856  (errcode(ERRCODE_INVALID_NAME),
2857  errmsg("invalid name syntax")));
2858 
2859  if (namelist == NIL)
2860  ereport(ERROR,
2861  (errcode(ERRCODE_INVALID_NAME),
2862  errmsg("invalid name syntax")));
2863 
2864  foreach(l, namelist)
2865  {
2866  char *curname = (char *) lfirst(l);
2867 
2868  result = lappend(result, makeString(pstrdup(curname)));
2869  }
2870 
2871  pfree(rawname);
2872  list_free(namelist);
2873 
2874  return result;
2875 }
2876 
2877 /*
2878  * SplitIdentifierString --- parse a string containing identifiers
2879  *
2880  * This is the guts of textToQualifiedNameList, and is exported for use in
2881  * other situations such as parsing GUC variables. In the GUC case, it's
2882  * important to avoid memory leaks, so the API is designed to minimize the
2883  * amount of stuff that needs to be allocated and freed.
2884  *
  * Parsing rules:
  * - A name that begins with a double quote runs to the matching closing
  *   quote; each doubled quote inside it collapses to a single quote and
  *   the name is not downcased.
  * - Any other name runs to the next separator, whitespace character, or
  *   end of string, and is overwritten in place with the result of
  *   downcase_truncate_identifier().
  * - Every name is finally passed through truncate_identifier().
  *
2885  * Inputs:
2886  * rawstring: the input string; must be overwritable! On return, it's
2887  * been modified to contain the separated identifiers.
2888  * separator: the separator punctuation expected between identifiers
2889  * (typically '.' or ','). Whitespace may also appear around
2890  * identifiers.
2891  * Outputs:
2892  * namelist: filled with a palloc'd list of pointers to identifiers within
2893  * rawstring. Caller should list_free() this even on error return.
2894  *
2895  * Returns TRUE if okay, FALSE if there is a syntax error in the string.
2896  *
2897  * Note that an empty string is considered okay here, though not in
2898  * textToQualifiedNameList.
2899  */
2900 bool
2901 SplitIdentifierString(char *rawstring, char separator,
2902  List **namelist)
2903 {
2904  char *nextp = rawstring; /* scan cursor into rawstring */
2905  bool done = false; /* set once the terminating NUL is reached */
2906 
2907  *namelist = NIL; /* so *namelist is defined on every return path */
2908 
2909  while (isspace((unsigned char) *nextp))
2910  nextp++; /* skip leading whitespace */
2911 
2912  if (*nextp == '\0')
2913  return true; /* allow empty string */
2914 
2915  /* At the top of the loop, we are at start of a new identifier. */
2916  do
2917  {
2918  char *curname; /* first character of the current name */
2919  char *endp; /* position that will receive the name's NUL */
2920 
2921  if (*nextp == '\"')
2922  {
2923  /* Quoted name --- collapse quote-quote pairs, no downcasing */
2924  curname = nextp + 1; /* name starts just past the opening quote */
2925  for (;;)
2926  {
2927  endp = strchr(nextp + 1, '\"');
2928  if (endp == NULL)
2929  return false; /* mismatched quotes */
2930  if (endp[1] != '\"')
2931  break; /* found end of quoted name */
2932  /*
  * Collapse adjacent quotes into one quote, and look again.
  * Copying strlen(endp) bytes from endp + 1 takes in the rest
  * of the string plus its NUL, so the whole tail shifts left
  * by one byte.
  */
2933  memmove(endp, endp + 1, strlen(endp));
2934  nextp = endp; /* next strchr() starts after the quote that was kept */
2935  }
2936  /* endp now points at the terminating quote */
2937  nextp = endp + 1;
2938  }
2939  else
2940  {
2941  /* Unquoted name --- extends to separator or whitespace */
2942  char *downname;
2943  int len;
2944 
2945  curname = nextp;
2946  while (*nextp && *nextp != separator &&
2947  !isspace((unsigned char) *nextp))
2948  nextp++;
2949  endp = nextp;
2950  if (curname == nextp)
2951  return false; /* empty unquoted name not allowed */
2952 
2953  /*
2954  * Downcase the identifier, using same code as main lexer does.
2955  *
2956  * XXX because we want to overwrite the input in-place, we cannot
2957  * support a downcasing transformation that increases the string
2958  * length. This is not a problem given the current implementation
2959  * of downcase_truncate_identifier, but we'll probably have to do
2960  * something about this someday.
2961  */
2962  len = endp - curname;
2963  downname = downcase_truncate_identifier(curname, len, false);
2964  Assert(strlen(downname) <= len);
2965  strncpy(curname, downname, len); /* strncpy is required here: it writes exactly len bytes (NUL-padding a shorter downname), leaving *endp intact for the separator test below */
2966  pfree(downname);
2967  }
2968 
2969  while (isspace((unsigned char) *nextp))
2970  nextp++; /* skip trailing whitespace */
2971 
2972  if (*nextp == separator)
2973  {
2974  nextp++;
2975  while (isspace((unsigned char) *nextp))
2976  nextp++; /* skip leading whitespace for next */
2977  /* we expect another name, so done remains false */
2978  }
2979  else if (*nextp == '\0')
2980  done = true;
2981  else
2982  return false; /* invalid syntax */
2983 
2984  /*
  * Now safe to overwrite separator with a null. This is deferred
  * until after the test above because endp may be the very character
  * that was just examined. For a quoted name, endp is the closing
  * quote instead.
  */
2985  *endp = '\0';
2986 
2987  /* Truncate name if it's overlength */
2988  truncate_identifier(curname, strlen(curname), false);
2989 
2990  /*
2991  * Finished isolating current name --- add it to list
2992  */
2993  *namelist = lappend(*namelist, curname); /* stores the pointer into rawstring; no copy is made */
2994 
2995  /* Loop back if we didn't reach end of string */
2996  } while (!done);
2997 
2998  return true;
2999 }
3000 
3001 
3002 /*
3003  * SplitDirectoriesString --- parse a string containing directory names
3004  *
3005  * This is similar to SplitIdentifierString, except that the parsing
3006  * rules are meant to handle pathnames instead of identifiers: there is
3007  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3008  * and we apply canonicalize_path() to each extracted string. Because of the
3009  * last, the returned strings are separately palloc'd rather than being
3010  * pointers into rawstring --- but we still scribble on rawstring.
3011  *
  * Parsing rules:
  * - A name that begins with a double quote runs to the matching closing
  *   quote; each doubled quote inside it collapses to a single quote.
  * - Any other name runs to the next separator or end of string; whitespace
  *   inside it is kept, whitespace after its last non-whitespace character
  *   is dropped.
  * - A name of MAXPGPATH bytes or more is cut to MAXPGPATH-1 bytes before it
  *   is copied with pstrdup() and the copy is canonicalized.
  *
3012  * Inputs:
3013  * rawstring: the input string; must be modifiable!
3014  * separator: the separator punctuation expected between directories
3015  * (typically ',' or ';'). Whitespace may also appear around
3016  * directories.
3017  * Outputs:
3018  * namelist: filled with a palloc'd list of directory names.
3019  * Caller should list_free_deep() this even on error return.
3020  *
3021  * Returns TRUE if okay, FALSE if there is a syntax error in the string.
3022  *
3023  * Note that an empty string is considered okay here.
3024  */
3025 bool
3026 SplitDirectoriesString(char *rawstring, char separator,
3027  List **namelist)
3028 {
3029  char *nextp = rawstring; /* scan cursor into rawstring */
3030  bool done = false; /* set once the terminating NUL is reached */
3031 
3032  *namelist = NIL; /* so *namelist is defined on every return path */
3033 
3034  while (isspace((unsigned char) *nextp))
3035  nextp++; /* skip leading whitespace */
3036 
3037  if (*nextp == '\0')
3038  return true; /* allow empty string */
3039 
3040  /* At the top of the loop, we are at start of a new directory. */
3041  do
3042  {
3043  char *curname; /* first character of the current name */
3044  char *endp; /* position that will receive the name's NUL */
3045 
3046  if (*nextp == '\"')
3047  {
3048  /* Quoted name --- collapse quote-quote pairs */
3049  curname = nextp + 1; /* name starts just past the opening quote */
3050  for (;;)
3051  {
3052  endp = strchr(nextp + 1, '\"');
3053  if (endp == NULL)
3054  return false; /* mismatched quotes */
3055  if (endp[1] != '\"')
3056  break; /* found end of quoted name */
3057  /*
  * Collapse adjacent quotes into one quote, and look again.
  * Copying strlen(endp) bytes from endp + 1 takes in the rest
  * of the string plus its NUL, so the whole tail shifts left
  * by one byte.
  */
3058  memmove(endp, endp + 1, strlen(endp));
3059  nextp = endp; /* next strchr() starts after the quote that was kept */
3060  }
3061  /* endp now points at the terminating quote */
3062  nextp = endp + 1;
3063  }
3064  else
3065  {
3066  /* Unquoted name --- extends to separator or end of string */
3067  curname = endp = nextp;
3068  while (*nextp && *nextp != separator)
3069  {
3070  /*
  * trailing whitespace should not be included in name: endp only
  * advances past non-whitespace characters, so it ends up one past
  * the last non-whitespace character seen
  */
3071  if (!isspace((unsigned char) *nextp))
3072  endp = nextp + 1;
3073  nextp++;
3074  }
3075  if (curname == endp)
3076  return false; /* empty unquoted name not allowed */
3077  }
3078 
3079  while (isspace((unsigned char) *nextp))
3080  nextp++; /* skip trailing whitespace */
3081 
3082  if (*nextp == separator)
3083  {
3084  nextp++;
3085  while (isspace((unsigned char) *nextp))
3086  nextp++; /* skip leading whitespace for next */
3087  /* we expect another name, so done remains false */
3088  }
3089  else if (*nextp == '\0')
3090  done = true;
3091  else
3092  return false; /* invalid syntax */
3093 
3094  /*
  * Now safe to overwrite separator with a null. This is deferred
  * until after the test above because endp may be the very character
  * that was just examined. For a quoted name, endp is the closing
  * quote instead.
  */
3095  *endp = '\0';
3096 
3097  /* Truncate path if it's overlength */
3098  if (strlen(curname) >= MAXPGPATH)
3099  curname[MAXPGPATH - 1] = '\0';
3100 
3101  /*
3102  * Finished isolating current name --- add it to list
3103  */
3104  curname = pstrdup(curname); /* from here on curname is a separate copy, not a pointer into rawstring */
3105  canonicalize_path(curname);
3106  *namelist = lappend(*namelist, curname);
3107 
3108  /* Loop back if we didn't reach end of string */
3109  } while (!done);
3110 
3111  return true;
3112 }
3113 
3114 
3115 /*****************************************************************************
3116  * Comparison Functions used for bytea
3117  *
3118  * Note: btree indexes need these routines not to leak memory; therefore,
3119  * be careful to free working copies of toasted datums. Most places don't
3120  * need to be so careful.
3121  *****************************************************************************/
3122 
3123 Datum
3125 {
3126  Datum arg1 = PG_GETARG_DATUM(0);
3127  Datum arg2 = PG_GETARG_DATUM(1);
3128  bool result;
3129  Size len1,
3130  len2;
3131 
3132  /*
3133  * We can use a fast path for unequal lengths, which might save us from
3134  * having to detoast one or both values.
3135  */
3136  len1 = toast_raw_datum_size(arg1);
3137  len2 = toast_raw_datum_size(arg2);
3138  if (len1 != len2)
3139  result = false;
3140  else
3141  {
3142  bytea *barg1 = DatumGetByteaPP(arg1);
3143  bytea *barg2 = DatumGetByteaPP(arg2);
3144 
3145  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3146  len1 - VARHDRSZ) == 0);
3147 
3148  PG_FREE_IF_COPY(barg1, 0);
3149  PG_FREE_IF_COPY(barg2, 1);
3150  }
3151 
3152  PG_RETURN_BOOL(result);
3153 }
3154 
3155 Datum
3157 {
3158  Datum arg1 = PG_GETARG_DATUM(0);
3159  Datum arg2 = PG_GETARG_DATUM(1);
3160  bool result;
3161  Size len1,
3162  len2;
3163 
3164  /*
3165  * We can use a fast path for unequal lengths, which might save us from
3166  * having to detoast one or both values.
3167  */
3168  len1 = toast_raw_datum_size(arg1);
3169  len2 = toast_raw_datum_size(arg2);
3170  if (len1 != len2)
3171  result = true;
3172  else
3173  {
3174  bytea *barg1 = DatumGetByteaPP(arg1);
3175  bytea *barg2 = DatumGetByteaPP(arg2);
3176 
3177  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3178  len1 - VARHDRSZ) != 0);
3179 
3180  PG_FREE_IF_COPY(barg1, 0);
3181  PG_FREE_IF_COPY(barg2, 1);
3182  }
3183 
3184  PG_RETURN_BOOL(result);
3185 }
3186 
3187 Datum
3189 {
3190  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3191  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3192  int len1,
3193  len2;
3194  int cmp;
3195 
3196  len1 = VARSIZE_ANY_EXHDR(arg1);
3197  len2 = VARSIZE_ANY_EXHDR(arg2);
3198 
3199  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3200 
3201  PG_FREE_IF_COPY(arg1, 0);
3202  PG_FREE_IF_COPY(arg2, 1);
3203 
3204  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3205 }
3206 
3207 Datum
3209 {
3210  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3211  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3212  int len1,
3213  len2;
3214  int cmp;
3215 
3216  len1 = VARSIZE_ANY_EXHDR(arg1);
3217  len2 = VARSIZE_ANY_EXHDR(arg2);
3218 
3219  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3220 
3221  PG_FREE_IF_COPY(arg1, 0);
3222  PG_FREE_IF_COPY(arg2, 1);
3223 
3224  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3225 }
3226 
3227 Datum
3229 {
3230  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3231  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3232  int len1,
3233  len2;
3234  int cmp;
3235 
3236  len1 = VARSIZE_ANY_EXHDR(arg1);
3237  len2 = VARSIZE_ANY_EXHDR(arg2);
3238 
3239  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3240 
3241  PG_FREE_IF_COPY(arg1, 0);
3242  PG_FREE_IF_COPY(arg2, 1);
3243 
3244  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3245 }
3246 
3247 Datum
3249 {
3250  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3251  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3252  int len1,
3253  len2;
3254  int cmp;
3255 
3256  len1 = VARSIZE_ANY_EXHDR(arg1);
3257  len2 = VARSIZE_ANY_EXHDR(arg2);
3258 
3259  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3260 
3261  PG_FREE_IF_COPY(arg1, 0);
3262  PG_FREE_IF_COPY(arg2, 1);
3263 
3264  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3265 }
3266 
3267 Datum
3269 {
3270  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3271  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3272  int len1,
3273  len2;
3274  int cmp;
3275 
3276  len1 = VARSIZE_ANY_EXHDR(arg1);
3277  len2 = VARSIZE_ANY_EXHDR(arg2);
3278 
3279  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3280  if ((cmp == 0) && (len1 != len2))
3281  cmp = (len1 < len2) ? -1 : 1;
3282 
3283  PG_FREE_IF_COPY(arg1, 0);
3284  PG_FREE_IF_COPY(arg2, 1);
3285 
3286  PG_RETURN_INT32(cmp);
3287 }
3288 
3289 /*
3290  * appendStringInfoText
3291  *
3292  * Append a text to str.
3293  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3294  */
3295 static void
3297 {
3299 }
3300 
3301 /*
3302  * replace_text
3303  * replace all occurrences of 'old_sub_str' in 'orig_str'
3304  * with 'new_sub_str' to form 'new_str'
3305  *
3306  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
3307  * otherwise returns 'new_str'
3308  */
3309 Datum
3311 {
3312  text *src_text = PG_GETARG_TEXT_PP(0);
3313  text *from_sub_text = PG_GETARG_TEXT_PP(1);
3314  text *to_sub_text = PG_GETARG_TEXT_PP(2);
3315  int src_text_len;
3316  int from_sub_text_len;
3318  text *ret_text;
3319  int start_posn;
3320  int curr_posn;
3321  int chunk_len;
3322  char *start_ptr;
3323  StringInfoData str;
3324 
3325  text_position_setup(src_text, from_sub_text, &state);
3326 
3327  /*
3328  * Note: we check the converted string length, not the original, because
3329  * they could be different if the input contained invalid encoding.
3330  */
3331  src_text_len = state.len1;
3332  from_sub_text_len = state.len2;
3333 
3334  /* Return unmodified source string if empty source or pattern */
3335  if (src_text_len < 1 || from_sub_text_len < 1)
3336  {
3337  text_position_cleanup(&state);
3338  PG_RETURN_TEXT_P(src_text);
3339  }
3340 
3341  start_posn = 1;
3342  curr_posn = text_position_next(1, &state);
3343 
3344  /* When the from_sub_text is not found, there is nothing to do. */
3345  if (curr_posn == 0)
3346  {
3347  text_position_cleanup(&state);
3348  PG_RETURN_TEXT_P(src_text);
3349  }
3350 
3351  /* start_ptr points to the start_posn'th character of src_text */
3352  start_ptr = VARDATA_ANY(src_text);
3353 
3354  initStringInfo(&str);
3355 
3356  do
3357  {
3359 
3360  /* copy the data skipped over by last text_position_next() */
3361  chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn);
3362  appendBinaryStringInfo(&str, start_ptr, chunk_len);
3363 
3364  appendStringInfoText(&str, to_sub_text);
3365 
3366  start_posn = curr_posn;
3367  start_ptr += chunk_len;
3368  start_posn += from_sub_text_len;
3369  start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len);
3370 
3371  curr_posn = text_position_next(start_posn, &state);
3372  }
3373  while (curr_posn > 0);
3374 
3375  /* copy trailing data */
3376  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
3377  appendBinaryStringInfo(&str, start_ptr, chunk_len);
3378 
3379  text_position_cleanup(&state);
3380 
3381  ret_text = cstring_to_text_with_len(str.data, str.len);
3382  pfree(str.data);
3383 
3384  PG_RETURN_TEXT_P(ret_text);
3385 }
3386 
3387 /*
3388  * check_replace_text_has_escape_char
3389  *
3390  * check whether replace_text contains escape char.
3391  */
3392 static bool
3394 {
3395  const char *p = VARDATA_ANY(replace_text);
3396  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3397 
3399  {
3400  for (; p < p_end; p++)
3401  {
3402  if (*p == '\\')
3403  return true;
3404  }
3405  }
3406  else
3407  {
3408  for (; p < p_end; p += pg_mblen(p))
3409  {
3410  if (*p == '\\')
3411  return true;
3412  }
3413  }
3414 
3415  return false;
3416 }
3417 
3418 /*
3419  * appendStringInfoRegexpSubstr
3420  *
3421  * Append replace_text to str, substituting regexp back references for
3422  * \n escapes. start_ptr is the start of the match in the source string,
3423  * at logical character position data_pos.
3424  */
3425 static void
3427  regmatch_t *pmatch,
3428  char *start_ptr, int data_pos)
3429 {
3430  const char *p = VARDATA_ANY(replace_text);
3431  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3432  int eml = pg_database_encoding_max_length();
3433 
3434  for (;;)
3435  {
3436  const char *chunk_start = p;
3437  int so;
3438  int eo;
3439 
3440  /* Find next escape char. */
3441  if (eml == 1)
3442  {
3443  for (; p < p_end && *p != '\\'; p++)
3444  /* nothing */ ;
3445  }
3446  else
3447  {
3448  for (; p < p_end && *p != '\\'; p += pg_mblen(p))
3449  /* nothing */ ;
3450  }
3451 
3452  /* Copy the text we just scanned over, if any. */
3453  if (p > chunk_start)
3454  appendBinaryStringInfo(str, chunk_start, p - chunk_start);
3455 
3456  /* Done if at end of string, else advance over escape char. */
3457  if (p >= p_end)
3458  break;
3459  p++;
3460 
3461  if (p >= p_end)
3462  {
3463  /* Escape at very end of input. Treat same as unexpected char */
3464  appendStringInfoChar(str, '\\');
3465  break;
3466  }
3467 
3468  if (*p >= '1' && *p <= '9')
3469  {
3470  /* Use the back reference of regexp. */
3471  int idx = *p - '0';
3472 
3473  so = pmatch[idx].rm_so;
3474  eo = pmatch[idx].rm_eo;
3475  p++;
3476  }
3477  else if (*p == '&')
3478  {
3479  /* Use the entire matched string. */
3480  so = pmatch[0].rm_so;
3481  eo = pmatch[0].rm_eo;
3482  p++;
3483  }
3484  else if (*p == '\\')
3485  {
3486  /* \\ means transfer one \ to output. */
3487  appendStringInfoChar(str, '\\');
3488  p++;
3489  continue;
3490  }
3491  else
3492  {
3493  /*
3494  * If escape char is not followed by any expected char, just treat
3495  * it as ordinary data to copy. (XXX would it be better to throw
3496  * an error?)
3497  */
3498  appendStringInfoChar(str, '\\');
3499  continue;
3500  }
3501 
3502  if (so != -1 && eo != -1)
3503  {
3504  /*
3505  * Copy the text that is back reference of regexp. Note so and eo
3506  * are counted in characters not bytes.
3507  */
3508  char *chunk_start;
3509  int chunk_len;
3510 
3511  Assert(so >= data_pos);
3512  chunk_start = start_ptr;
3513  chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
3514  chunk_len = charlen_to_bytelen(chunk_start, eo - so);
3515  appendBinaryStringInfo(str, chunk_start, chunk_len);
3516  }
3517  }
3518 }
3519 
/*
 * NOTE(review): presumably the number of match slots handed to the regex
 * executor (slot 0 for the whole match plus one per single-digit back
 * reference); the declaration that uses this constant is not visible in
 * this listing -- confirm against the full source.
 */
3520 #define REGEXP_REPLACE_BACKREF_CNT 10
3521 
3522 /*
3523  * replace_text_regexp
3524  *
3525  * replace text that matches to regexp in src_text to replace_text.
3526  *
3527  * Note: to avoid having to include regex.h in builtins.h, we declare
3528  * the regexp argument as void *, but really it's regex_t *.
      *
      * src_text is the string to search, regexp the compiled pattern and
      * replace_text the replacement.  When the replacement contains escape
      * characters it is expanded through appendStringInfoRegexpSubstr,
      * otherwise it is appended verbatim.  glob selects replacing every match
      * (true) or only the first one (false).
      *
      * Returns a text value built from the accumulated output buffer.  Any
      * pg_regexec result other than REG_OKAY or REG_NOMATCH is reported with
      * ereport(ERROR) using ERRCODE_INVALID_REGULAR_EXPRESSION.
      *
      * NOTE(review): "buf" and "pmatch" are used below but their declarations
      * are among the lines missing from this listing; presumably a
      * StringInfoData and an array of regmatch_t -- confirm.
3529  */
3530 text *
3531 replace_text_regexp(text *src_text, void *regexp,
3532  text *replace_text, bool glob)
3533 {
3534  text *ret_text;
3535  regex_t *re = (regex_t *) regexp;
3536  int src_text_len = VARSIZE_ANY_EXHDR(src_text);
3539  pg_wchar *data;
3540  size_t data_len;
3541  int search_start;
3542  int data_pos;
3543  char *start_ptr;
3544  bool have_escape;
3545 
3546  initStringInfo(&buf);
3547 
3548  /* Convert data string to wide characters. */
3549  data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
3550  data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
3551 
3552  /* Check whether replace_text has escape char. */
3553  have_escape = check_replace_text_has_escape_char(replace_text);
3554 
3555  /* start_ptr points to the data_pos'th character of src_text */
3556  start_ptr = (char *) VARDATA_ANY(src_text);
3557  data_pos = 0;
3558 
3559  search_start = 0;
      /*
       * Note the "<=": one more search is attempted with search_start equal
       * to data_len, i.e. positioned at the very end of the data.
       */
3560  while (search_start <= data_len)
3561  {
3562  int regexec_result;
3563 
3565 
3566  regexec_result = pg_regexec(re,
3567  data,
3568  data_len,
3569  search_start,
3570  NULL, /* no details */
3572  pmatch,
3573  0);
3574 
3575  if (regexec_result == REG_NOMATCH)
3576  break;
3577 
3578  if (regexec_result != REG_OKAY)
3579  {
3580  char errMsg[100];
3581 
3583  pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
3584  ereport(ERROR,
3585  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
3586  errmsg("regular expression failed: %s", errMsg)));
3587  }
3588 
3589  /*
3590  * Copy the text to the left of the match position. Note we are given
3591  * character not byte indexes.
3592  */
3593  if (pmatch[0].rm_so - data_pos > 0)
3594  {
3595  int chunk_len;
3596 
3597  chunk_len = charlen_to_bytelen(start_ptr,
3598  pmatch[0].rm_so - data_pos);
3599  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
3600 
3601  /*
3602  * Advance start_ptr over that text, to avoid multiple rescans of
3603  * it if the replace_text contains multiple back-references.
3604  */
3605  start_ptr += chunk_len;
3606  data_pos = pmatch[0].rm_so;
3607  }
3608 
3609  /*
3610  * Copy the replace_text. Process back references when the
3611  * replace_text has escape characters.
3612  */
3613  if (have_escape)
3614  appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
3615  start_ptr, data_pos);
3616  else
3617  appendStringInfoText(&buf, replace_text);
3618 
3619  /* Advance start_ptr and data_pos over the matched text. */
3620  start_ptr += charlen_to_bytelen(start_ptr,
3621  pmatch[0].rm_eo - data_pos);
3622  data_pos = pmatch[0].rm_eo;
3623 
3624  /*
3625  * When global option is off, replace the first instance only.
3626  */
3627  if (!glob)
3628  break;
3629 
3630  /*
3631  * Advance search position. Normally we start the next search at the
3632  * end of the previous match; but if the match was of zero length, we
3633  * have to advance by one character, or we'd just find the same match
3634  * again.
3635  */
3636  search_start = data_pos;
3637  if (pmatch[0].rm_so == pmatch[0].rm_eo)
3638  search_start++;
3639  }
3640 
3641  /*
3642  * Copy the text to the right of the last match.
      *
      * The remaining byte count is the distance from start_ptr to the end of
      * the src_text datum.
3643  */
3644  if (data_pos < data_len)
3645  {
3646  int chunk_len;
3647 
3648  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
3649  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
3650  }
3651 
      /* Build the result, then release the work buffers allocated here. */
3652  ret_text = cstring_to_text_with_len(buf.data, buf.len);
3653  pfree(buf.data);
3654  pfree(data);
3655 
3656  return ret_text;
3657 }
3658 
3659 /*
3660  * split_text
3661  * parse input string
3662  * return ord item (1 based)
3663  * based on provided field separator
      *
      * Arguments: 0 = input string, 1 = field separator, 2 = field number
      * (int32, counted from 1).
      *
      * A field number below 1 raises ERRCODE_INVALID_PARAMETER_VALUE.  When the
      * separator is empty or never occurs in the input, field 1 is the whole
      * input string.  When fewer separators exist than the requested field
      * needs, the trailing field is returned if it is the one asked for, and
      * an empty string otherwise.
      *
      * NOTE(review): the line carrying the function name and parameter list,
      * the declaration of "state", and several return statements are missing
      * from this listing; the adjacent comments suggest those returns yield an
      * empty string -- confirm against the full source.
3664  */
3665 Datum
3667 {
3668  text *inputstring = PG_GETARG_TEXT_PP(0);
3669  text *fldsep = PG_GETARG_TEXT_PP(1);
3670  int fldnum = PG_GETARG_INT32(2);
3671  int inputstring_len;
3672  int fldsep_len;
3674  int start_posn;
3675  int end_posn;
3676  text *result_text;
3677 
3678  /* field number is 1 based */
3679  if (fldnum < 1)
3680  ereport(ERROR,
3681  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3682  errmsg("field position must be greater than zero")));
3683 
3684  text_position_setup(inputstring, fldsep, &state);
3685 
3686  /*
3687  * Note: we check the converted string length, not the original, because
3688  * they could be different if the input contained invalid encoding.
3689  */
3690  inputstring_len = state.len1;
3691  fldsep_len = state.len2;
3692 
3693  /* return empty string for empty input string */
3694  if (inputstring_len < 1)
3695  {
3696  text_position_cleanup(&state);
3698  }
3699 
3700  /* empty field separator */
3701  if (fldsep_len < 1)
3702  {
3703  text_position_cleanup(&state);
3704  /* if first field, return input string, else empty string */
3705  if (fldnum == 1)
3706  PG_RETURN_TEXT_P(inputstring);
3707  else
3709  }
3710 
3711  /* identify bounds of first field */
3712  start_posn = 1;
3713  end_posn = text_position_next(1, &state);
3714 
3715  /* special case if fldsep not found at all */
3716  if (end_posn == 0)
3717  {
3718  text_position_cleanup(&state);
3719  /* if field 1 requested, return input string, else empty string */
3720  if (fldnum == 1)
3721  PG_RETURN_TEXT_P(inputstring);
3722  else
3724  }
3725 
      /*
       * Step from separator to separator.  fldnum is decremented once per
       * step, so it reaches zero exactly when the requested field has been
       * bracketed by start_posn and end_posn.
       */
3726  while (end_posn > 0 && --fldnum > 0)
3727  {
3728  /* identify bounds of next field */
3729  start_posn = end_posn + fldsep_len;
3730  end_posn = text_position_next(start_posn, &state);
3731  }
3732 
3733  text_position_cleanup(&state);
3734 
3735  if (fldnum > 0)
3736  {
3737  /* N'th field separator not found */
3738  /* if last field requested, return it, else empty string */
3739  if (fldnum == 1)
3740  result_text = text_substring(PointerGetDatum(inputstring),
3741  start_posn,
3742  -1,
3743  true);
3744  else
3745  result_text = cstring_to_text("");
3746  }
3747  else
3748  {
3749  /* non-last field requested */
3750  result_text = text_substring(PointerGetDatum(inputstring),
3751  start_posn,
3752  end_posn - start_posn,
3753  false);
3754  }
3755 
3756  PG_RETURN_TEXT_P(result_text);
3757 }
3758 
3759 /*
3760  * Convenience function to return true when two text params are equal.
      *
      * NOTE(review): the line that begins the return expression is missing
      * from this listing; only its two PointerGetDatum arguments are visible,
      * so the comparison routine actually invoked cannot be confirmed here.
3761  */
3762 static bool
3763 text_isequal(text *txt1, text *txt2)
3764 {
3766  PointerGetDatum(txt1),
3767  PointerGetDatum(txt2)));
3768 }
3769 
3770 /*
3771  * text_to_array
3772  * parse input string and return text array of elements,
3773  * based on provided field separator
      *
      * Thin entry point: fcinfo is handed unchanged to
      * text_to_array_internal, which does all of the work.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * is missing from this listing.
3774  */
3775 Datum
3777 {
3778  return text_to_array_internal(fcinfo);
3779 }
3780 
3781 /*
3782  * text_to_array_null
3783  * parse input string and return text array of elements,
3784  * based on provided field separator and null string
3785  *
3786  * This is a separate entry point only to prevent the regression tests from
3787  * complaining about different argument sets for the same internal function.
      *
      * The body is the same single delegation to text_to_array_internal as in
      * the entry point without a null string.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * is missing from this listing.
3788  */
3789 Datum
3791 {
3792  return text_to_array_internal(fcinfo);
3793 }
3794 
3795 /*
3796  * common code for text_to_array and text_to_array_null functions
3797  *
3798  * These are not strict so we have to test for null inputs explicitly.
      *
      * Argument 0 is the input string (NULL yields a NULL result), argument 1
      * the field separator (NULL means split into individual characters), and
      * the optional argument 2 a null string: any field equal to it is stored
      * as a NULL array element.
      *
      * Fields are accumulated one at a time with accumArrayResult; each
      * temporary text datum is freed right after it has been stashed.
      *
      * NOTE(review): the line carrying the function name and parameter list,
      * the declaration of "state", the last argument of both accumArrayResult
      * calls and every array-returning statement are missing from this
      * listing -- confirm against the full source.
3799  */
3800 static Datum
3802 {
3803  text *inputstring;
3804  text *fldsep;
3805  text *null_string;
3806  int inputstring_len;
3807  int fldsep_len;
3808  char *start_ptr;
3809  text *result_text;
3810  bool is_null;
3811  ArrayBuildState *astate = NULL;
3812 
3813  /* when input string is NULL, then result is NULL too */
3814  if (PG_ARGISNULL(0))
3815  PG_RETURN_NULL();
3816 
3817  inputstring = PG_GETARG_TEXT_PP(0);
3818 
3819  /* fldsep can be NULL */
3820  if (!PG_ARGISNULL(1))
3821  fldsep = PG_GETARG_TEXT_PP(1);
3822  else
3823  fldsep = NULL;
3824 
3825  /* null_string can be NULL or omitted */
3826  if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
3827  null_string = PG_GETARG_TEXT_PP(2);
3828  else
3829  null_string = NULL;
3830 
3831  if (fldsep != NULL)
3832  {
3833  /*
3834  * Normal case with non-null fldsep. Use the text_position machinery
3835  * to search for occurrences of fldsep.
3836  */
3838  int fldnum;
3839  int start_posn;
3840  int end_posn;
3841  int chunk_len;
3842 
3843  text_position_setup(inputstring, fldsep, &state);
3844 
3845  /*
3846  * Note: we check the converted string length, not the original,
3847  * because they could be different if the input contained invalid
3848  * encoding.
3849  */
3850  inputstring_len = state.len1;
3851  fldsep_len = state.len2;
3852 
3853  /* return empty array for empty input string */
3854  if (inputstring_len < 1)
3855  {
3856  text_position_cleanup(&state);
3858  }
3859 
3860  /*
3861  * empty field separator: return the input string as a one-element
3862  * array
3863  */
3864  if (fldsep_len < 1)
3865  {
3866  text_position_cleanup(&state);
3867  /* single element can be a NULL too */
3868  is_null = null_string ? text_isequal(inputstring, null_string) : false;
3870  PointerGetDatum(inputstring),
3871  is_null, 1));
3872  }
3873 
3874  start_posn = 1;
3875  /* start_ptr points to the start_posn'th character of inputstring */
3876  start_ptr = VARDATA_ANY(inputstring);
3877 
3878  for (fldnum = 1;; fldnum++) /* field number is 1 based */
3879  {
3881 
3882  end_posn = text_position_next(start_posn, &state);
3883 
3884  if (end_posn == 0)
3885  {
3886  /* fetch last field */
3887  chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
3888  }
3889  else
3890  {
3891  /* fetch non-last field */
3892  chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
3893  }
3894 
3895  /* must build a temp text datum to pass to accumArrayResult */
3896  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
3897  is_null = null_string ? text_isequal(result_text, null_string) : false;
3898 
3899  /* stash away this field */
3900  astate = accumArrayResult(astate,
3901  PointerGetDatum(result_text),
3902  is_null,
3903  TEXTOID,
3905 
3906  pfree(result_text);
3907 
3908  if (end_posn == 0)
3909  break;
3910 
      /*
       * Move the character position and the byte pointer in step: first
       * past the field just emitted, then past the separator after it.
       */
3911  start_posn = end_posn;
3912  start_ptr += chunk_len;
3913  start_posn += fldsep_len;
3914  start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
3915  }
3916 
3917  text_position_cleanup(&state);
3918  }
3919  else
3920  {
3921  /*
3922  * When fldsep is NULL, each character in the inputstring becomes an
3923  * element in the result array. The separator is effectively the
3924  * space between characters.
3925  */
3926  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
3927 
3928  /* return empty array for empty input string */
3929  if (inputstring_len < 1)
3931 
3932  start_ptr = VARDATA_ANY(inputstring);
3933 
      /* Here inputstring_len counts the bytes that remain to be consumed. */
3934  while (inputstring_len > 0)
3935  {
3936  int chunk_len = pg_mblen(start_ptr);
3937 
3939 
3940  /* must build a temp text datum to pass to accumArrayResult */
3941  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
3942  is_null = null_string ? text_isequal(result_text, null_string) : false;
3943 
3944  /* stash away this field */
3945  astate = accumArrayResult(astate,
3946  PointerGetDatum(result_text),
3947  is_null,
3948  TEXTOID,
3950 
3951  pfree(result_text);
3952 
3953  start_ptr += chunk_len;
3954  inputstring_len -= chunk_len;
3955  }
3956  }
3957 
3960 }
3961 
3962 /*
3963  * array_to_text
3964  * concatenate Cstring representation of input array elements
3965  * using provided field separator
      *
      * Argument 1 is the separator, converted to a C string here.  A NULL
      * null_string is passed to array_to_text_internal, which makes it skip
      * null array elements.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * and the declaration of "v" are missing from this listing.
3966  */
3967 Datum
3969 {
3971  char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
3972 
3973  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
3974 }
3975 
3976 /*
3977  * array_to_text_null
3978  * concatenate Cstring representation of input array elements
3979  * using provided field separator and null string
3980  *
3981  * This version is not strict so we have to test for null inputs explicitly.
      *
      * Argument 0 is the array, argument 1 the separator and argument 2 the
      * text to print for null elements.  A NULL array or separator yields a
      * NULL result; a NULL third argument is forwarded as a null pointer.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * is missing from this listing.
3982  */
3983 Datum
3985 {
3986  ArrayType *v;
3987  char *fldsep;
3988  char *null_string;
3989 
3990  /* returns NULL when first or second parameter is NULL */
3991  if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
3992  PG_RETURN_NULL();
3993 
3994  v = PG_GETARG_ARRAYTYPE_P(0);
3995  fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
3996 
3997  /* NULL null string is passed through as a null pointer */
3998  if (!PG_ARGISNULL(2))
3999  null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4000  else
4001  null_string = NULL;
4002 
4003  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4004 }
4005 
4006 /*
4007  * common code for array_to_text and array_to_text_null functions
      *
      * v is the array to print, fldsep the separator placed in front of every
      * printed element except the first, and null_string the text printed for
      * null elements (a NULL pointer means null elements are skipped).
      * fcinfo supplies fn_extra and fn_mcxt, used to cache the element type's
      * output function between calls.
      *
      * Returns the joined text; an array without elements gives an empty
      * string.
      *
      * NOTE(review): the line carrying the function name and its first
      * parameters, and the declaration of "buf", are missing from this
      * listing.
4008  */
4009 static text *
4011  const char *fldsep, const char *null_string)
4012 {
4013  text *result;
4014  int nitems,
4015  *dims,
4016  ndims;
4017  Oid element_type;
4018  int typlen;
4019  bool typbyval;
4020  char typalign;
4022  bool printed = false;
4023  char *p;
4024  bits8 *bitmap;
4025  int bitmask;
4026  int i;
4027  ArrayMetaState *my_extra;
4028 
4029  ndims = ARR_NDIM(v);
4030  dims = ARR_DIMS(v);
4031  nitems = ArrayGetNItems(ndims, dims);
4032 
4033  /* if there are no elements, return an empty string */
4034  if (nitems == 0)
4035  return cstring_to_text_with_len("", 0);
4036 
4037  element_type = ARR_ELEMTYPE(v);
4038  initStringInfo(&buf);
4039 
4040  /*
4041  * We arrange to look up info about element type, including its output
4042  * conversion proc, only once per series of calls, assuming the element
4043  * type doesn't change underneath us.
4044  */
4045  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4046  if (my_extra == NULL)
4047  {
4048  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4049  sizeof(ArrayMetaState));
4050  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
      /*
       * Seed the cache with the complement of the real type, so that the
       * comparison just below cannot match and the lookup always runs for
       * a freshly allocated cache.
       */
4051  my_extra->element_type = ~element_type;
4052  }
4053 
4054  if (my_extra->element_type != element_type)
4055  {
4056  /*
4057  * Get info about element type, including its output conversion proc
4058  */
4059  get_type_io_data(element_type, IOFunc_output,
4060  &my_extra->typlen, &my_extra->typbyval,
4061  &my_extra->typalign, &my_extra->typdelim,
4062  &my_extra->typioparam, &my_extra->typiofunc);
4063  fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4064  fcinfo->flinfo->fn_mcxt);
4065  my_extra->element_type = element_type;
4066  }
4067  typlen = my_extra->typlen;
4068  typbyval = my_extra->typbyval;
4069  typalign = my_extra->typalign;
4070 
4071  p = ARR_DATA_PTR(v);
4072  bitmap = ARR_NULLBITMAP(v);
4073  bitmask = 1;
4074 
4075  for (i = 0; i < nitems; i++)
4076  {
4077  Datum itemvalue;
4078  char *value;
4079 
4080  /* Get source element, checking for NULL */
4081  if (bitmap && (*bitmap & bitmask) == 0)
4082  {
4083  /* if null_string is NULL, we just ignore null elements */
4084  if (null_string != NULL)
4085  {
4086  if (printed)
4087  appendStringInfo(&buf, "%s%s", fldsep, null_string);
4088  else
4089  appendStringInfoString(&buf, null_string);
4090  printed = true;
4091  }
4092  }
4093  else
4094  {
4095  itemvalue = fetch_att(p, typbyval, typlen);
4096 
4097  value = OutputFunctionCall(&my_extra->proc, itemvalue);
4098 
4099  if (printed)
4100  appendStringInfo(&buf, "%s%s", fldsep, value);
4101  else
4102  appendStringInfoString(&buf, value);
4103  printed = true;
4104 
      /*
       * Step the data pointer past this element.  This happens only in
       * the non-null branch; the null branch leaves p untouched.
       */
4105  p = att_addlength_pointer(p, typlen, p);
4106  p = (char *) att_align_nominal(p, typalign);
4107  }
4108 
4109  /* advance bitmap pointer if any */
4110  if (bitmap)
4111  {
      /* One bit per element; after eight bits move on to the next byte. */
4112  bitmask <<= 1;
4113  if (bitmask == 0x100)
4114  {
4115  bitmap++;
4116  bitmask = 1;
4117  }
4118  }
4119  }
4120 
4121  result = cstring_to_text_with_len(buf.data, buf.len);
4122  pfree(buf.data);
4123 
4124  return result;
4125 }
4126 
/* Radix used by the digit-extraction loop of the hex conversion below. */
4127 #define HEXBASE 16
4128 /*
4129  * Convert a int32 to a string containing a base 16 (hex) representation of
4130  * the number.
      *
      * Digits are produced least significant first and written backwards from
      * the end of a local buffer, so ptr ends up at the first digit.  The
      * do/while form emits at least one digit, so a zero value still produces
      * output.
      *
      * NOTE(review): the line carrying the function name and parameter list,
      * the declaration of "value" and the return statement are missing from
      * this listing.
4131  */
4132 Datum
4134 {
4136  char *ptr;
4137  const char *digits = "0123456789abcdef";
4138  char buf[32]; /* bigger than needed, but reasonable */
4139 
4140  ptr = buf + sizeof(buf) - 1;
4141  *ptr = '\0';
4142 
4143  do
4144  {
4145  *--ptr = digits[value % HEXBASE];
4146  value /= HEXBASE;
4147  } while (ptr > buf && value);
4148 
4150 }
4151 
4152 /*
4153  * Convert a int64 to a string containing a base 16 (hex) representation of
4154  * the number.
      *
      * The argument is cast to uint64 first, so the modulo and division in the
      * loop operate on an unsigned value.  Digits are written backwards from
      * the end of a local buffer, and at least one digit is always emitted.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * and the return statement are missing from this listing.
4155  */
4156 Datum
4158 {
4159  uint64 value = (uint64) PG_GETARG_INT64(0);
4160  char *ptr;
4161  const char *digits = "0123456789abcdef";
4162  char buf[32]; /* bigger than needed, but reasonable */
4163 
4164  ptr = buf + sizeof(buf) - 1;
4165  *ptr = '\0';
4166 
4167  do
4168  {
4169  *--ptr = digits[value % HEXBASE];
4170  value /= HEXBASE;
4171  } while (ptr > buf && value);
4172 
4174 }
4175 
4176 /*
4177  * Create an md5 hash of a text string and return it as hex
4178  *
4179  * md5 produces a 16 byte (128 bit) hash; double it for hex
      *
      * The hex digest is written into a local buffer of MD5_HASH_LEN + 1
      * bytes.  A false result from pg_md5_hash is reported as an
      * out-of-memory error.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * and the return statement are missing from this listing.
4180  */
4181 #define MD5_HASH_LEN 32
4182 
4183 Datum
4185 {
4186  text *in_text = PG_GETARG_TEXT_PP(0);
4187  size_t len;
4188  char hexsum[MD5_HASH_LEN + 1];
4189 
4190  /* Calculate the length of the buffer using varlena metadata */
4191  len = VARSIZE_ANY_EXHDR(in_text);
4192 
4193  /* get the hash result */
4194  if (pg_md5_hash(VARDATA_ANY(in_text), len, hexsum) == false)
4195  ereport(ERROR,
4196  (errcode(ERRCODE_OUT_OF_MEMORY),
4197  errmsg("out of memory")));
4198 
4199  /* convert to text and return it */
4201 }
4202 
4203 /*
4204  * Create an md5 hash of a bytea field and return it as a hex string:
4205  * 16-byte md5 digest is represented in 32 hex characters.
      *
      * Same procedure as the text variant: hash the payload bytes of the
      * argument into a local hex buffer and report a false result from
      * pg_md5_hash as an out-of-memory error.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * and the return statement are missing from this listing.
4206  */
4207 Datum
4209 {
4210  bytea *in = PG_GETARG_BYTEA_PP(0);
4211  size_t len;
4212  char hexsum[MD5_HASH_LEN + 1];
4213 
4214  len = VARSIZE_ANY_EXHDR(in);
4215  if (pg_md5_hash(VARDATA_ANY(in), len, hexsum) == false)
4216  ereport(ERROR,
4217  (errcode(ERRCODE_OUT_OF_MEMORY),
4218  errmsg("out of memory")));
4219 
4221 }
4222 
4223 /*
4224  * Return the size of a datum, possibly compressed
4225  *
4226  * Works on any data type
      *
      * The typlen of the argument type is looked up on the first call and
      * cached in fn_extra for later calls.  A typlen of -1 (varlena) is sized
      * with toast_datum_size, -2 (cstring) as the string length plus one for
      * the terminator, and any other value is reported as the size itself.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * and the declaration of "value" are missing from this listing.
4227  */
4228 Datum
4230 {
4232  int32 result;
4233  int typlen;
4234 
4235  /* On first call, get the input type's typlen, and save at *fn_extra */
4236  if (fcinfo->flinfo->fn_extra == NULL)
4237  {
4238  /* Lookup the datatype of the supplied argument */
4239  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
4240 
4241  typlen = get_typlen(argtypeid);
4242  if (typlen == 0) /* should not happen */
4243  elog(ERROR, "cache lookup failed for type %u", argtypeid);
4244 
4245  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4246  sizeof(int));
4247  *((int *) fcinfo->flinfo->fn_extra) = typlen;
4248  }
4249  else
4250  typlen = *((int *) fcinfo->flinfo->fn_extra);
4251 
4252  if (typlen == -1)
4253  {
4254  /* varlena type, possibly toasted */
4255  result = toast_datum_size(value);
4256  }
4257  else if (typlen == -2)
4258  {
4259  /* cstring */
4260  result = strlen(DatumGetCString(value)) + 1;
4261  }
4262  else
4263  {
4264  /* ordinary fixed-width type */
4265  result = typlen;
4266  }
4267 
4268  PG_RETURN_INT32(result);
4269 }
4270 
4271 /*
4272  * string_agg - Concatenates values and returns string.
4273  *
4274  * Syntax: string_agg(value text, delimiter text) RETURNS text
4275  *
4276  * Note: Any NULL values are ignored. The first-call delimiter isn't
4277  * actually used at all, and on subsequent calls the delimiter precedes
4278  * the associated value.
4279  */
4280 
4281 /* subroutine to initialize state */
/*
 * Creates an empty StringInfo inside the aggregate memory context reported
 * by AggCheckCallContext, restoring the previous context before returning.
 * Raises an error when the caller is not running as part of an aggregate.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * missing from this listing.
 */
4282 static StringInfo
4284 {
4285  StringInfo state;
4286  MemoryContext aggcontext;
4287  MemoryContext oldcontext;
4288 
4289  if (!AggCheckCallContext(fcinfo, &aggcontext))
4290  {
4291  /* cannot be called directly because of internal-type argument */
4292  elog(ERROR, "string_agg_transfn called in non-aggregate context");
4293  }
4294 
4295  /*
4296  * Create state in aggregate context. It'll stay there across subsequent
4297  * calls.
4298  */
4299  oldcontext = MemoryContextSwitchTo(aggcontext);
4300  state = makeStringInfo();
4301  MemoryContextSwitchTo(oldcontext);
4302 
4303  return state;
4304 }
4305 
/*
 * Transition step of the string aggregate.
 *
 * Argument 0 is the running state (NULL until the first non-null value has
 * been seen), argument 1 the value and argument 2 the delimiter.  A NULL
 * value leaves the state unchanged.  Otherwise the state is created on first
 * use, or a non-NULL delimiter is appended, and then the value is appended.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * missing from this listing.
 */
4306 Datum
4308 {
4309  StringInfo state;
4310 
4311  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4312 
4313  /* Append the value unless null. */
4314  if (!PG_ARGISNULL(1))
4315  {
4316  /* On the first time through, we ignore the delimiter. */
4317  if (state == NULL)
4318  state = makeStringAggState(fcinfo);
4319  else if (!PG_ARGISNULL(2))
4320  appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
4321 
4322  appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
4323  }
4324 
4325  /*
4326  * The transition type for string_agg() is declared to be "internal",
4327  * which is a pass-by-value type the same size as a pointer.
4328  */
4329  PG_RETURN_POINTER(state);
4330 }
4331 
/*
 * Final step of the string aggregate.
 *
 * Argument 0 is the accumulated state; a NULL state gives a NULL result.
 * The aggregate-context check is an Assert only.
 *
 * NOTE(review): the line carrying the function name and parameter list and
 * the statement returning the non-NULL result are missing from this listing.
 */
4332 Datum
4334 {
4335  StringInfo state;
4336 
4337  /* cannot be called directly because of internal-type argument */
4338  Assert(AggCheckCallContext(fcinfo, NULL));
4339 
4340  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4341 
4342  if (state != NULL)
4344  else
4345  PG_RETURN_NULL();
4346 }
4347 
4348 /*
4349  * Implementation of both concat() and concat_ws().
4350  *
4351  * sepstr is the separator string to place between values.
4352  * argidx identifies the first argument to concatenate (counting from zero).
4353  * Returns NULL if result should be NULL, else text value.
      *
      * fcinfo supplies the arguments and their types.  In the VARIADIC case
      * the single array argument is handed to array_to_text_internal;
      * otherwise each non-NULL argument is converted with the output function
      * of its type, and sepstr is inserted before every value but the first.
      *
      * NOTE(review): the Assert on the VARIADIC argument, the declaration of
      * "value" and the start of the statement that appends the converted
      * value are missing from this listing.
4354  */
4355 static text *
4356 concat_internal(const char *sepstr, int argidx,
4357  FunctionCallInfo fcinfo)
4358 {
4359  text *result;
4360  StringInfoData str;
4361  bool first_arg = true;
4362  int i;
4363 
4364  /*
4365  * concat(VARIADIC some-array) is essentially equivalent to
4366  * array_to_text(), ie concat the array elements with the given separator.
4367  * So we just pass the case off to that code.
4368  */
4369  if (get_fn_expr_variadic(fcinfo->flinfo))
4370  {
4371  ArrayType *arr;
4372 
4373  /* Should have just the one argument */
4374  Assert(argidx == PG_NARGS() - 1);
4375 
4376  /* concat(VARIADIC NULL) is defined as NULL */
4377  if (PG_ARGISNULL(argidx))
4378  return NULL;
4379 
4380  /*
4381  * Non-null argument had better be an array. We assume that any call
4382  * context that could let get_fn_expr_variadic return true will have
4383  * checked that a VARIADIC-labeled parameter actually is an array. So
4384  * it should be okay to just Assert that it's an array rather than
4385  * doing a full-fledged error check.
4386  */
4388 
4389  /* OK, safe to fetch the array value */
4390  arr = PG_GETARG_ARRAYTYPE_P(argidx);
4391 
4392  /*
4393  * And serialize the array. We tell array_to_text to ignore null
4394  * elements, which matches the behavior of the loop below.
4395  */
4396  return array_to_text_internal(fcinfo, arr, sepstr, NULL);
4397  }
4398 
4399  /* Normal case without explicit VARIADIC marker */
4400  initStringInfo(&str);
4401 
4402  for (i = argidx; i < PG_NARGS(); i++)
4403  {
4404  if (!PG_ARGISNULL(i))
4405  {
4407  Oid valtype;
4408  Oid typOutput;
4409  bool typIsVarlena;
4410 
4411  /* add separator if appropriate */
4412  if (first_arg)
4413  first_arg = false;
4414  else
4415  appendStringInfoString(&str, sepstr);
4416 
4417  /* call the appropriate type output function, append the result */
4418  valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
4419  if (!OidIsValid(valtype))
4420  elog(ERROR, "could not determine data type of concat() input");
4421  getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
4423  OidOutputFunctionCall(typOutput, value));
4424  }
4425  }
4426 
4427  result = cstring_to_text_with_len(str.data, str.len);
4428  pfree(str.data);
4429 
4430  return result;
4431 }
4432 
4433 /*
4434  * Concatenate all arguments. NULL arguments are ignored.
      *
      * Calls concat_internal with an empty separator, starting at argument 0,
      * and turns a NULL pointer result into an SQL NULL.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * is missing from this listing.
4435  */
4436 Datum
4438 {
4439  text *result;
4440 
4441  result = concat_internal("", 0, fcinfo);
4442  if (result == NULL)
4443  PG_RETURN_NULL();
4444  PG_RETURN_TEXT_P(result);
4445 }
4446 
4447 /*
4448  * Concatenate all but first argument value with separators. The first
4449  * parameter is used as the separator. NULL arguments are ignored.
      *
      * A NULL separator gives a NULL result.  Otherwise concat_internal is
      * called starting at argument 1, and a NULL pointer result is turned
      * into an SQL NULL.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * and the statement that assigns "sep" are missing from this listing.
4450  */
4451 Datum
4453 {
4454  char *sep;
4455  text *result;
4456 
4457  /* return NULL when separator is NULL */
4458  if (PG_ARGISNULL(0))
4459  PG_RETURN_NULL();
4461 
4462  result = concat_internal(sep, 1, fcinfo);
4463  if (result == NULL)
4464  PG_RETURN_NULL();
4465  PG_RETURN_TEXT_P(result);
4466 }
4467 
4468 /*
4469  * Return first n characters in the string. When n is negative,
4470  * return all but last |n| characters.
      *
      * Argument 0 is the string and argument 1 the count n (int32).  A
      * negative n is first converted into a character count by adding it to
      * the length of the string in characters; the sum can itself be
      * negative when |n| exceeds that length.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * and the return statement are missing from this listing; rlen is
      * presumably the byte length of the prefix to return -- confirm.
4471  */
4472 Datum
4474 {
4475  text *str = PG_GETARG_TEXT_PP(0);
4476  const char *p = VARDATA_ANY(str);
4477  int len = VARSIZE_ANY_EXHDR(str);
4478  int n = PG_GETARG_INT32(1);
4479  int rlen;
4480 
4481  if (n < 0)
4482  n = pg_mbstrlen_with_len(p, len) + n;
4483  rlen = pg_mbcharcliplen(p, len, n);
4484 
4486 }
4487 
4488 /*
4489  * Return last n characters in the string. When n is negative,
4490  * return all but first |n| characters.
      *
      * Argument 0 is the string and argument 1 the count n (int32).  n is
      * turned into the number of leading characters to skip, and off is the
      * byte offset at which the returned suffix starts.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * is missing from this listing.
4491  */
4492 Datum
4494 {
4495  text *str = PG_GETARG_TEXT_PP(0);
4496  const char *p = VARDATA_ANY(str);
4497  int len = VARSIZE_ANY_EXHDR(str);
4498  int n = PG_GETARG_INT32(1);
4499  int off;
4500 
      /*
       * Negating INT_MIN overflows: the result is undefined in C and stays
       * negative on wrapping builds, which would skip nothing instead of
       * everything.  Clamp that one value to INT_MAX, which is at least as
       * large as any possible character count here because len is an int.
       * NOTE(review): this relies on pg_mbcharcliplen returning len for a
       * limit that large -- confirm.
       */
4501  if (n < 0)
4502  n = (n == INT_MIN) ? INT_MAX : -n;
4503  else
4504  n = pg_mbstrlen_with_len(p, len) - n;
4505  off = pg_mbcharcliplen(p, len, n);
4506 
4507  PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
4508 }
4509 
4510 /*
4511  * Return reversed string
      *
      * The result has the same byte length as the input and is filled from
      * back to front: dst starts just past the end of the output data and is
      * moved down before each copy.  The first branch moves whole characters
      * of pg_mblen bytes each, keeping the bytes of one character in their
      * original order; the second branch moves single bytes.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * and the condition that selects between the two branches are missing
      * from this listing.
4512  */
4513 Datum
4515 {
4516  text *str = PG_GETARG_TEXT_PP(0);
4517  const char *p = VARDATA_ANY(str);
4518  int len = VARSIZE_ANY_EXHDR(str);
4519  const char *endp = p + len;
4520  text *result;
4521  char *dst;
4522 
4523  result = palloc(len + VARHDRSZ);
4524  dst = (char *) VARDATA(result) + len;
4525  SET_VARSIZE(result, len + VARHDRSZ);
4526 
4528  {
4529  /* multibyte version */
4530  while (p < endp)
4531  {
4532  int sz;
4533 
4534  sz = pg_mblen(p);
4535  dst -= sz;
4536  memcpy(dst, p, sz);
4537  p += sz;
4538  }
4539  }
4540  else
4541  {
4542  /* single byte version */
4543  while (p < endp)
4544  *(--dst) = *p++;
4545  }
4546 
4547  PG_RETURN_TEXT_P(result);
4548 }
4549 
4550 
4551 /*
4552  * Support macros for text_format()
      *
      * TEXT_FORMAT_FLAG_MINUS is a bit flag recording that the minus flag was
      * present in a format specifier.
      *
      * ADVANCE_PARSE_POINTER pre-increments ptr and raises an
      * ERRCODE_INVALID_PARAMETER_VALUE error if that moves it to or past
      * end_ptr.  ptr is modified in place, so the argument must be an lvalue.
4553  */
4554 #define TEXT_FORMAT_FLAG_MINUS 0x0001 /* is minus flag present? */
4555 
4556 #define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
4557  do { \
4558  if (++(ptr) >= (end_ptr)) \
4559  ereport(ERROR, \
4560  (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
4561  errmsg("unterminated format specifier"))); \
4562  } while (0)
4563 
4564 /*
4565  * Returns a formatted string
      *
      * Argument 0 is the format string; a NULL format gives a NULL result.
      * The values to format are the remaining arguments or, when the call is
      * marked VARIADIC, the elements of the single array argument (a NULL
      * array is treated as having no elements).
      *
      * Only the conversion types s, I and L are accepted, and a doubled
      * percent sign emits one literal percent sign.  Unterminated or
      * unrecognized specifiers and references to arguments that do not exist
      * are reported with ERRCODE_INVALID_PARAMETER_VALUE.
      *
      * The output function of the most recently used value type, and
      * separately of the most recently used width type, is kept between
      * specifiers so that a run of same-typed arguments needs one lookup.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * and the Assert on the VARIADIC argument are missing from this listing.
4566  */
4567 Datum
4569 {
4570  text *fmt;
4571  StringInfoData str;
4572  const char *cp;
4573  const char *start_ptr;
4574  const char *end_ptr;
4575  text *result;
4576  int arg;
4577  bool funcvariadic;
4578  int nargs;
4579  Datum *elements = NULL;
4580  bool *nulls = NULL;
4581  Oid element_type = InvalidOid;
4582  Oid prev_type = InvalidOid;
4583  Oid prev_width_type = InvalidOid;
4584  FmgrInfo typoutputfinfo;
4585  FmgrInfo typoutputinfo_width;
4586 
4587  /* When format string is null, immediately return null */
4588  if (PG_ARGISNULL(0))
4589  PG_RETURN_NULL();
4590 
4591  /* If argument is marked VARIADIC, expand array into elements */
4592  if (get_fn_expr_variadic(fcinfo->flinfo))
4593  {
4594  ArrayType *arr;
4595  int16 elmlen;
4596  bool elmbyval;
4597  char elmalign;
4598  int nitems;
4599 
4600  /* Should have just the one argument */
4601  Assert(PG_NARGS() == 2);
4602 
4603  /* If argument is NULL, we treat it as zero-length array */
4604  if (PG_ARGISNULL(1))
4605  nitems = 0;
4606  else
4607  {
4608  /*
4609  * Non-null argument had better be an array. We assume that any
4610  * call context that could let get_fn_expr_variadic return true
4611  * will have checked that a VARIADIC-labeled parameter actually is
4612  * an array. So it should be okay to just Assert that it's an
4613  * array rather than doing a full-fledged error check.
4614  */
4616 
4617  /* OK, safe to fetch the array value */
4618  arr = PG_GETARG_ARRAYTYPE_P(1);
4619 
4620  /* Get info about array element type */
4621  element_type = ARR_ELEMTYPE(arr);
4622  get_typlenbyvalalign(element_type,
4623  &elmlen, &elmbyval, &elmalign);
4624 
4625  /* Extract all array elements */
4626  deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
4627  &elements, &nulls, &nitems);
4628  }
4629 
      /*
       * Count the format string as argument 0, so that argument position k
       * corresponds to elements[k - 1] in the lookups further down.
       */
4630  nargs = nitems + 1;
4631  funcvariadic = true;
4632  }
4633  else
4634  {
4635  /* Non-variadic case, we'll process the arguments individually */
4636  nargs = PG_NARGS();
4637  funcvariadic = false;
4638  }
4639 
4640  /* Setup for main loop. */
4641  fmt = PG_GETARG_TEXT_PP(0);
4642  start_ptr = VARDATA_ANY(fmt);
4643  end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
4644  initStringInfo(&str);
4645  arg = 1; /* next argument position to print */
4646 
4647  /* Scan format string, looking for conversion specifiers. */
4648  for (cp = start_ptr; cp < end_ptr; cp++)
4649  {
4650  int argpos;
4651  int widthpos;
4652  int flags;
4653  int width;
4654  Datum value;
4655  bool isNull;
4656  Oid typid;
4657 
4658  /*
4659  * If it's not the start of a conversion specifier, just copy it to
4660  * the output buffer.
4661  */
4662  if (*cp != '%')
4663  {
4664  appendStringInfoCharMacro(&str, *cp);
4665  continue;
4666  }
4667 
4668  ADVANCE_PARSE_POINTER(cp, end_ptr);
4669 
4670  /* Easy case: %% outputs a single % */
4671  if (*cp == '%')
4672  {
4673  appendStringInfoCharMacro(&str, *cp);
4674  continue;
4675  }
4676 
4677  /* Parse the optional portions of the format specifier */
4678  cp = text_format_parse_format(cp, end_ptr,
4679  &argpos, &widthpos,
4680  &flags, &width);
4681 
4682  /*
4683  * Next we should see the main conversion specifier. Whether or not
4684  * an argument position was present, it's known that at least one
4685  * character remains in the string at this point. Experience suggests
4686  * that it's worth checking that that character is one of the expected
4687  * ones before we try to fetch arguments, so as to produce the least
4688  * confusing response to a mis-formatted specifier.
4689  */
4690  if (strchr("sIL", *cp) == NULL)
4691  ereport(ERROR,
4692  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4693  errmsg("unrecognized conversion type specifier \"%c\"",
4694  *cp)));
4695 
4696  /* If indirect width was specified, get its value */
4697  if (widthpos >= 0)
4698  {
4699  /* Collect the specified or next argument position */
4700  if (widthpos > 0)
4701  arg = widthpos;
4702  if (arg >= nargs)
4703  ereport(ERROR,
4704  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4705  errmsg("too few arguments for format")));
4706 
4707  /* Get the value and type of the selected argument */
4708  if (!funcvariadic)
4709  {
4710  value = PG_GETARG_DATUM(arg);
4711  isNull = PG_ARGISNULL(arg);
4712  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
4713  }
4714  else
4715  {
4716  value = elements[arg - 1];
4717  isNull = nulls[arg - 1];
4718  typid = element_type;
4719  }
4720  if (!OidIsValid(typid))
4721  elog(ERROR, "could not determine data type of format() input");
4722 
4723  arg++;
4724 
4725  /* We can treat NULL width the same as zero */
4726  if (isNull)
4727  width = 0;
4728  else if (typid == INT4OID)
4729  width = DatumGetInt32(value);
4730  else if (typid == INT2OID)
4731  width = DatumGetInt16(value);
4732  else
4733  {
4734  /* For less-usual datatypes, convert to text then to int */
      /*
       * Note: this local "str" shadows the function-level output buffer
       * of the same name for the rest of this block.
       */
4735  char *str;
4736 
4737  if (typid != prev_width_type)
4738  {
4739  Oid typoutputfunc;
4740  bool typIsVarlena;
4741 
4742  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
4743  fmgr_info(typoutputfunc, &typoutputinfo_width);
4744  prev_width_type = typid;
4745  }
4746 
4747  str = OutputFunctionCall(&typoutputinfo_width, value);
4748 
4749  /* pg_atoi will complain about bad data or overflow */
4750  width = pg_atoi(str, sizeof(int), '\0');
4751 
4752  pfree(str);
4753  }
4754  }
4755 
4756  /* Collect the specified or next argument position */
4757  if (argpos > 0)
4758  arg = argpos;
4759  if (arg >= nargs)
4760  ereport(ERROR,
4761  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4762  errmsg("too few arguments for format")));
4763 
4764  /* Get the value and type of the selected argument */
4765  if (!funcvariadic)
4766  {
4767  value = PG_GETARG_DATUM(arg);
4768  isNull = PG_ARGISNULL(arg);
4769  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
4770  }
4771  else
4772  {
4773  value = elements[arg - 1];
4774  isNull = nulls[arg - 1];
4775  typid = element_type;
4776  }
4777  if (!OidIsValid(typid))
4778  elog(ERROR, "could not determine data type of format() input");
4779 
4780  arg++;
4781 
4782  /*
4783  * Get the appropriate typOutput function, reusing previous one if
4784  * same type as previous argument. That's particularly useful in the
4785  * variadic-array case, but often saves work even for ordinary calls.
4786  */
4787  if (typid != prev_type)
4788  {
4789  Oid typoutputfunc;
4790  bool typIsVarlena;
4791 
4792  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
4793  fmgr_info(typoutputfunc, &typoutputfinfo);
4794  prev_type = typid;
4795  }
4796 
4797  /*
4798  * And now we can format the value.
4799  */
4800  switch (*cp)
4801  {
4802  case 's':
4803  case 'I':
4804  case 'L':
4805  text_format_string_conversion(&str, *cp, &typoutputfinfo,
4806  value, isNull,
4807  flags, width);
4808  break;
4809  default:
4810  /* should not get here, because of previous check */
4811  ereport(ERROR,
4812  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4813  errmsg("unrecognized conversion type specifier \"%c\"",
4814  *cp)));
4815  break;
4816  }
4817  }
4818 
4819  /* Don't need deconstruct_array results anymore. */
4820  if (elements != NULL)
4821  pfree(elements);
4822  if (nulls != NULL)
4823  pfree(nulls);
4824 
4825  /* Generate results. */
4826  result = cstring_to_text_with_len(str.data, str.len);
4827  pfree(str.data);
4828 
4829  PG_RETURN_TEXT_P(result);
4830 }
4831 
4832 /*
4833  * Parse contiguous digits as a decimal number.
4834  *
4835  * Returns true if some digits could be parsed.
4836  * The value is returned into *value, and *ptr is advanced to the next
4837  * character to be parsed.
4838  *
4839  * Note parsing invariant: at least one character is known available before
4840  * string end (end_ptr) at entry, and this is still true at exit.
4841  */
4842 static bool
4843 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
4844 {
4845  bool found = false;
4846  const char *cp = *ptr;
4847  int val = 0;
4848 
4849  while (*cp >= '0' && *cp <= '9')
4850  {
4851  int newval = val * 10 + (*cp - '0');
4852 
4853  if (newval / 10 != val) /* overflow? */
4854  ereport(ERROR,
4855  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
4856  errmsg("number is out of range")));
4857  val = newval;
4858  ADVANCE_PARSE_POINTER(cp, end_ptr);
4859  found = true;
4860  }
4861 
4862  *ptr = cp;
4863  *value = val;
4864 
4865  return found;
4866 }
4867 
4868 /*
4869  * Parse a format specifier (generally following the SUS printf spec).
4870  *
4871  * We have already advanced over the initial '%', and we are looking for
4872  * [argpos][flags][width]type (but the type character is not consumed here).
4873  *
4874  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
4875  * Output parameters:
4876  * argpos: argument position for value to be printed. -1 means unspecified.
4877  * widthpos: argument position for width. Zero means the argument position
4878  * was unspecified (ie, take the next arg) and -1 means no width
4879  * argument (width was omitted or specified as a constant).
4880  * flags: bitmask of flags.
4881  * width: directly-specified width value. Zero means the width was omitted
4882  * (note it's not necessary to distinguish this case from an explicit
4883  * zero width value).
4884  *
4885  * The function result is the next character position to be parsed, ie, the
4886  * location where the type character is/should be.
4887  *
4888  * Note parsing invariant: at least one character is known available before
4889  * string end (end_ptr) at entry, and this is still true at exit.
4890  */
4891 static const char *
4892 text_format_parse_format(const char *start_ptr, const char *end_ptr,
4893  int *argpos, int *widthpos,
4894  int *flags, int *width)
4895 {
4896  const char *cp = start_ptr;
4897  int n;
4898 
4899  /* set defaults for output parameters */
4900  *argpos = -1;
4901  *widthpos = -1;
4902  *flags = 0;
4903  *width = 0;
4904 
4905  /* try to identify first number */
4906  if (text_format_parse_digits(&cp, end_ptr, &n))
4907  {
4908  if (*cp != '$')
4909  {
4910  /* Must be just a width and a type, so we're done */
4911  *width = n;
4912  return cp;
4913  }
4914  /* The number was argument position */
4915  *argpos = n;
4916  /* Explicit 0 for argument index is immediately refused */
4917  if (n == 0)
4918  ereport(ERROR,
4919  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4920  errmsg("format specifies argument 0, but arguments are numbered from 1")));
4921  ADVANCE_PARSE_POINTER(cp, end_ptr);
4922  }
4923 
4924  /* Handle flags (only minus is supported now) */
4925  while (*cp == '-')
4926  {
4927  *flags |= TEXT_FORMAT_FLAG_MINUS;
4928  ADVANCE_PARSE_POINTER(cp, end_ptr);
4929  }
4930 
4931  if (*cp == '*')
4932  {
4933  /* Handle indirect width */
4934  ADVANCE_PARSE_POINTER(cp, end_ptr);
4935  if (text_format_parse_digits(&cp, end_ptr, &n))
4936  {
4937  /* number in this position must be closed by $ */
4938  if (*cp != '$')
4939  ereport(ERROR,
4940  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4941  errmsg("width argument position must be ended by \"$\"")));
4942  /* The number was width argument position */
4943  *widthpos = n;
4944  /* Explicit 0 for argument index is immediately refused */
4945  if (n == 0)
4946  ereport(ERROR,
4947  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4948  errmsg("format specifies argument 0, but arguments are numbered from 1")));
4949  ADVANCE_PARSE_POINTER(cp, end_ptr);
4950  }
4951  else
4952  *widthpos = 0; /* width's argument position is unspecified */
4953  }
4954  else
4955  {
4956  /* Check for direct width specification */
4957  if (text_format_parse_digits(&cp, end_ptr, &n))
4958  *width = n;
4959  }
4960 
4961  /* cp should now be pointing at type character */
4962  return cp;
4963 }
4964 
4965 /*
4966  * Format a %s, %I, or %L conversion
4967  */
4968 static void
4970  FmgrInfo *typOutputInfo,
4971  Datum value, bool isNull,
4972  int flags, int width)
4973 {
4974  char *str;
4975 
4976  /* Handle NULL arguments before trying to stringify the value. */
4977  if (isNull)
4978  {
4979  if (conversion == 's')
4980  text_format_append_string(buf, "", flags, width);
4981  else if (conversion == 'L')
4982  text_format_append_string(buf, "NULL", flags, width);
4983  else if (conversion == 'I')
4984  ereport(ERROR,
4985  (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
4986  errmsg("null values cannot be formatted as an SQL identifier")));
4987  return;
4988  }
4989 
4990  /* Stringify. */
4991  str = OutputFunctionCall(typOutputInfo, value);
4992 
4993  /* Escape. */
4994  if (conversion == 'I')
4995  {
4996  /* quote_identifier may or may not allocate a new string. */
4997  text_format_append_string(buf, quote_identifier(str), flags, width);
4998  }
4999  else if (conversion == 'L')
5000  {
5001  char *qstr = quote_literal_cstr(str);
5002 
5003  text_format_append_string(buf, qstr, flags, width);
5004  /* quote_literal_cstr() always allocates a new string */
5005  pfree(qstr);
5006  }
5007  else
5008  text_format_append_string(buf, str, flags, width);
5009 
5010  /* Cleanup. */
5011  pfree(str);
5012 }
5013 
5014 /*
5015  * Append str to buf, padding as directed by flags/width
5016  */
5017 static void
5019  int flags, int width)
5020 {
5021  bool align_to_left = false;
5022  int len;
5023 
5024  /* fast path for typical easy case */
5025  if (width == 0)
5026  {
5027  appendStringInfoString(buf, str);
5028  return;
5029  }
5030 
5031  if (width < 0)
5032  {
5033  /* Negative width: implicit '-' flag, then take absolute value */
5034  align_to_left = true;
5035  /* -INT_MIN is undefined */
5036  if (width <= INT_MIN)
5037  ereport(ERROR,
5038  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5039  errmsg("number is out of range")));
5040  width = -width;
5041  }
5042  else if (flags & TEXT_FORMAT_FLAG_MINUS)
5043  align_to_left = true;
5044 
5045  len = pg_mbstrlen(str);
5046  if (align_to_left)
5047  {
5048  /* left justify */
5049  appendStringInfoString(buf, str);
5050  if (len < width)
5051  appendStringInfoSpaces(buf, width - len);
5052  }
5053  else
5054  {
5055  /* right justify */
5056  if (len < width)
5057  appendStringInfoSpaces(buf, width - len);
5058  appendStringInfoString(buf, str);
5059  }
5060 }
5061 
5062 /*
5063  * text_format_nv - nonvariadic wrapper for text_format function.
5064  *
5065  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
5066  * which checks that all built-in functions that share the implementing C
5067  * function take the same number of arguments.
5068  */
5069 Datum
5071 {
5072  return text_format(fcinfo);
5073 }
5074 
/*
 * Helper function for Levenshtein distance functions: report whether the
 * first "len" bytes of s1 and s2 are identical.  Bytes are compared from
 * the last one back to the first.  A zero or negative len compares nothing
 * and yields true.
 *
 * Faster than memcmp(), for this use case.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			remaining;

	for (remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
5090 
5091 /* Expand each Levenshtein distance variant */
5092 #include "levenshtein.c"
5093 #define LEVENSHTEIN_LESS_EQUAL
5094 #include "levenshtein.c"