PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
varlena.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  * Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/hash.h"
21 #include "access/tuptoaster.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "lib/hyperloglog.h"
25 #include "libpq/md5.h"
26 #include "libpq/pqformat.h"
27 #include "miscadmin.h"
28 #include "parser/scansup.h"
29 #include "regex/regex.h"
30 #include "utils/builtins.h"
31 #include "utils/bytea.h"
32 #include "utils/lsyscache.h"
33 #include "utils/memutils.h"
34 #include "utils/pg_locale.h"
35 #include "utils/sortsupport.h"
36 
37 #ifdef DEBUG_ABBREV_KEYS
38 #define DEBUG_elog_output DEBUG1
39 #endif
40 
41 /* GUC variable */
43 
44 typedef struct varlena unknown;
45 
46 typedef struct
47 {
48  bool use_wchar; /* T if multibyte encoding */
49  char *str1; /* use these if not use_wchar */
50  char *str2; /* note: these point to original texts */
51  pg_wchar *wstr1; /* use these if use_wchar */
52  pg_wchar *wstr2; /* note: these are palloc'd */
53  int len1; /* string lengths in logical characters */
54  int len2;
55  /* Skip table for Boyer-Moore-Horspool search algorithm: */
56  int skiptablemask; /* mask for ANDing with skiptable subscripts */
57  int skiptable[256]; /* skip distance for given mismatched char */
59 
60 typedef struct
61 {
62  char *buf1; /* 1st string, or abbreviation original string buf */
63  char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
64  int buflen1;
65  int buflen2;
66  bool collate_c;
67  hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
68  hyperLogLogState full_card; /* Full key cardinality state */
69 #ifdef HAVE_LOCALE_T
71 #endif
73 
74 /*
75  * This should be large enough that most strings will fit, but small enough
76  * that we feel comfortable putting it on the stack
77  */
78 #define TEXTBUFLEN 1024
79 
80 #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X))
81 #define DatumGetUnknownPCopy(X) ((unknown *) PG_DETOAST_DATUM_COPY(X))
82 #define PG_GETARG_UNKNOWN_P(n) DatumGetUnknownP(PG_GETARG_DATUM(n))
83 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
84 #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x)
85 
86 static void btsortsupport_worker(SortSupport ssup, Oid collid);
87 static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup);
88 static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup);
89 static int bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup);
90 static Datum bttext_abbrev_convert(Datum original, SortSupport ssup);
91 static bool bttext_abbrev_abort(int memtupcount, SortSupport ssup);
92 static int32 text_length(Datum str);
93 static text *text_catenate(text *t1, text *t2);
94 static text *text_substring(Datum str,
95  int32 start,
96  int32 length,
97  bool length_not_specified);
98 static text *text_overlay(text *t1, text *t2, int sp, int sl);
99 static int text_position(text *t1, text *t2);
100 static void text_position_setup(text *t1, text *t2, TextPositionState *state);
101 static int text_position_next(int start_pos, TextPositionState *state);
103 static int text_cmp(text *arg1, text *arg2, Oid collid);
104 static bytea *bytea_catenate(bytea *t1, bytea *t2);
105 static bytea *bytea_substring(Datum str,
106  int S,
107  int L,
108  bool length_not_specified);
109 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
110 static void appendStringInfoText(StringInfo str, const text *t);
113  const char *fldsep, const char *null_string);
115 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
116  int *value);
117 static const char *text_format_parse_format(const char *start_ptr,
118  const char *end_ptr,
119  int *argpos, int *widthpos,
120  int *flags, int *width);
121 static void text_format_string_conversion(StringInfo buf, char conversion,
122  FmgrInfo *typOutputInfo,
123  Datum value, bool isNull,
124  int flags, int width);
125 static void text_format_append_string(StringInfo buf, const char *str,
126  int flags, int width);
127 
128 
129 /*****************************************************************************
130  * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
131  *****************************************************************************/
132 
133 /*
134  * cstring_to_text
135  *
136  * Create a text value from a null-terminated C string.
137  *
138  * The new text value is freshly palloc'd with a full-size VARHDR.
139  */
140 text *
141 cstring_to_text(const char *s)
142 {
143  return cstring_to_text_with_len(s, strlen(s));
144 }
145 
146 /*
147  * cstring_to_text_with_len
148  *
149  * Same as cstring_to_text except the caller specifies the string length;
150  * the string need not be null_terminated.
151  */
152 text *
153 cstring_to_text_with_len(const char *s, int len)
154 {
155  text *result = (text *) palloc(len + VARHDRSZ);
156 
157  SET_VARSIZE(result, len + VARHDRSZ);
158  memcpy(VARDATA(result), s, len);
159 
160  return result;
161 }
162 
163 /*
164  * text_to_cstring
165  *
166  * Create a palloc'd, null-terminated C string from a text value.
167  *
168  * We support being passed a compressed or toasted text value.
169  * This is a bit bogus since such values shouldn't really be referred to as
170  * "text *", but it seems useful for robustness. If we didn't handle that
171  * case here, we'd need another routine that did, anyway.
172  */
173 char *
175 {
176  /* must cast away the const, unfortunately */
177  text *tunpacked = pg_detoast_datum_packed((struct varlena *) t);
178  int len = VARSIZE_ANY_EXHDR(tunpacked);
179  char *result;
180 
181  result = (char *) palloc(len + 1);
182  memcpy(result, VARDATA_ANY(tunpacked), len);
183  result[len] = '\0';
184 
185  if (tunpacked != t)
186  pfree(tunpacked);
187 
188  return result;
189 }
190 
191 /*
192  * text_to_cstring_buffer
193  *
194  * Copy a text value into a caller-supplied buffer of size dst_len.
195  *
196  * The text string is truncated if necessary to fit. The result is
197  * guaranteed null-terminated (unless dst_len == 0).
198  *
199  * We support being passed a compressed or toasted text value.
200  * This is a bit bogus since such values shouldn't really be referred to as
201  * "text *", but it seems useful for robustness. If we didn't handle that
202  * case here, we'd need another routine that did, anyway.
203  */
204 void
205 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
206 {
207  /* must cast away the const, unfortunately */
208  text *srcunpacked = pg_detoast_datum_packed((struct varlena *) src);
209  size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
210 
211  if (dst_len > 0)
212  {
213  dst_len--;
214  if (dst_len >= src_len)
215  dst_len = src_len;
216  else /* ensure truncation is encoding-safe */
217  dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
218  memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
219  dst[dst_len] = '\0';
220  }
221 
222  if (srcunpacked != src)
223  pfree(srcunpacked);
224 }
225 
226 
227 /*****************************************************************************
228  * USER I/O ROUTINES *
229  *****************************************************************************/
230 
231 
232 #define VAL(CH) ((CH) - '0')
233 #define DIG(VAL) ((VAL) + '0')
234 
235 /*
236  * byteain - converts from printable representation of byte array
237  *
238  * Non-printable characters must be passed as '\nnn' (octal) and are
239  * converted to internal form. '\' must be passed as '\\'.
240  * ereport(ERROR, ...) if bad form.
241  *
242  * BUGS:
243  * The input is scanned twice.
244  * The error checking of input is minimal.
245  */
246 Datum
248 {
249  char *inputText = PG_GETARG_CSTRING(0);
250  char *tp;
251  char *rp;
252  int bc;
253  bytea *result;
254 
255  /* Recognize hex input */
256  if (inputText[0] == '\\' && inputText[1] == 'x')
257  {
258  size_t len = strlen(inputText);
259 
260  bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
261  result = palloc(bc);
262  bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
263  SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
264 
265  PG_RETURN_BYTEA_P(result);
266  }
267 
268  /* Else, it's the traditional escaped style */
269  for (bc = 0, tp = inputText; *tp != '\0'; bc++)
270  {
271  if (tp[0] != '\\')
272  tp++;
273  else if ((tp[0] == '\\') &&
274  (tp[1] >= '0' && tp[1] <= '3') &&
275  (tp[2] >= '0' && tp[2] <= '7') &&
276  (tp[3] >= '0' && tp[3] <= '7'))
277  tp += 4;
278  else if ((tp[0] == '\\') &&
279  (tp[1] == '\\'))
280  tp += 2;
281  else
282  {
283  /*
284  * one backslash, not followed by another or ### valid octal
285  */
286  ereport(ERROR,
287  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
288  errmsg("invalid input syntax for type bytea")));
289  }
290  }
291 
292  bc += VARHDRSZ;
293 
294  result = (bytea *) palloc(bc);
295  SET_VARSIZE(result, bc);
296 
297  tp = inputText;
298  rp = VARDATA(result);
299  while (*tp != '\0')
300  {
301  if (tp[0] != '\\')
302  *rp++ = *tp++;
303  else if ((tp[0] == '\\') &&
304  (tp[1] >= '0' && tp[1] <= '3') &&
305  (tp[2] >= '0' && tp[2] <= '7') &&
306  (tp[3] >= '0' && tp[3] <= '7'))
307  {
308  bc = VAL(tp[1]);
309  bc <<= 3;
310  bc += VAL(tp[2]);
311  bc <<= 3;
312  *rp++ = bc + VAL(tp[3]);
313 
314  tp += 4;
315  }
316  else if ((tp[0] == '\\') &&
317  (tp[1] == '\\'))
318  {
319  *rp++ = '\\';
320  tp += 2;
321  }
322  else
323  {
324  /*
325  * We should never get here. The first pass should not allow it.
326  */
327  ereport(ERROR,
328  (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
329  errmsg("invalid input syntax for type bytea")));
330  }
331  }
332 
333  PG_RETURN_BYTEA_P(result);
334 }
335 
336 /*
337  * byteaout - converts to printable representation of byte array
338  *
339  * In the traditional escaped format, non-printable characters are
340  * printed as '\nnn' (octal) and '\' as '\\'.
341  */
342 Datum
344 {
345  bytea *vlena = PG_GETARG_BYTEA_PP(0);
346  char *result;
347  char *rp;
348 
350  {
351  /* Print hex format */
352  rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
353  *rp++ = '\\';
354  *rp++ = 'x';
355  rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
356  }
357  else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
358  {
359  /* Print traditional escaped format */
360  char *vp;
361  int len;
362  int i;
363 
364  len = 1; /* empty string has 1 char */
365  vp = VARDATA_ANY(vlena);
366  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
367  {
368  if (*vp == '\\')
369  len += 2;
370  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
371  len += 4;
372  else
373  len++;
374  }
375  rp = result = (char *) palloc(len);
376  vp = VARDATA_ANY(vlena);
377  for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
378  {
379  if (*vp == '\\')
380  {
381  *rp++ = '\\';
382  *rp++ = '\\';
383  }
384  else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
385  {
386  int val; /* holds unprintable chars */
387 
388  val = *vp;
389  rp[0] = '\\';
390  rp[3] = DIG(val & 07);
391  val >>= 3;
392  rp[2] = DIG(val & 07);
393  val >>= 3;
394  rp[1] = DIG(val & 03);
395  rp += 4;
396  }
397  else
398  *rp++ = *vp;
399  }
400  }
401  else
402  {
403  elog(ERROR, "unrecognized bytea_output setting: %d",
404  bytea_output);
405  rp = result = NULL; /* keep compiler quiet */
406  }
407  *rp = '\0';
408  PG_RETURN_CSTRING(result);
409 }
410 
411 /*
412  * bytearecv - converts external binary format to bytea
413  */
414 Datum
416 {
418  bytea *result;
419  int nbytes;
420 
421  nbytes = buf->len - buf->cursor;
422  result = (bytea *) palloc(nbytes + VARHDRSZ);
423  SET_VARSIZE(result, nbytes + VARHDRSZ);
424  pq_copymsgbytes(buf, VARDATA(result), nbytes);
425  PG_RETURN_BYTEA_P(result);
426 }
427 
428 /*
429  * byteasend - converts bytea to binary format
430  *
431  * This is a special case: just copy the input...
432  */
433 Datum
435 {
436  bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
437 
438  PG_RETURN_BYTEA_P(vlena);
439 }
440 
441 Datum
443 {
445 
446  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
447 
448  /* Append the value unless null. */
449  if (!PG_ARGISNULL(1))
450  {
452 
453  /* On the first time through, we ignore the delimiter. */
454  if (state == NULL)
455  state = makeStringAggState(fcinfo);
456  else if (!PG_ARGISNULL(2))
457  {
458  bytea *delim = PG_GETARG_BYTEA_PP(2);
459 
461  }
462 
464  }
465 
466  /*
467  * The transition type for string_agg() is declared to be "internal",
468  * which is a pass-by-value type the same size as a pointer.
469  */
470  PG_RETURN_POINTER(state);
471 }
472 
473 Datum
475 {
477 
478  /* cannot be called directly because of internal-type argument */
479  Assert(AggCheckCallContext(fcinfo, NULL));
480 
481  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
482 
483  if (state != NULL)
484  {
485  bytea *result;
486 
487  result = (bytea *) palloc(state->len + VARHDRSZ);
488  SET_VARSIZE(result, state->len + VARHDRSZ);
489  memcpy(VARDATA(result), state->data, state->len);
490  PG_RETURN_BYTEA_P(result);
491  }
492  else
493  PG_RETURN_NULL();
494 }
495 
496 /*
497  * textin - converts "..." to internal representation
498  */
499 Datum
501 {
502  char *inputText = PG_GETARG_CSTRING(0);
503 
504  PG_RETURN_TEXT_P(cstring_to_text(inputText));
505 }
506 
507 /*
508  * textout - converts internal representation to "..."
509  */
510 Datum
512 {
513  Datum txt = PG_GETARG_DATUM(0);
514 
516 }
517 
518 /*
519  * textrecv - converts external binary format to text
520  */
521 Datum
523 {
525  text *result;
526  char *str;
527  int nbytes;
528 
529  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
530 
531  result = cstring_to_text_with_len(str, nbytes);
532  pfree(str);
533  PG_RETURN_TEXT_P(result);
534 }
535 
536 /*
537  * textsend - converts text to binary format
538  */
539 Datum
541 {
542  text *t = PG_GETARG_TEXT_PP(0);
544 
545  pq_begintypsend(&buf);
548 }
549 
550 
551 /*
552  * unknownin - converts "..." to internal representation
553  */
554 Datum
556 {
557  char *str = PG_GETARG_CSTRING(0);
558 
559  /* representation is same as cstring */
561 }
562 
563 /*
564  * unknownout - converts internal representation to "..."
565  */
566 Datum
568 {
569  /* representation is same as cstring */
570  char *str = PG_GETARG_CSTRING(0);
571 
573 }
574 
575 /*
576  * unknownrecv - converts external binary format to unknown
577  */
578 Datum
580 {
582  char *str;
583  int nbytes;
584 
585  str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
586  /* representation is same as cstring */
587  PG_RETURN_CSTRING(str);
588 }
589 
590 /*
591  * unknownsend - converts unknown to binary format
592  */
593 Datum
595 {
596  /* representation is same as cstring */
597  char *str = PG_GETARG_CSTRING(0);
599 
600  pq_begintypsend(&buf);
601  pq_sendtext(&buf, str, strlen(str));
603 }
604 
605 
606 /* ========== PUBLIC ROUTINES ========== */
607 
608 /*
609  * textlen -
610  * returns the logical length of a text*
611  * (which is less than the VARSIZE of the text*)
612  */
613 Datum
615 {
616  Datum str = PG_GETARG_DATUM(0);
617 
618  /* try to avoid decompressing argument */
620 }
621 
622 /*
623  * text_length -
624  * Does the real work for textlen()
625  *
626  * This is broken out so it can be called directly by other string processing
627  * functions. Note that the argument is passed as a Datum, to indicate that
628  * it may still be in compressed form. We can avoid decompressing it at all
629  * in some cases.
630  */
631 static int32
633 {
634  /* fastpath when max encoding length is one */
637  else
638  {
639  text *t = DatumGetTextPP(str);
640 
642  VARSIZE_ANY_EXHDR(t)));
643  }
644 }
645 
646 /*
647  * textoctetlen -
648  * returns the physical length of a text*
649  * (which is less than the VARSIZE of the text*)
650  */
651 Datum
653 {
654  Datum str = PG_GETARG_DATUM(0);
655 
656  /* We need not detoast the input at all */
658 }
659 
660 /*
661  * textcat -
662  * takes two text* and returns a text* that is the concatenation of
663  * the two.
664  *
665  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
666  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
667  * Allocate space for output in all cases.
668  * XXX - thomas 1997-07-10
669  */
670 Datum
672 {
673  text *t1 = PG_GETARG_TEXT_PP(0);
674  text *t2 = PG_GETARG_TEXT_PP(1);
675 
677 }
678 
679 /*
680  * text_catenate
681  * Guts of textcat(), broken out so it can be used by other functions
682  *
683  * Arguments can be in short-header form, but not compressed or out-of-line
684  */
685 static text *
687 {
688  text *result;
689  int len1,
690  len2,
691  len;
692  char *ptr;
693 
694  len1 = VARSIZE_ANY_EXHDR(t1);
695  len2 = VARSIZE_ANY_EXHDR(t2);
696 
697  /* paranoia ... probably should throw error instead? */
698  if (len1 < 0)
699  len1 = 0;
700  if (len2 < 0)
701  len2 = 0;
702 
703  len = len1 + len2 + VARHDRSZ;
704  result = (text *) palloc(len);
705 
706  /* Set size of result string... */
707  SET_VARSIZE(result, len);
708 
709  /* Fill data field of result string... */
710  ptr = VARDATA(result);
711  if (len1 > 0)
712  memcpy(ptr, VARDATA_ANY(t1), len1);
713  if (len2 > 0)
714  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
715 
716  return result;
717 }
718 
719 /*
720  * charlen_to_bytelen()
721  * Compute the number of bytes occupied by n characters starting at *p
722  *
723  * It is caller's responsibility that there actually are n characters;
724  * the string need not be null-terminated.
725  */
726 static int
727 charlen_to_bytelen(const char *p, int n)
728 {
730  {
731  /* Optimization for single-byte encodings */
732  return n;
733  }
734  else
735  {
736  const char *s;
737 
738  for (s = p; n > 0; n--)
739  s += pg_mblen(s);
740 
741  return s - p;
742  }
743 }
744 
745 /*
746  * text_substr()
747  * Return a substring starting at the specified position.
748  * - thomas 1997-12-31
749  *
750  * Input:
751  * - string
752  * - starting position (is one-based)
753  * - string length
754  *
755  * If the starting position is zero or less, then return from the start of the string
756  * adjusting the length to be consistent with the "negative start" per SQL.
757  * If the length is less than zero, return the remaining string.
758  *
759  * Added multibyte support.
760  * - Tatsuo Ishii 1998-4-21
761  * Changed behavior if starting position is less than one to conform to SQL behavior.
762  * Formerly returned the entire string; now returns a portion.
763  * - Thomas Lockhart 1998-12-10
764  * Now uses faster TOAST-slicing interface
765  * - John Gray 2002-02-22
766  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
767  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
768  * error; if E < 1, return '', not entire string). Fixed MB related bug when
769  * S > LC and < LC + 4 sometimes garbage characters are returned.
770  * - Joe Conway 2002-08-10
771  */
772 Datum
774 {
776  PG_GETARG_INT32(1),
777  PG_GETARG_INT32(2),
778  false));
779 }
780 
781 /*
782  * text_substr_no_len -
783  * Wrapper to avoid opr_sanity failure due to
784  * one function accepting a different number of args.
785  */
786 Datum
788 {
790  PG_GETARG_INT32(1),
791  -1, true));
792 }
793 
794 /*
795  * text_substring -
796  * Does the real work for text_substr() and text_substr_no_len()
797  *
798  * This is broken out so it can be called directly by other string processing
799  * functions. Note that the argument is passed as a Datum, to indicate that
800  * it may still be in compressed/toasted form. We can avoid detoasting all
801  * of it in some cases.
802  *
803  * The result is always a freshly palloc'd datum.
804  */
805 static text *
806 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
807 {
809  int32 S = start; /* start position */
810  int32 S1; /* adjusted start position */
811  int32 L1; /* adjusted substring length */
812 
813  /* life is easy if the encoding max length is 1 */
814  if (eml == 1)
815  {
816  S1 = Max(S, 1);
817 
818  if (length_not_specified) /* special case - get length to end of
819  * string */
820  L1 = -1;
821  else
822  {
823  /* end position */
824  int E = S + length;
825 
826  /*
827  * A negative value for L is the only way for the end position to
828  * be before the start. SQL99 says to throw an error.
829  */
830  if (E < S)
831  ereport(ERROR,
832  (errcode(ERRCODE_SUBSTRING_ERROR),
833  errmsg("negative substring length not allowed")));
834 
835  /*
836  * A zero or negative value for the end position can happen if the
837  * start was negative or one. SQL99 says to return a zero-length
838  * string.
839  */
840  if (E < 1)
841  return cstring_to_text("");
842 
843  L1 = E - S1;
844  }
845 
846  /*
847  * If the start position is past the end of the string, SQL99 says to
848  * return a zero-length string -- PG_GETARG_TEXT_P_SLICE() will do
849  * that for us. Convert to zero-based starting position
850  */
851  return DatumGetTextPSlice(str, S1 - 1, L1);
852  }
853  else if (eml > 1)
854  {
855  /*
856  * When encoding max length is > 1, we can't get LC without
857  * detoasting, so we'll grab a conservatively large slice now and go
858  * back later to do the right thing
859  */
860  int32 slice_start;
861  int32 slice_size;
862  int32 slice_strlen;
863  text *slice;
864  int32 E1;
865  int32 i;
866  char *p;
867  char *s;
868  text *ret;
869 
870  /*
871  * if S is past the end of the string, the tuple toaster will return a
872  * zero-length string to us
873  */
874  S1 = Max(S, 1);
875 
876  /*
877  * We need to start at position zero because there is no way to know
878  * in advance which byte offset corresponds to the supplied start
879  * position.
880  */
881  slice_start = 0;
882 
883  if (length_not_specified) /* special case - get length to end of
884  * string */
885  slice_size = L1 = -1;
886  else
887  {
888  int E = S + length;
889 
890  /*
891  * A negative value for L is the only way for the end position to
892  * be before the start. SQL99 says to throw an error.
893  */
894  if (E < S)
895  ereport(ERROR,
896  (errcode(ERRCODE_SUBSTRING_ERROR),
897  errmsg("negative substring length not allowed")));
898 
899  /*
900  * A zero or negative value for the end position can happen if the
901  * start was negative or one. SQL99 says to return a zero-length
902  * string.
903  */
904  if (E < 1)
905  return cstring_to_text("");
906 
907  /*
908  * if E is past the end of the string, the tuple toaster will
909  * truncate the length for us
910  */
911  L1 = E - S1;
912 
913  /*
914  * Total slice size in bytes can't be any longer than the start
915  * position plus substring length times the encoding max length.
916  */
917  slice_size = (S1 + L1) * eml;
918  }
919 
920  /*
921  * If we're working with an untoasted source, no need to do an extra
922  * copying step.
923  */
926  slice = DatumGetTextPSlice(str, slice_start, slice_size);
927  else
928  slice = (text *) DatumGetPointer(str);
929 
930  /* see if we got back an empty string */
931  if (VARSIZE_ANY_EXHDR(slice) == 0)
932  {
933  if (slice != (text *) DatumGetPointer(str))
934  pfree(slice);
935  return cstring_to_text("");
936  }
937 
938  /* Now we can get the actual length of the slice in MB characters */
939  slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
940  VARSIZE_ANY_EXHDR(slice));
941 
942  /*
943  * Check that the start position wasn't > slice_strlen. If so, SQL99
944  * says to return a zero-length string.
945  */
946  if (S1 > slice_strlen)
947  {
948  if (slice != (text *) DatumGetPointer(str))
949  pfree(slice);
950  return cstring_to_text("");
951  }
952 
953  /*
954  * Adjust L1 and E1 now that we know the slice string length. Again
955  * remember that S1 is one based, and slice_start is zero based.
956  */
957  if (L1 > -1)
958  E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
959  else
960  E1 = slice_start + 1 + slice_strlen;
961 
962  /*
963  * Find the start position in the slice; remember S1 is not zero based
964  */
965  p = VARDATA_ANY(slice);
966  for (i = 0; i < S1 - 1; i++)
967  p += pg_mblen(p);
968 
969  /* hang onto a pointer to our start position */
970  s = p;
971 
972  /*
973  * Count the actual bytes used by the substring of the requested
974  * length.
975  */
976  for (i = S1; i < E1; i++)
977  p += pg_mblen(p);
978 
979  ret = (text *) palloc(VARHDRSZ + (p - s));
980  SET_VARSIZE(ret, VARHDRSZ + (p - s));
981  memcpy(VARDATA(ret), s, (p - s));
982 
983  if (slice != (text *) DatumGetPointer(str))
984  pfree(slice);
985 
986  return ret;
987  }
988  else
989  elog(ERROR, "invalid backend encoding: encoding max length < 1");
990 
991  /* not reached: suppress compiler warning */
992  return NULL;
993 }
994 
995 /*
996  * textoverlay
997  * Replace specified substring of first string with second
998  *
999  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1000  * This code is a direct implementation of what the standard says.
1001  */
1002 Datum
1004 {
1005  text *t1 = PG_GETARG_TEXT_PP(0);
1006  text *t2 = PG_GETARG_TEXT_PP(1);
1007  int sp = PG_GETARG_INT32(2); /* substring start position */
1008  int sl = PG_GETARG_INT32(3); /* substring length */
1009 
1010  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1011 }
1012 
1013 Datum
1015 {
1016  text *t1 = PG_GETARG_TEXT_PP(0);
1017  text *t2 = PG_GETARG_TEXT_PP(1);
1018  int sp = PG_GETARG_INT32(2); /* substring start position */
1019  int sl;
1020 
1021  sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1022  PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1023 }
1024 
1025 static text *
1026 text_overlay(text *t1, text *t2, int sp, int sl)
1027 {
1028  text *result;
1029  text *s1;
1030  text *s2;
1031  int sp_pl_sl;
1032 
1033  /*
1034  * Check for possible integer-overflow cases. For negative sp, throw a
1035  * "substring length" error because that's what should be expected
1036  * according to the spec's definition of OVERLAY().
1037  */
1038  if (sp <= 0)
1039  ereport(ERROR,
1040  (errcode(ERRCODE_SUBSTRING_ERROR),
1041  errmsg("negative substring length not allowed")));
1042  sp_pl_sl = sp + sl;
1043  if (sp_pl_sl <= sl)
1044  ereport(ERROR,
1045  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1046  errmsg("integer out of range")));
1047 
1048  s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1049  s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1050  result = text_catenate(s1, t2);
1051  result = text_catenate(result, s2);
1052 
1053  return result;
1054 }
1055 
1056 /*
1057  * textpos -
1058  * Return the position of the specified substring.
1059  * Implements the SQL POSITION() function.
1060  * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1061  * - thomas 1997-07-27
1062  */
1063 Datum
1065 {
1066  text *str = PG_GETARG_TEXT_PP(0);
1067  text *search_str = PG_GETARG_TEXT_PP(1);
1068 
1069  PG_RETURN_INT32((int32) text_position(str, search_str));
1070 }
1071 
1072 /*
1073  * text_position -
1074  * Does the real work for textpos()
1075  *
1076  * Inputs:
1077  * t1 - string to be searched
1078  * t2 - pattern to match within t1
1079  * Result:
1080  * Character index of the first matched char, starting from 1,
1081  * or 0 if no match.
1082  *
1083  * This is broken out so it can be called directly by other string processing
1084  * functions.
1085  */
1086 static int
1088 {
1090  int result;
1091 
1092  text_position_setup(t1, t2, &state);
1093  result = text_position_next(1, &state);
1094  text_position_cleanup(&state);
1095  return result;
1096 }
1097 
1098 
1099 /*
1100  * text_position_setup, text_position_next, text_position_cleanup -
1101  * Component steps of text_position()
1102  *
1103  * These are broken out so that a string can be efficiently searched for
1104  * multiple occurrences of the same pattern. text_position_next may be
1105  * called multiple times with increasing values of start_pos, which is
1106  * the 1-based character position to start the search from. The "state"
1107  * variable is normally just a local variable in the caller.
1108  */
1109 
1110 static void
1112 {
1113  int len1 = VARSIZE_ANY_EXHDR(t1);
1114  int len2 = VARSIZE_ANY_EXHDR(t2);
1115 
1117  {
1118  /* simple case - single byte encoding */
1119  state->use_wchar = false;
1120  state->str1 = VARDATA_ANY(t1);
1121  state->str2 = VARDATA_ANY(t2);
1122  state->len1 = len1;
1123  state->len2 = len2;
1124  }
1125  else
1126  {
1127  /* not as simple - multibyte encoding */
1128  pg_wchar *p1,
1129  *p2;
1130 
1131  p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar));
1132  len1 = pg_mb2wchar_with_len(VARDATA_ANY(t1), p1, len1);
1133  p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar));
1134  len2 = pg_mb2wchar_with_len(VARDATA_ANY(t2), p2, len2);
1135 
1136  state->use_wchar = true;
1137  state->wstr1 = p1;
1138  state->wstr2 = p2;
1139  state->len1 = len1;
1140  state->len2 = len2;
1141  }
1142 
1143  /*
1144  * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1145  * notes we use the terminology that the "haystack" is the string to be
1146  * searched (t1) and the "needle" is the pattern being sought (t2).
1147  *
1148  * If the needle is empty or bigger than the haystack then there is no
1149  * point in wasting cycles initializing the table. We also choose not to
1150  * use B-M-H for needles of length 1, since the skip table can't possibly
1151  * save anything in that case.
1152  */
1153  if (len1 >= len2 && len2 > 1)
1154  {
1155  int searchlength = len1 - len2;
1156  int skiptablemask;
1157  int last;
1158  int i;
1159 
1160  /*
1161  * First we must determine how much of the skip table to use. The
1162  * declaration of TextPositionState allows up to 256 elements, but for
1163  * short search problems we don't really want to have to initialize so
1164  * many elements --- it would take too long in comparison to the
1165  * actual search time. So we choose a useful skip table size based on
1166  * the haystack length minus the needle length. The closer the needle
1167  * length is to the haystack length the less useful skipping becomes.
1168  *
1169  * Note: since we use bit-masking to select table elements, the skip
1170  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1171  */
1172  if (searchlength < 16)
1173  skiptablemask = 3;
1174  else if (searchlength < 64)
1175  skiptablemask = 7;
1176  else if (searchlength < 128)
1177  skiptablemask = 15;
1178  else if (searchlength < 512)
1179  skiptablemask = 31;
1180  else if (searchlength < 2048)
1181  skiptablemask = 63;
1182  else if (searchlength < 4096)
1183  skiptablemask = 127;
1184  else
1185  skiptablemask = 255;
1186  state->skiptablemask = skiptablemask;
1187 
1188  /*
1189  * Initialize the skip table. We set all elements to the needle
1190  * length, since this is the correct skip distance for any character
1191  * not found in the needle.
1192  */
1193  for (i = 0; i <= skiptablemask; i++)
1194  state->skiptable[i] = len2;
1195 
1196  /*
1197  * Now examine the needle. For each character except the last one,
1198  * set the corresponding table element to the appropriate skip
1199  * distance. Note that when two characters share the same skip table
1200  * entry, the one later in the needle must determine the skip
1201  * distance.
1202  */
1203  last = len2 - 1;
1204 
1205  if (!state->use_wchar)
1206  {
1207  const char *str2 = state->str2;
1208 
1209  for (i = 0; i < last; i++)
1210  state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1211  }
1212  else
1213  {
1214  const pg_wchar *wstr2 = state->wstr2;
1215 
1216  for (i = 0; i < last; i++)
1217  state->skiptable[wstr2[i] & skiptablemask] = last - i;
1218  }
1219  }
1220 }
1221 
1222 static int
1224 {
1225  int haystack_len = state->len1;
1226  int needle_len = state->len2;
1227  int skiptablemask = state->skiptablemask;
1228 
1229  Assert(start_pos > 0); /* else caller error */
1230 
1231  if (needle_len <= 0)
1232  return start_pos; /* result for empty pattern */
1233 
1234  start_pos--; /* adjust for zero based arrays */
1235 
1236  /* Done if the needle can't possibly fit */
1237  if (haystack_len < start_pos + needle_len)
1238  return 0;
1239 
1240  if (!state->use_wchar)
1241  {
1242  /* simple case - single byte encoding */
1243  const char *haystack = state->str1;
1244  const char *needle = state->str2;
1245  const char *haystack_end = &haystack[haystack_len];
1246  const char *hptr;
1247 
1248  if (needle_len == 1)
1249  {
1250  /* No point in using B-M-H for a one-character needle */
1251  char nchar = *needle;
1252 
1253  hptr = &haystack[start_pos];
1254  while (hptr < haystack_end)
1255  {
1256  if (*hptr == nchar)
1257  return hptr - haystack + 1;
1258  hptr++;
1259  }
1260  }
1261  else
1262  {
1263  const char *needle_last = &needle[needle_len - 1];
1264 
1265  /* Start at startpos plus the length of the needle */
1266  hptr = &haystack[start_pos + needle_len - 1];
1267  while (hptr < haystack_end)
1268  {
1269  /* Match the needle scanning *backward* */
1270  const char *nptr;
1271  const char *p;
1272 
1273  nptr = needle_last;
1274  p = hptr;
1275  while (*nptr == *p)
1276  {
1277  /* Matched it all? If so, return 1-based position */
1278  if (nptr == needle)
1279  return p - haystack + 1;
1280  nptr--, p--;
1281  }
1282 
1283  /*
1284  * No match, so use the haystack char at hptr to decide how
1285  * far to advance. If the needle had any occurrence of that
1286  * character (or more precisely, one sharing the same
1287  * skiptable entry) before its last character, then we advance
1288  * far enough to align the last such needle character with
1289  * that haystack position. Otherwise we can advance by the
1290  * whole needle length.
1291  */
1292  hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1293  }
1294  }
1295  }
1296  else
1297  {
1298  /* The multibyte char version. This works exactly the same way. */
1299  const pg_wchar *haystack = state->wstr1;
1300  const pg_wchar *needle = state->wstr2;
1301  const pg_wchar *haystack_end = &haystack[haystack_len];
1302  const pg_wchar *hptr;
1303 
1304  if (needle_len == 1)
1305  {
1306  /* No point in using B-M-H for a one-character needle */
1307  pg_wchar nchar = *needle;
1308 
1309  hptr = &haystack[start_pos];
1310  while (hptr < haystack_end)
1311  {
1312  if (*hptr == nchar)
1313  return hptr - haystack + 1;
1314  hptr++;
1315  }
1316  }
1317  else
1318  {
1319  const pg_wchar *needle_last = &needle[needle_len - 1];
1320 
1321  /* Start at startpos plus the length of the needle */
1322  hptr = &haystack[start_pos + needle_len - 1];
1323  while (hptr < haystack_end)
1324  {
1325  /* Match the needle scanning *backward* */
1326  const pg_wchar *nptr;
1327  const pg_wchar *p;
1328 
1329  nptr = needle_last;
1330  p = hptr;
1331  while (*nptr == *p)
1332  {
1333  /* Matched it all? If so, return 1-based position */
1334  if (nptr == needle)
1335  return p - haystack + 1;
1336  nptr--, p--;
1337  }
1338 
1339  /*
1340  * No match, so use the haystack char at hptr to decide how
1341  * far to advance. If the needle had any occurrence of that
1342  * character (or more precisely, one sharing the same
1343  * skiptable entry) before its last character, then we advance
1344  * far enough to align the last such needle character with
1345  * that haystack position. Otherwise we can advance by the
1346  * whole needle length.
1347  */
1348  hptr += state->skiptable[*hptr & skiptablemask];
1349  }
1350  }
1351  }
1352 
1353  return 0; /* not found */
1354 }
1355 
1356 static void
1358 {
1359  if (state->use_wchar)
1360  {
1361  pfree(state->wstr1);
1362  pfree(state->wstr2);
1363  }
1364 }
1365 
1366 /* varstr_cmp()
1367  * Comparison function for text strings with given lengths.
1368  * Includes locale support, but must copy strings to temporary memory
1369  * to allow null-termination for inputs to strcoll().
1370  * Returns an integer less than, equal to, or greater than zero, indicating
1371  * whether arg1 is less than, equal to, or greater than arg2.
1372  */
1373 int
1374 varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
1375 {
1376  int result;
1377 
1378  /*
1379  * Unfortunately, there is no strncoll(), so in the non-C locale case we
1380  * have to do some memory copying. This turns out to be significantly
1381  * slower, so we optimize the case where LC_COLLATE is C. We also try to
1382  * optimize relatively-short strings by avoiding palloc/pfree overhead.
1383  */
1384  if (lc_collate_is_c(collid))
1385  {
1386  result = memcmp(arg1, arg2, Min(len1, len2));
1387  if ((result == 0) && (len1 != len2))
1388  result = (len1 < len2) ? -1 : 1;
1389  }
1390  else
1391  {
1392  char a1buf[TEXTBUFLEN];
1393  char a2buf[TEXTBUFLEN];
1394  char *a1p,
1395  *a2p;
1396 
1397 #ifdef HAVE_LOCALE_T
1398  pg_locale_t mylocale = 0;
1399 #endif
1400 
1401  if (collid != DEFAULT_COLLATION_OID)
1402  {
1403  if (!OidIsValid(collid))
1404  {
1405  /*
1406  * This typically means that the parser could not resolve a
1407  * conflict of implicit collations, so report it that way.
1408  */
1409  ereport(ERROR,
1410  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1411  errmsg("could not determine which collation to use for string comparison"),
1412  errhint("Use the COLLATE clause to set the collation explicitly.")));
1413  }
1414 #ifdef HAVE_LOCALE_T
1415  mylocale = pg_newlocale_from_collation(collid);
1416 #endif
1417  }
1418 
1419  /*
1420  * memcmp() can't tell us which of two unequal strings sorts first, but
1421  * it's a cheap way to tell if they're equal. Testing shows that
1422  * memcmp() followed by strcoll() is only trivially slower than
1423  * strcoll() by itself, so we don't lose much if this doesn't work out
1424  * very often, and if it does - for example, because there are many
1425  * equal strings in the input - then we win big by avoiding expensive
1426  * collation-aware comparisons.
1427  */
1428  if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1429  return 0;
1430 
1431 #ifdef WIN32
1432  /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1433  if (GetDatabaseEncoding() == PG_UTF8)
1434  {
1435  int a1len;
1436  int a2len;
1437  int r;
1438 
1439  if (len1 >= TEXTBUFLEN / 2)
1440  {
1441  a1len = len1 * 2 + 2;
1442  a1p = palloc(a1len);
1443  }
1444  else
1445  {
1446  a1len = TEXTBUFLEN;
1447  a1p = a1buf;
1448  }
1449  if (len2 >= TEXTBUFLEN / 2)
1450  {
1451  a2len = len2 * 2 + 2;
1452  a2p = palloc(a2len);
1453  }
1454  else
1455  {
1456  a2len = TEXTBUFLEN;
1457  a2p = a2buf;
1458  }
1459 
1460  /* stupid Microsloth API does not work for zero-length input */
1461  if (len1 == 0)
1462  r = 0;
1463  else
1464  {
1465  r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1466  (LPWSTR) a1p, a1len / 2);
1467  if (!r)
1468  ereport(ERROR,
1469  (errmsg("could not convert string to UTF-16: error code %lu",
1470  GetLastError())));
1471  }
1472  ((LPWSTR) a1p)[r] = 0;
1473 
1474  if (len2 == 0)
1475  r = 0;
1476  else
1477  {
1478  r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1479  (LPWSTR) a2p, a2len / 2);
1480  if (!r)
1481  ereport(ERROR,
1482  (errmsg("could not convert string to UTF-16: error code %lu",
1483  GetLastError())));
1484  }
1485  ((LPWSTR) a2p)[r] = 0;
1486 
1487  errno = 0;
1488 #ifdef HAVE_LOCALE_T
1489  if (mylocale)
1490  result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale);
1491  else
1492 #endif
1493  result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1494  if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1495  * headers */
1496  ereport(ERROR,
1497  (errmsg("could not compare Unicode strings: %m")));
1498 
1499  /*
1500  * In some locales wcscoll() can claim that nonidentical strings
1501  * are equal. Believing that would be bad news for a number of
1502  * reasons, so we follow Perl's lead and sort "equal" strings
1503  * according to strcmp (on the UTF-8 representation).
1504  */
1505  if (result == 0)
1506  {
1507  result = memcmp(arg1, arg2, Min(len1, len2));
1508  if ((result == 0) && (len1 != len2))
1509  result = (len1 < len2) ? -1 : 1;
1510  }
1511 
1512  if (a1p != a1buf)
1513  pfree(a1p);
1514  if (a2p != a2buf)
1515  pfree(a2p);
1516 
1517  return result;
1518  }
1519 #endif /* WIN32 */
1520 
1521  if (len1 >= TEXTBUFLEN)
1522  a1p = (char *) palloc(len1 + 1);
1523  else
1524  a1p = a1buf;
1525  if (len2 >= TEXTBUFLEN)
1526  a2p = (char *) palloc(len2 + 1);
1527  else
1528  a2p = a2buf;
1529 
1530  memcpy(a1p, arg1, len1);
1531  a1p[len1] = '\0';
1532  memcpy(a2p, arg2, len2);
1533  a2p[len2] = '\0';
1534 
1535 #ifdef HAVE_LOCALE_T
1536  if (mylocale)
1537  result = strcoll_l(a1p, a2p, mylocale);
1538  else
1539 #endif
1540  result = strcoll(a1p, a2p);
1541 
1542  /*
1543  * In some locales strcoll() can claim that nonidentical strings are
1544  * equal. Believing that would be bad news for a number of reasons,
1545  * so we follow Perl's lead and sort "equal" strings according to
1546  * strcmp().
1547  */
1548  if (result == 0)
1549  result = strcmp(a1p, a2p);
1550 
1551  if (a1p != a1buf)
1552  pfree(a1p);
1553  if (a2p != a2buf)
1554  pfree(a2p);
1555  }
1556 
1557  return result;
1558 }
1559 
1560 /* text_cmp()
1561  * Internal comparison function for text strings.
1562  * Returns -1, 0 or 1
1563  */
1564 static int
1565 text_cmp(text *arg1, text *arg2, Oid collid)
1566 {
1567  char *a1p,
1568  *a2p;
1569  int len1,
1570  len2;
1571 
1572  a1p = VARDATA_ANY(arg1);
1573  a2p = VARDATA_ANY(arg2);
1574 
1575  len1 = VARSIZE_ANY_EXHDR(arg1);
1576  len2 = VARSIZE_ANY_EXHDR(arg2);
1577 
1578  return varstr_cmp(a1p, len1, a2p, len2, collid);
1579 }
1580 
1581 /*
1582  * Comparison functions for text strings.
1583  *
1584  * Note: btree indexes need these routines not to leak memory; therefore,
1585  * be careful to free working copies of toasted datums. Most places don't
1586  * need to be so careful.
1587  */
1588 
1589 Datum
1591 {
1592  Datum arg1 = PG_GETARG_DATUM(0);
1593  Datum arg2 = PG_GETARG_DATUM(1);
1594  bool result;
1595  Size len1,
1596  len2;
1597 
1598  /*
1599  * Since we only care about equality or not-equality, we can avoid all the
1600  * expense of strcoll() here, and just do bitwise comparison. In fact, we
1601  * don't even have to do a bitwise comparison if we can show the lengths
1602  * of the strings are unequal; which might save us from having to detoast
1603  * one or both values.
1604  */
1605  len1 = toast_raw_datum_size(arg1);
1606  len2 = toast_raw_datum_size(arg2);
1607  if (len1 != len2)
1608  result = false;
1609  else
1610  {
1611  text *targ1 = DatumGetTextPP(arg1);
1612  text *targ2 = DatumGetTextPP(arg2);
1613 
1614  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1615  len1 - VARHDRSZ) == 0);
1616 
1617  PG_FREE_IF_COPY(targ1, 0);
1618  PG_FREE_IF_COPY(targ2, 1);
1619  }
1620 
1621  PG_RETURN_BOOL(result);
1622 }
1623 
1624 Datum
1626 {
1627  Datum arg1 = PG_GETARG_DATUM(0);
1628  Datum arg2 = PG_GETARG_DATUM(1);
1629  bool result;
1630  Size len1,
1631  len2;
1632 
1633  /* See comment in texteq() */
1634  len1 = toast_raw_datum_size(arg1);
1635  len2 = toast_raw_datum_size(arg2);
1636  if (len1 != len2)
1637  result = true;
1638  else
1639  {
1640  text *targ1 = DatumGetTextPP(arg1);
1641  text *targ2 = DatumGetTextPP(arg2);
1642 
1643  result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1644  len1 - VARHDRSZ) != 0);
1645 
1646  PG_FREE_IF_COPY(targ1, 0);
1647  PG_FREE_IF_COPY(targ2, 1);
1648  }
1649 
1650  PG_RETURN_BOOL(result);
1651 }
1652 
1653 Datum
1655 {
1656  text *arg1 = PG_GETARG_TEXT_PP(0);
1657  text *arg2 = PG_GETARG_TEXT_PP(1);
1658  bool result;
1659 
1660  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1661 
1662  PG_FREE_IF_COPY(arg1, 0);
1663  PG_FREE_IF_COPY(arg2, 1);
1664 
1665  PG_RETURN_BOOL(result);
1666 }
1667 
1668 Datum
1670 {
1671  text *arg1 = PG_GETARG_TEXT_PP(0);
1672  text *arg2 = PG_GETARG_TEXT_PP(1);
1673  bool result;
1674 
1675  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1676 
1677  PG_FREE_IF_COPY(arg1, 0);
1678  PG_FREE_IF_COPY(arg2, 1);
1679 
1680  PG_RETURN_BOOL(result);
1681 }
1682 
1683 Datum
1685 {
1686  text *arg1 = PG_GETARG_TEXT_PP(0);
1687  text *arg2 = PG_GETARG_TEXT_PP(1);
1688  bool result;
1689 
1690  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1691 
1692  PG_FREE_IF_COPY(arg1, 0);
1693  PG_FREE_IF_COPY(arg2, 1);
1694 
1695  PG_RETURN_BOOL(result);
1696 }
1697 
1698 Datum
1700 {
1701  text *arg1 = PG_GETARG_TEXT_PP(0);
1702  text *arg2 = PG_GETARG_TEXT_PP(1);
1703  bool result;
1704 
1705  result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1706 
1707  PG_FREE_IF_COPY(arg1, 0);
1708  PG_FREE_IF_COPY(arg2, 1);
1709 
1710  PG_RETURN_BOOL(result);
1711 }
1712 
1713 Datum
1715 {
1716  text *arg1 = PG_GETARG_TEXT_PP(0);
1717  text *arg2 = PG_GETARG_TEXT_PP(1);
1718  int32 result;
1719 
1720  result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1721 
1722  PG_FREE_IF_COPY(arg1, 0);
1723  PG_FREE_IF_COPY(arg2, 1);
1724 
1725  PG_RETURN_INT32(result);
1726 }
1727 
1728 Datum
1730 {
1732  Oid collid = ssup->ssup_collation;
1733  MemoryContext oldcontext;
1734 
1735  oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1736 
1737  btsortsupport_worker(ssup, collid);
1738 
1739  MemoryContextSwitchTo(oldcontext);
1740 
1741  PG_RETURN_VOID();
1742 }
1743 
1744 static void
1746 {
1747  bool abbreviate = ssup->abbreviate;
1748  bool collate_c = false;
1749  TextSortSupport *tss;
1750 
1751 #ifdef HAVE_LOCALE_T
1752  pg_locale_t locale = 0;
1753 #endif
1754 
1755  /*
1756  * If possible, set ssup->comparator to a function which can be used to
1757  * directly compare two datums. If we can do this, we'll avoid the
1758  * overhead of a trip through the fmgr layer for every comparison,
1759  * which can be substantial.
1760  *
1761  * Most typically, we'll set the comparator to bttextfastcmp_locale,
1762  * which uses strcoll() to perform comparisons. However, if LC_COLLATE
1763  * = C, we can make things quite a bit faster with bttextfastcmp_c,
1764  * which uses memcmp() rather than strcoll().
1765  *
1766  * There is a further exception on Windows. When the database encoding
1767  * is UTF-8 and we are not using the C collation, complex hacks are
1768  * required. We don't currently have a comparator that handles that case,
1769  * so we fall back on the slow method of having the sort code invoke
1770  * bttextcmp() via the fmgr trampoline.
1771  */
1772  if (lc_collate_is_c(collid))
1773  {
1774  ssup->comparator = bttextfastcmp_c;
1775  collate_c = true;
1776  }
1777 #ifdef WIN32
1778  else if (GetDatabaseEncoding() == PG_UTF8)
1779  return;
1780 #endif
1781  else
1782  {
1784 
1785  /*
1786  * We need a collation-sensitive comparison. To make things faster,
1787  * we'll figure out the collation based on the locale id and cache the
1788  * result.
1789  */
1790  if (collid != DEFAULT_COLLATION_OID)
1791  {
1792  if (!OidIsValid(collid))
1793  {
1794  /*
1795  * This typically means that the parser could not resolve a
1796  * conflict of implicit collations, so report it that way.
1797  */
1798  ereport(ERROR,
1799  (errcode(ERRCODE_INDETERMINATE_COLLATION),
1800  errmsg("could not determine which collation to use for string comparison"),
1801  errhint("Use the COLLATE clause to set the collation explicitly.")));
1802  }
1803 #ifdef HAVE_LOCALE_T
1804  locale = pg_newlocale_from_collation(collid);
1805 #endif
1806  }
1807  }
1808 
1809  /*
1810  * It's possible that there are platforms where the use of abbreviated
1811  * keys should be disabled at compile time. Having only 4 byte datums
1812  * could make worst-case performance drastically more likely, for example.
1813  * Moreover, Darwin's strxfrm() implementations is known to not effectively
1814  * concentrate a significant amount of entropy from the original string in
1815  * earlier transformed blobs. It's possible that other supported platforms
1816  * are similarly encumbered. However, even in those cases, the abbreviated
1817  * keys optimization may win, and if it doesn't, the "abort abbreviation"
1818  * code may rescue us. So, for now, we don't disable this anywhere on the
1819  * basis of performance.
1820  */
1821 
1822  /*
1823  * If we're using abbreviated keys, or if we're using a locale-aware
1824  * comparison, we need to initialize a TextSortSupport object. Both cases
1825  * will make use of the temporary buffers we initialize here for scratch
1826  * space, and the abbreviation case requires additional state.
1827  */
1828  if (abbreviate || !collate_c)
1829  {
1830  tss = palloc(sizeof(TextSortSupport));
1831  tss->buf1 = palloc(TEXTBUFLEN);
1832  tss->buflen1 = TEXTBUFLEN;
1833  tss->buf2 = palloc(TEXTBUFLEN);
1834  tss->buflen2 = TEXTBUFLEN;
1835 #ifdef HAVE_LOCALE_T
1836  tss->locale = locale;
1837 #endif
1838  tss->collate_c = collate_c;
1839  ssup->ssup_extra = tss;
1840 
1841  /*
1842  * If possible, plan to use the abbreviated keys optimization. The
1843  * core code may switch back to authoritative comparator should
1844  * abbreviation be aborted.
1845  */
1846  if (abbreviate)
1847  {
1848  initHyperLogLog(&tss->abbr_card, 10);
1849  initHyperLogLog(&tss->full_card, 10);
1850  ssup->abbrev_full_comparator = ssup->comparator;
1851  ssup->comparator = bttextcmp_abbrev;
1854  }
1855  }
1856 }
1857 
1858 /*
1859  * sortsupport comparison func (for C locale case)
1860  */
1861 static int
1863 {
1864  text *arg1 = DatumGetTextPP(x);
1865  text *arg2 = DatumGetTextPP(y);
1866  char *a1p,
1867  *a2p;
1868  int len1,
1869  len2,
1870  result;
1871 
1872  a1p = VARDATA_ANY(arg1);
1873  a2p = VARDATA_ANY(arg2);
1874 
1875  len1 = VARSIZE_ANY_EXHDR(arg1);
1876  len2 = VARSIZE_ANY_EXHDR(arg2);
1877 
1878  result = memcmp(a1p, a2p, Min(len1, len2));
1879  if ((result == 0) && (len1 != len2))
1880  result = (len1 < len2) ? -1 : 1;
1881 
1882  /* We can't afford to leak memory here. */
1883  if (PointerGetDatum(arg1) != x)
1884  pfree(arg1);
1885  if (PointerGetDatum(arg2) != y)
1886  pfree(arg2);
1887 
1888  return result;
1889 }
1890 
1891 /*
1892  * sortsupport comparison func (for locale case)
1893  */
1894 static int
1896 {
1897  text *arg1 = DatumGetTextPP(x);
1898  text *arg2 = DatumGetTextPP(y);
1899  TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra;
1900 
1901  /* working state */
1902  char *a1p,
1903  *a2p;
1904  int len1,
1905  len2,
1906  result;
1907 
1908  a1p = VARDATA_ANY(arg1);
1909  a2p = VARDATA_ANY(arg2);
1910 
1911  len1 = VARSIZE_ANY_EXHDR(arg1);
1912  len2 = VARSIZE_ANY_EXHDR(arg2);
1913 
1914  /* Fast pre-check for equality, as discussed in varstr_cmp() */
1915  if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
1916  {
1917  result = 0;
1918  goto done;
1919  }
1920 
1921  if (len1 >= tss->buflen1)
1922  {
1923  pfree(tss->buf1);
1924  tss->buflen1 = Max(len1 + 1, Min(tss->buflen1 * 2, MaxAllocSize));
1925  tss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen1);
1926  }
1927  if (len2 >= tss->buflen2)
1928  {
1929  pfree(tss->buf2);
1930  tss->buflen2 = Max(len2 + 1, Min(tss->buflen2 * 2, MaxAllocSize));
1931  tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2);
1932  }
1933 
1934  memcpy(tss->buf1, a1p, len1);
1935  tss->buf1[len1] = '\0';
1936  memcpy(tss->buf2, a2p, len2);
1937  tss->buf2[len2] = '\0';
1938 
1939 #ifdef HAVE_LOCALE_T
1940  if (tss->locale)
1941  result = strcoll_l(tss->buf1, tss->buf2, tss->locale);
1942  else
1943 #endif
1944  result = strcoll(tss->buf1, tss->buf2);
1945 
1946  /*
1947  * In some locales strcoll() can claim that nonidentical strings are equal.
1948  * Believing that would be bad news for a number of reasons, so we follow
1949  * Perl's lead and sort "equal" strings according to strcmp().
1950  */
1951  if (result == 0)
1952  result = strcmp(tss->buf1, tss->buf2);
1953 
1954 done:
1955  /* We can't afford to leak memory here. */
1956  if (PointerGetDatum(arg1) != x)
1957  pfree(arg1);
1958  if (PointerGetDatum(arg2) != y)
1959  pfree(arg2);
1960 
1961  return result;
1962 }
1963 
1964 /*
1965  * Abbreviated key comparison func
1966  */
1967 static int
1969 {
1970  char *a = (char *) &x;
1971  char *b = (char *) &y;
1972  int result;
1973 
1974  result = memcmp(a, b, sizeof(Datum));
1975 
1976  /*
1977  * When result = 0, the core system will call bttextfastcmp_c() or
1978  * bttextfastcmp_locale(). Even a strcmp() on two non-truncated strxfrm()
1979  * blobs cannot indicate *equality* authoritatively, for the same reason
1980  * that there is a strcoll() tie-breaker call to strcmp() in varstr_cmp().
1981  */
1982  return result;
1983 }
1984 
1985 /*
1986  * Conversion routine for sortsupport. Converts original text to abbreviated
1987  * key representation. Our encoding strategy is simple -- pack the first 8
1988  * bytes of a strxfrm() blob into a Datum.
1989  */
1990 static Datum
1992 {
1993  TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra;
1994  text *authoritative = DatumGetTextPP(original);
1995  char *authoritative_data = VARDATA_ANY(authoritative);
1996 
1997  /* working state */
1998  Datum res;
1999  char *pres;
2000  int len;
2001  uint32 hash;
2002 
2003  /*
2004  * Abbreviated key representation is a pass-by-value Datum that is treated
2005  * as a char array by the specialized comparator bttextcmp_abbrev().
2006  */
2007  pres = (char *) &res;
2008  /* memset(), so any non-overwritten bytes are NUL */
2009  memset(pres, 0, sizeof(Datum));
2010  len = VARSIZE_ANY_EXHDR(authoritative);
2011 
2012  /*
2013  * If we're using the C collation, use memcmp(), rather than strxfrm(),
2014  * to abbreviate keys. The full comparator for the C locale is always
2015  * memcmp(), and we can't risk having this give a different answer.
2016  * Besides, this should be faster, too.
2017  */
2018  if (tss->collate_c)
2019  memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2020  else
2021  {
2022  Size bsize;
2023 
2024  /*
2025  * We're not using the C collation, so fall back on strxfrm.
2026  */
2027 
2028  /* By convention, we use buffer 1 to store and NUL-terminate text */
2029  if (len >= tss->buflen1)
2030  {
2031  pfree(tss->buf1);
2032  tss->buflen1 = Max(len + 1, Min(tss->buflen1 * 2, MaxAllocSize));
2033  tss->buf1 = palloc(tss->buflen1);
2034  }
2035 
2036  /* Just like strcoll(), strxfrm() expects a NUL-terminated string */
2037  memcpy(tss->buf1, VARDATA_ANY(authoritative), len);
2038  tss->buf1[len] = '\0';
2039 
2040  /* Don't leak memory here */
2041  if (PointerGetDatum(authoritative) != original)
2042  pfree(authoritative);
2043 
2044  for (;;)
2045  {
2046 #ifdef HAVE_LOCALE_T
2047  if (tss->locale)
2048  bsize = strxfrm_l(tss->buf2, tss->buf1,
2049  tss->buflen2, tss->locale);
2050  else
2051 #endif
2052  bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2);
2053 
2054  if (bsize < tss->buflen2)
2055  break;
2056 
2057  /*
2058  * The C standard states that the contents of the buffer is now
2059  * unspecified. Grow buffer, and retry.
2060  */
2061  pfree(tss->buf2);
2062  tss->buflen2 = Max(bsize + 1,
2063  Min(tss->buflen2 * 2, MaxAllocSize));
2064  tss->buf2 = palloc(tss->buflen2);
2065  }
2066 
2067  /*
2068  * Every Datum byte is always compared. This is safe because the
2069  * strxfrm() blob is itself NUL terminated, leaving no danger of
2070  * misinterpreting any NUL bytes not intended to be interpreted as
2071  * logically representing termination.
2072  */
2073  memcpy(pres, tss->buf2, Min(sizeof(Datum), bsize));
2074  }
2075 
2076  /*
2077  * Maintain approximate cardinality of both abbreviated keys and original,
2078  * authoritative keys using HyperLogLog. Used as cheap insurance against
2079  * the worst case, where we do many string transformations for no saving in
2080  * full strcoll()-based comparisons. These statistics are used by
2081  * bttext_abbrev_abort().
2082  *
2083  * First, Hash key proper, or a significant fraction of it. Mix in length
2084  * in order to compensate for cases where differences are past
2085  * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2086  */
2087  hash = hash_any((unsigned char *) authoritative_data,
2088  Min(len, PG_CACHE_LINE_SIZE));
2089 
2090  if (len > PG_CACHE_LINE_SIZE)
2091  hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2092 
2093  addHyperLogLog(&tss->full_card, hash);
2094 
2095  /* Hash abbreviated key */
2096 #if SIZEOF_DATUM == 8
2097  {
2098  uint32 lohalf,
2099  hihalf;
2100 
2101  lohalf = (uint32) res;
2102  hihalf = (uint32) (res >> 32);
2103  hash = hash_uint32(lohalf ^ hihalf);
2104  }
2105 #else /* SIZEOF_DATUM != 8 */
2106  hash = hash_uint32((uint32) res);
2107 #endif
2108 
2109  addHyperLogLog(&tss->abbr_card, hash);
2110 
2111  return res;
2112 }
2113 
2114 /*
2115  * Callback for estimating effectiveness of abbreviated key optimization, using
2116  * heuristic rules. Returns value indicating if the abbreviation optimization
2117  * should be aborted, based on its projected effectiveness.
2118  */
2119 static bool
2120 bttext_abbrev_abort(int memtupcount, SortSupport ssup)
2121 {
2122  TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra;
2123  double abbrev_distinct, key_distinct;
2124 
2125  Assert(ssup->abbreviate);
2126 
2127  /* Have a little patience */
2128  if (memtupcount < 20)
2129  return false;
2130 
2131  abbrev_distinct = estimateHyperLogLog(&tss->abbr_card);
2132  key_distinct = estimateHyperLogLog(&tss->full_card);
2133 
2134  /*
2135  * Clamp cardinality estimates to at least one distinct value. While NULLs
2136  * are generally disregarded, if only NULL values were seen so far, that
2137  * might misrepresent costs if we failed to clamp.
2138  */
2139  if (abbrev_distinct <= 1.0)
2140  abbrev_distinct = 1.0;
2141 
2142  if (key_distinct <= 1.0)
2143  key_distinct = 1.0;
2144 
2145  /*
2146  * In the worst case all abbreviated keys are identical, while at the same
2147  * time there are differences within full key strings not captured in
2148  * abbreviations.
2149  */
2150 #ifdef DEBUG_ABBREV_KEYS
2151  {
2152  double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2153 
2154  elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f)",
2155  memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card);
2156  }
2157 #endif
2158 
2159  /*
2160  * If the number of distinct abbreviated keys approximately matches the
2161  * number of distinct authoritative original keys, that's reason enough to
2162  * proceed. We can win even with a very low cardinality set if most
2163  * tie-breakers only memcmp(). This is by far the most important
2164  * consideration.
2165  *
2166  * While comparisons that are resolved at the abbreviated key level are
2167  * considerably cheaper than tie-breakers resolved with memcmp(), both of
2168  * those two outcomes are so much cheaper than a full strcoll() once
2169  * sorting is underway that it doesn't seem worth it to weigh abbreviated
2170  * cardinality against the overall size of the set in order to more
2171  * accurately model costs. Assume that an abbreviated comparison, and an
2172  * abbreviated comparison with a cheap memcmp()-based authoritative
2173  * resolution are equivalent.
2174  */
2175  if (abbrev_distinct > key_distinct * 0.05)
2176  return false;
2177 
2178  /*
2179  * Abort abbreviation strategy.
2180  *
2181  * The worst case, where all abbreviated keys are identical while all
2182  * original strings differ will typically only see a regression of about
2183  * 10% in execution time for small to medium sized lists of strings.
2184  * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2185  * often expect very large improvements, particularly with sets of strings
2186  * of moderately high to high abbreviated cardinality. There is little to
2187  * lose but much to gain, which our strategy reflects.
2188  */
2189 #ifdef DEBUG_ABBREV_KEYS
2190  elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f",
2191  memtupcount, abbrev_distinct, key_distinct);
2192  /* Actually abort only when debugging is disabled */
2193  return false;
2194 #endif
2195 
2196  return true;
2197 }
2198 
2199 Datum
2201 {
2202  text *arg1 = PG_GETARG_TEXT_PP(0);
2203  text *arg2 = PG_GETARG_TEXT_PP(1);
2204  text *result;
2205 
2206  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2207 
2208  PG_RETURN_TEXT_P(result);
2209 }
2210 
2211 Datum
2213 {
2214  text *arg1 = PG_GETARG_TEXT_PP(0);
2215  text *arg2 = PG_GETARG_TEXT_PP(1);
2216  text *result;
2217 
2218  result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2219 
2220  PG_RETURN_TEXT_P(result);
2221 }
2222 
2223 
2224 /*
2225  * The following operators support character-by-character comparison
2226  * of text datums, to allow building indexes suitable for LIKE clauses.
2227  * Note that the regular texteq/textne comparison operators are assumed
2228  * to be compatible with these!
2229  */
2230 
2231 static int
2233 {
2234  int result;
2235  int len1,
2236  len2;
2237 
2238  len1 = VARSIZE_ANY_EXHDR(arg1);
2239  len2 = VARSIZE_ANY_EXHDR(arg2);
2240 
2241  result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2242  if (result != 0)
2243  return result;
2244  else if (len1 < len2)
2245  return -1;
2246  else if (len1 > len2)
2247  return 1;
2248  else
2249  return 0;
2250 }
2251 
2252 
2253 Datum
2255 {
2256  text *arg1 = PG_GETARG_TEXT_PP(0);
2257  text *arg2 = PG_GETARG_TEXT_PP(1);
2258  int result;
2259 
2260  result = internal_text_pattern_compare(arg1, arg2);
2261 
2262  PG_FREE_IF_COPY(arg1, 0);
2263  PG_FREE_IF_COPY(arg2, 1);
2264 
2265  PG_RETURN_BOOL(result < 0);
2266 }
2267 
2268 
2269 Datum
2271 {
2272  text *arg1 = PG_GETARG_TEXT_PP(0);
2273  text *arg2 = PG_GETARG_TEXT_PP(1);
2274  int result;
2275 
2276  result = internal_text_pattern_compare(arg1, arg2);
2277 
2278  PG_FREE_IF_COPY(arg1, 0);
2279  PG_FREE_IF_COPY(arg2, 1);
2280 
2281  PG_RETURN_BOOL(result <= 0);
2282 }
2283 
2284 
2285 Datum
2287 {
2288  text *arg1 = PG_GETARG_TEXT_PP(0);
2289  text *arg2 = PG_GETARG_TEXT_PP(1);
2290  int result;
2291 
2292  result = internal_text_pattern_compare(arg1, arg2);
2293 
2294  PG_FREE_IF_COPY(arg1, 0);
2295  PG_FREE_IF_COPY(arg2, 1);
2296 
2297  PG_RETURN_BOOL(result >= 0);
2298 }
2299 
2300 
2301 Datum
2303 {
2304  text *arg1 = PG_GETARG_TEXT_PP(0);
2305  text *arg2 = PG_GETARG_TEXT_PP(1);
2306  int result;
2307 
2308  result = internal_text_pattern_compare(arg1, arg2);
2309 
2310  PG_FREE_IF_COPY(arg1, 0);
2311  PG_FREE_IF_COPY(arg2, 1);
2312 
2313  PG_RETURN_BOOL(result > 0);
2314 }
2315 
2316 
2317 Datum
2319 {
2320  text *arg1 = PG_GETARG_TEXT_PP(0);
2321  text *arg2 = PG_GETARG_TEXT_PP(1);
2322  int result;
2323 
2324  result = internal_text_pattern_compare(arg1, arg2);
2325 
2326  PG_FREE_IF_COPY(arg1, 0);
2327  PG_FREE_IF_COPY(arg2, 1);
2328 
2329  PG_RETURN_INT32(result);
2330 }
2331 
2332 
2333 /*-------------------------------------------------------------
2334  * byteaoctetlen
2335  *
2336  * get the number of bytes contained in an instance of type 'bytea'
2337  *-------------------------------------------------------------
2338  */
/*
 * Takes argument 0 as a raw Datum so that the value does not have to be
 * detoasted.
 *
 * NOTE(review): both the signature line and the statement that computes and
 * returns the length are missing from this listing; restore them from the
 * original file before relying on this block.
 */
2339 Datum
2341 {
2342  Datum str = PG_GETARG_DATUM(0);
2343 
2344  /* We need not detoast the input at all */
2346 }
2347 
2348 /*
2349  * byteacat -
2350  * takes two bytea* and returns a bytea* that is the concatenation of
2351  * the two.
2352  *
2353  * Cloned from textcat and modified as required.
2354  */
/*
 * SQL-callable entry point: fetches both bytea arguments.
 *
 * NOTE(review): the signature line and the statement that produces the
 * result are missing from this listing; presumably the result comes from
 * bytea_catenate(t1, t2) -- confirm against the original file.
 */
2355 Datum
2357 {
2358  bytea *t1 = PG_GETARG_BYTEA_PP(0);
2359  bytea *t2 = PG_GETARG_BYTEA_PP(1);
2360 
2362 }
2363 
2364 /*
2365  * bytea_catenate
2366  * Guts of byteacat(), broken out so it can be used by other functions
2367  *
2368  * Arguments can be in short-header form, but not compressed or out-of-line
2369  */
/*
 * Allocates a new bytea holding the payload bytes of t1 followed by those
 * of t2 and returns it; neither input is modified.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * missing from this listing; the body uses two inputs named t1 and t2.
 */
2370 static bytea *
2372 {
2373  bytea *result;
2374  int len1,
2375  len2,
2376  len;
2377  char *ptr;
2378 
/* Payload sizes, excluding the varlena headers. */
2379  len1 = VARSIZE_ANY_EXHDR(t1);
2380  len2 = VARSIZE_ANY_EXHDR(t2);
2381 
2382  /* paranoia ... probably should throw error instead? */
2383  if (len1 < 0)
2384  len1 = 0;
2385  if (len2 < 0)
2386  len2 = 0;
2387 
/* The result gets a full-size (VARHDRSZ) header whatever the inputs had. */
2388  len = len1 + len2 + VARHDRSZ;
2389  result = (bytea *) palloc(len);
2390 
2391  /* Set size of result string... */
2392  SET_VARSIZE(result, len);
2393 
2394  /* Fill data field of result string... */
2395  ptr = VARDATA(result);
2396  if (len1 > 0)
2397  memcpy(ptr, VARDATA_ANY(t1), len1);
2398  if (len2 > 0)
2399  memcpy(ptr + len1, VARDATA_ANY(t2), len2);
2400 
2401  return result;
2402 }
2403 
/* Build a bytea value from a C string by passing it through byteain(). */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaP( \
		DirectFunctionCall1(byteain, CStringGetDatum(str_)))
2406 
2407 /*
2408  * bytea_substr()
2409  * Return a substring starting at the specified position.
2410  * Cloned from text_substr and modified as required.
2411  *
2412  * Input:
2413  * - string
2414  * - starting position (is one-based)
2415  * - string length (optional)
2416  *
2417  * If the starting position is zero or less, then return from the start of the string
2418  * adjusting the length to be consistent with the "negative start" per SQL.
2419  * If the length is less than zero, an ERROR is thrown. If no third argument
2420  * (length) is provided, the length to the end of the string is assumed.
2421  */
/*
 * SQL-callable wrapper for the three-argument form: passes int32 arguments
 * 1 and 2 (start and length) with the "length not specified" flag false.
 *
 * NOTE(review): the signature line and the first line of the call are
 * missing from this listing; presumably the call is to bytea_substring()
 * with argument 0 as the string -- confirm against the original file.
 */
2422 Datum
2424 {
2426  PG_GETARG_INT32(1),
2427  PG_GETARG_INT32(2),
2428  false));
2429 }
2430 
2431 /*
2432  * bytea_substr_no_len -
2433  * Wrapper to avoid opr_sanity failure due to
2434  * one function accepting a different number of args.
2435  */
/*
 * SQL-callable wrapper for the two-argument form: passes int32 argument 1
 * (start), a placeholder length of -1, and the "length not specified" flag
 * true.
 *
 * NOTE(review): the signature line and the first line of the call are
 * missing from this listing; presumably the call is to bytea_substring()
 * with argument 0 as the string -- confirm against the original file.
 */
2436 Datum
2438 {
2440  PG_GETARG_INT32(1),
2441  -1,
2442  true));
2443 }
2444 
2445 static bytea *
2447  int S,
2448  int L,
2449  bool length_not_specified)
2450 {
2451  int S1; /* adjusted start position */
2452  int L1; /* adjusted substring length */
2453 
2454  S1 = Max(S, 1);
2455 
2456  if (length_not_specified)
2457  {
2458  /*
2459  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
2460  * end of the string if we pass it a negative value for length.
2461  */
2462  L1 = -1;
2463  }
2464  else
2465  {
2466  /* end position */
2467  int E = S + L;
2468 
2469  /*
2470  * A negative value for L is the only way for the end position to be
2471  * before the start. SQL99 says to throw an error.
2472  */
2473  if (E < S)
2474  ereport(ERROR,
2475  (errcode(ERRCODE_SUBSTRING_ERROR),
2476  errmsg("negative substring length not allowed")));
2477 
2478  /*
2479  * A zero or negative value for the end position can happen if the
2480  * start was negative or one. SQL99 says to return a zero-length
2481  * string.
2482  */
2483  if (E < 1)
2484  return PG_STR_GET_BYTEA("");
2485 
2486  L1 = E - S1;
2487  }
2488 
2489  /*
2490  * If the start position is past the end of the string, SQL99 says to
2491  * return a zero-length string -- DatumGetByteaPSlice() will do that for
2492  * us. Convert to zero-based starting position
2493  */
2494  return DatumGetByteaPSlice(str, S1 - 1, L1);
2495 }
2496 
2497 /*
2498  * byteaoverlay
2499  * Replace specified substring of first string with second
2500  *
2501  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
2502  * This code is a direct implementation of what the standard says.
2503  */
/*
 * SQL-callable wrapper: fetches the two bytea arguments plus an explicit
 * start position and length, and returns bytea_overlay()'s result.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 */
2504 Datum
2506 {
2507  bytea *t1 = PG_GETARG_BYTEA_PP(0);
2508  bytea *t2 = PG_GETARG_BYTEA_PP(1);
2509  int sp = PG_GETARG_INT32(2); /* substring start position */
2510  int sl = PG_GETARG_INT32(3); /* substring length */
2511 
2512  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2513 }
2514 
/*
 * SQL-callable wrapper for the form without an explicit length: the length
 * of the replaced span defaults to the payload length of the second
 * argument, then bytea_overlay() does the work.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 */
2515 Datum
2517 {
2518  bytea *t1 = PG_GETARG_BYTEA_PP(0);
2519  bytea *t2 = PG_GETARG_BYTEA_PP(1);
2520  int sp = PG_GETARG_INT32(2); /* substring start position */
2521  int sl;
2522 
2523  sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
2524  PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2525 }
2526 
2527 static bytea *
2528 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
2529 {
2530  bytea *result;
2531  bytea *s1;
2532  bytea *s2;
2533  int sp_pl_sl;
2534 
2535  /*
2536  * Check for possible integer-overflow cases. For negative sp, throw a
2537  * "substring length" error because that's what should be expected
2538  * according to the spec's definition of OVERLAY().
2539  */
2540  if (sp <= 0)
2541  ereport(ERROR,
2542  (errcode(ERRCODE_SUBSTRING_ERROR),
2543  errmsg("negative substring length not allowed")));
2544  sp_pl_sl = sp + sl;
2545  if (sp_pl_sl <= sl)
2546  ereport(ERROR,
2547  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
2548  errmsg("integer out of range")));
2549 
2550  s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
2551  s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
2552  result = bytea_catenate(s1, t2);
2553  result = bytea_catenate(result, s2);
2554 
2555  return result;
2556 }
2557 
2558 /*
2559  * byteapos -
2560  * Return the position of the specified substring.
2561  * Implements the SQL POSITION() function.
2562  * Cloned from textpos and modified as required.
2563  */
/*
 * Searches the first argument for the first occurrence of the second and
 * returns its one-based position as an int32.  Returns 1 when the second
 * argument is empty and 0 when no occurrence is found.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 */
2564 Datum
2566 {
2567  bytea *t1 = PG_GETARG_BYTEA_PP(0);
2568  bytea *t2 = PG_GETARG_BYTEA_PP(1);
2569  int pos;
2570  int px,
2571  p;
2572  int len1,
2573  len2;
2574  char *p1,
2575  *p2;
2576 
2577  len1 = VARSIZE_ANY_EXHDR(t1);
2578  len2 = VARSIZE_ANY_EXHDR(t2);
2579 
2580  if (len2 <= 0)
2581  PG_RETURN_INT32(1); /* result for empty pattern */
2582 
2583  p1 = VARDATA_ANY(t1);
2584  p2 = VARDATA_ANY(t2);
2585 
2586  pos = 0;
/* px is the last offset at which a full-length match can still start. */
2587  px = (len1 - len2);
2588  for (p = 0; p <= px; p++)
2589  {
/* Compare the first byte before paying for the full memcmp. */
2590  if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
2591  {
2592  pos = p + 1;
2593  break;
2594  };
2595  p1++;
2596  };
2597 
2598  PG_RETURN_INT32(pos);
2599 }
2600 
2601 /*-------------------------------------------------------------
2602  * byteaGetByte
2603  *
2604  * this routine treats "bytea" as an array of bytes.
2605  * It returns the Nth byte (a number between 0 and 255).
2606  *-------------------------------------------------------------
2607  */
/*
 * Returns, as an int32, the byte at zero-based index n of the bytea
 * argument.  Raises ERRCODE_ARRAY_SUBSCRIPT_ERROR when n is negative or not
 * less than the payload length.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 */
2608 Datum
2610 {
2611  bytea *v = PG_GETARG_BYTEA_PP(0);
2612  int32 n = PG_GETARG_INT32(1);
2613  int len;
2614  int byte;
2615 
2616  len = VARSIZE_ANY_EXHDR(v);
2617 
2618  if (n < 0 || n >= len)
2619  ereport(ERROR,
2620  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2621  errmsg("index %d out of valid range, 0..%d",
2622  n, len - 1)));
2623 
/* Read through unsigned char so the result is in 0..255, never negative. */
2624  byte = ((unsigned char *) VARDATA_ANY(v))[n];
2625 
2626  PG_RETURN_INT32(byte);
2627 }
2628 
2629 /*-------------------------------------------------------------
2630  * byteaGetBit
2631  *
2632  * This routine treats a "bytea" type like an array of bits.
2633  * It returns the value of the Nth bit (0 or 1).
2634  *
2635  *-------------------------------------------------------------
2636  */
/*
 * Returns 1 or 0 for bit n of the bytea argument.  Bit n lives in byte
 * n / 8, at bit position n % 8 counted from the least significant bit of
 * that byte.  Raises ERRCODE_ARRAY_SUBSCRIPT_ERROR when n is negative or not
 * less than eight times the payload length.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 * NOTE(review): "len * 8" is computed in int; confirm that len can never be
 * large enough for that product to overflow, or compare n / 8 against len
 * instead.
 */
2637 Datum
2639 {
2640  bytea *v = PG_GETARG_BYTEA_PP(0);
2641  int32 n = PG_GETARG_INT32(1);
2642  int byteNo,
2643  bitNo;
2644  int len;
2645  int byte;
2646 
2647  len = VARSIZE_ANY_EXHDR(v);
2648 
2649  if (n < 0 || n >= len * 8)
2650  ereport(ERROR,
2651  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2652  errmsg("index %d out of valid range, 0..%d",
2653  n, len * 8 - 1)));
2654 
2655  byteNo = n / 8;
2656  bitNo = n % 8;
2657 
2658  byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
2659 
2660  if (byte & (1 << bitNo))
2661  PG_RETURN_INT32(1);
2662  else
2663  PG_RETURN_INT32(0);
2664 }
2665 
2666 /*-------------------------------------------------------------
2667  * byteaSetByte
2668  *
2669  * Given an instance of type 'bytea' creates a new one with
2670  * the Nth byte set to the given value.
2671  *
2672  *-------------------------------------------------------------
2673  */
/*
 * Returns a newly allocated copy of the bytea argument in which the byte at
 * zero-based index n has been replaced by newByte; the input is left
 * untouched.  Raises ERRCODE_ARRAY_SUBSCRIPT_ERROR when n is negative or not
 * less than the payload length.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 */
2674 Datum
2676 {
2677  bytea *v = PG_GETARG_BYTEA_P(0);
2678  int32 n = PG_GETARG_INT32(1);
2679  int32 newByte = PG_GETARG_INT32(2);
2680  int len;
2681  bytea *res;
2682 
2683  len = VARSIZE(v) - VARHDRSZ;
2684 
2685  if (n < 0 || n >= len)
2686  ereport(ERROR,
2687  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2688  errmsg("index %d out of valid range, 0..%d",
2689  n, len - 1)));
2690 
2691  /*
2692  * Make a copy of the original varlena.
2693  */
2694  res = (bytea *) palloc(VARSIZE(v));
2695  memcpy((char *) res, (char *) v, VARSIZE(v));
2696 
2697  /*
2698  * Now set the byte.
2699  */
/* newByte is an int32; the store through unsigned char keeps its low byte. */
2700  ((unsigned char *) VARDATA(res))[n] = newByte;
2701 
2702  PG_RETURN_BYTEA_P(res);
2703 }
2704 
2705 /*-------------------------------------------------------------
2706  * byteaSetBit
2707  *
2708  * Given an instance of type 'bytea' creates a new one with
2709  * the Nth bit set to the given value.
2710  *
2711  *-------------------------------------------------------------
2712  */
/*
 * Returns a newly allocated copy of the bytea argument in which bit n has
 * been set to newBit; the input is left untouched.  Bit n lives in byte
 * n / 8, at bit position n % 8 counted from the least significant bit.
 * Raises ERRCODE_ARRAY_SUBSCRIPT_ERROR for an out-of-range n and
 * ERRCODE_INVALID_PARAMETER_VALUE when newBit is neither 0 nor 1.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 * NOTE(review): "len * 8" is computed in int; confirm that len can never be
 * large enough for that product to overflow, or compare n / 8 against len
 * instead.
 */
2713 Datum
2715 {
2716  bytea *v = PG_GETARG_BYTEA_P(0);
2717  int32 n = PG_GETARG_INT32(1);
2718  int32 newBit = PG_GETARG_INT32(2);
2719  bytea *res;
2720  int len;
2721  int oldByte,
2722  newByte;
2723  int byteNo,
2724  bitNo;
2725 
2726  len = VARSIZE(v) - VARHDRSZ;
2727 
2728  if (n < 0 || n >= len * 8)
2729  ereport(ERROR,
2730  (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2731  errmsg("index %d out of valid range, 0..%d",
2732  n, len * 8 - 1)));
2733 
2734  byteNo = n / 8;
2735  bitNo = n % 8;
2736 
2737  /*
2738  * sanity check!
2739  */
2740  if (newBit != 0 && newBit != 1)
2741  ereport(ERROR,
2742  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2743  errmsg("new bit must be 0 or 1")));
2744 
2745  /*
2746  * Make a copy of the original varlena.
2747  */
2748  res = (bytea *) palloc(VARSIZE(v));
2749  memcpy((char *) res, (char *) v, VARSIZE(v));
2750 
2751  /*
2752  * Update the byte.
2753  */
2754  oldByte = ((unsigned char *) VARDATA(res))[byteNo];
2755 
2756  if (newBit == 0)
2757  newByte = oldByte & (~(1 << bitNo));
2758  else
2759  newByte = oldByte | (1 << bitNo);
2760 
2761  ((unsigned char *) VARDATA(res))[byteNo] = newByte;
2762 
2763  PG_RETURN_BYTEA_P(res);
2764 }
2765 
2766 
2767 /* text_name()
2768  * Converts a text type to a Name type.
2769  */
/*
 * Copies the text argument into a freshly zero-filled NAMEDATALEN-byte Name
 * and returns it.  Input of NAMEDATALEN bytes or more is first clipped with
 * pg_mbcliplen() to at most NAMEDATALEN - 1 bytes.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 */
2770 Datum
2772 {
2773  text *s = PG_GETARG_TEXT_PP(0);
2774  Name result;
2775  int len;
2776 
2777  len = VARSIZE_ANY_EXHDR(s);
2778 
2779  /* Truncate oversize input */
2780  if (len >= NAMEDATALEN)
2781  len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
2782 
2783  /* We use palloc0 here to ensure result is zero-padded */
2784  result = (Name) palloc0(NAMEDATALEN);
2785  memcpy(NameStr(*result), VARDATA_ANY(s), len);
2786 
2787  PG_RETURN_NAME(result);
2788 }
2789 
2790 /* name_text()
2791  * Converts a Name type to a text type.
2792  */
/*
 * Fetches argument 0 as a Name.
 *
 * NOTE(review): the signature line and the statement that builds and
 * returns the text result are missing from this listing; restore them from
 * the original file before relying on this block.
 */
2793 Datum
2795 {
2796  Name s = PG_GETARG_NAME(0);
2797 
2799 }
2800 
2801 
2802 /*
2803  * textToQualifiedNameList - convert a text object to list of names
2804  *
2805  * This implements the input parsing needed by nextval() and other
2806  * functions that take a text parameter representing a qualified name.
2807  * We split the name at dots, downcase if not double-quoted, and
2808  * truncate names if they're too long.
2809  */
/*
 * Returns a new List of String nodes, one per identifier found in textval
 * after splitting at '.' with SplitIdentifierString().  Raises
 * ERRCODE_INVALID_NAME if the split fails or yields no names.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing; the body reads a parameter named textval.
 */
2810 List *
2812 {
2813  char *rawname;
2814  List *result = NIL;
2815  List *namelist;
2816  ListCell *l;
2817 
2818  /* Convert to C string (handles possible detoasting). */
2819  /* Note we rely on being able to modify rawname below. */
2820  rawname = text_to_cstring(textval);
2821 
2822  if (!SplitIdentifierString(rawname, '.', &namelist))
2823  ereport(ERROR,
2824  (errcode(ERRCODE_INVALID_NAME),
2825  errmsg("invalid name syntax")));
2826 
2827  if (namelist == NIL)
2828  ereport(ERROR,
2829  (errcode(ERRCODE_INVALID_NAME),
2830  errmsg("invalid name syntax")));
2831 
2832  foreach(l, namelist)
2833  {
2834  char *curname = (char *) lfirst(l);
2835 
/* namelist entries point into rawname, which is freed below, so copy. */
2836  result = lappend(result, makeString(pstrdup(curname)));
2837  }
2838 
2839  pfree(rawname);
2840  list_free(namelist);
2841 
2842  return result;
2843 }
2844 
2845 /*
2846  * SplitIdentifierString --- parse a string containing identifiers
2847  *
2848  * This is the guts of textToQualifiedNameList, and is exported for use in
2849  * other situations such as parsing GUC variables. In the GUC case, it's
2850  * important to avoid memory leaks, so the API is designed to minimize the
2851  * amount of stuff that needs to be allocated and freed.
2852  *
2853  * Inputs:
2854  * rawstring: the input string; must be overwritable! On return, it's
2855  * been modified to contain the separated identifiers.
2856  * separator: the separator punctuation expected between identifiers
2857  * (typically '.' or ','). Whitespace may also appear around
2858  * identifiers.
2859  * Outputs:
2860  * namelist: filled with a palloc'd list of pointers to identifiers within
2861  * rawstring. Caller should list_free() this even on error return.
2862  *
2863  * Returns TRUE if okay, FALSE if there is a syntax error in the string.
2864  *
2865  * Note that an empty string is considered okay here, though not in
2866  * textToQualifiedNameList.
2867  */
/*
 * Implementation notes: the parse is done in place.  Each list entry is a
 * pointer into rawstring, and each identifier is terminated by overwriting
 * the character just past it with '\0' -- but only after that character has
 * been examined as a possible separator.
 */
2868 bool
2869 SplitIdentifierString(char *rawstring, char separator,
2870  List **namelist)
2871 {
2872  char *nextp = rawstring;
2873  bool done = false;
2874 
2875  *namelist = NIL;
2876 
2877  while (isspace((unsigned char) *nextp))
2878  nextp++; /* skip leading whitespace */
2879 
2880  if (*nextp == '\0')
2881  return true; /* allow empty string */
2882 
2883  /* At the top of the loop, we are at start of a new identifier. */
2884  do
2885  {
2886  char *curname;
2887  char *endp;
2888 
2889  if (*nextp == '\"')
2890  {
2891  /* Quoted name --- collapse quote-quote pairs, no downcasing */
2892  curname = nextp + 1;
2893  for (;;)
2894  {
2895  endp = strchr(nextp + 1, '\"');
2896  if (endp == NULL)
2897  return false; /* mismatched quotes */
2898  if (endp[1] != '\"')
2899  break; /* found end of quoted name */
2900  /* Collapse adjacent quotes into one quote, and look again */
/* strlen(endp) bytes from endp + 1 include the trailing NUL. */
2901  memmove(endp, endp + 1, strlen(endp));
2902  nextp = endp;
2903  }
2904  /* endp now points at the terminating quote */
2905  nextp = endp + 1;
2906  }
2907  else
2908  {
2909  /* Unquoted name --- extends to separator or whitespace */
2910  char *downname;
2911  int len;
2912 
2913  curname = nextp;
2914  while (*nextp && *nextp != separator &&
2915  !isspace((unsigned char) *nextp))
2916  nextp++;
2917  endp = nextp;
2918  if (curname == nextp)
2919  return false; /* empty unquoted name not allowed */
2920 
2921  /*
2922  * Downcase the identifier, using same code as main lexer does.
2923  *
2924  * XXX because we want to overwrite the input in-place, we cannot
2925  * support a downcasing transformation that increases the string
2926  * length. This is not a problem given the current implementation
2927  * of downcase_truncate_identifier, but we'll probably have to do
2928  * something about this someday.
2929  */
2930  len = endp - curname;
2931  downname = downcase_truncate_identifier(curname, len, false);
2932  Assert(strlen(downname) <= len);
/*
 * strncpy writes exactly len bytes: it NUL-pads when downname is shorter
 * and never writes past curname[len - 1], so the character following the
 * identifier is still intact for the separator check below.
 */
2933  strncpy(curname, downname, len); /* strncpy is required here */
2934  pfree(downname);
2935  }
2936 
2937  while (isspace((unsigned char) *nextp))
2938  nextp++; /* skip trailing whitespace */
2939 
2940  if (*nextp == separator)
2941  {
2942  nextp++;
2943  while (isspace((unsigned char) *nextp))
2944  nextp++; /* skip leading whitespace for next */
2945  /* we expect another name, so done remains false */
2946  }
2947  else if (*nextp == '\0')
2948  done = true;
2949  else
2950  return false; /* invalid syntax */
2951 
2952  /* Now safe to overwrite separator with a null */
2953  *endp = '\0';
2954 
2955  /* Truncate name if it's overlength */
2956  truncate_identifier(curname, strlen(curname), false);
2957 
2958  /*
2959  * Finished isolating current name --- add it to list
2960  */
2961  *namelist = lappend(*namelist, curname);
2962 
2963  /* Loop back if we didn't reach end of string */
2964  } while (!done);
2965 
2966  return true;
2967 }
2968 
2969 
2970 /*
2971  * SplitDirectoriesString --- parse a string containing directory names
2972  *
2973  * This is similar to SplitIdentifierString, except that the parsing
2974  * rules are meant to handle pathnames instead of identifiers: there is
2975  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
2976  * and we apply canonicalize_path() to each extracted string. Because of the
2977  * last, the returned strings are separately palloc'd rather than being
2978  * pointers into rawstring --- but we still scribble on rawstring.
2979  *
2980  * Inputs:
2981  * rawstring: the input string; must be modifiable!
2982  * separator: the separator punctuation expected between directories
2983  * (typically ',' or ';'). Whitespace may also appear around
2984  * directories.
2985  * Outputs:
2986  * namelist: filled with a palloc'd list of directory names.
2987  * Caller should list_free_deep() this even on error return.
2988  *
2989  * Returns TRUE if okay, FALSE if there is a syntax error in the string.
2990  *
2991  * Note that an empty string is considered okay here.
2992  */
/*
 * Implementation notes: rawstring is scribbled on while parsing (quote
 * pairs are collapsed and terminators written in place), but every list
 * entry is a separate pstrdup'd copy that has been through
 * canonicalize_path().
 */
2993 bool
2994 SplitDirectoriesString(char *rawstring, char separator,
2995  List **namelist)
2996 {
2997  char *nextp = rawstring;
2998  bool done = false;
2999 
3000  *namelist = NIL;
3001 
3002  while (isspace((unsigned char) *nextp))
3003  nextp++; /* skip leading whitespace */
3004 
3005  if (*nextp == '\0')
3006  return true; /* allow empty string */
3007 
3008  /* At the top of the loop, we are at start of a new directory. */
3009  do
3010  {
3011  char *curname;
3012  char *endp;
3013 
3014  if (*nextp == '\"')
3015  {
3016  /* Quoted name --- collapse quote-quote pairs */
3017  curname = nextp + 1;
3018  for (;;)
3019  {
3020  endp = strchr(nextp + 1, '\"');
3021  if (endp == NULL)
3022  return false; /* mismatched quotes */
3023  if (endp[1] != '\"')
3024  break; /* found end of quoted name */
3025  /* Collapse adjacent quotes into one quote, and look again */
/* strlen(endp) bytes from endp + 1 include the trailing NUL. */
3026  memmove(endp, endp + 1, strlen(endp));
3027  nextp = endp;
3028  }
3029  /* endp now points at the terminating quote */
3030  nextp = endp + 1;
3031  }
3032  else
3033  {
3034  /* Unquoted name --- extends to separator or end of string */
3035  curname = endp = nextp;
3036  while (*nextp && *nextp != separator)
3037  {
3038  /* trailing whitespace should not be included in name */
/* endp tracks one past the last non-space char; inner spaces are kept. */
3039  if (!isspace((unsigned char) *nextp))
3040  endp = nextp + 1;
3041  nextp++;
3042  }
3043  if (curname == endp)
3044  return false; /* empty unquoted name not allowed */
3045  }
3046 
3047  while (isspace((unsigned char) *nextp))
3048  nextp++; /* skip trailing whitespace */
3049 
3050  if (*nextp == separator)
3051  {
3052  nextp++;
3053  while (isspace((unsigned char) *nextp))
3054  nextp++; /* skip leading whitespace for next */
3055  /* we expect another name, so done remains false */
3056  }
3057  else if (*nextp == '\0')
3058  done = true;
3059  else
3060  return false; /* invalid syntax */
3061 
3062  /* Now safe to overwrite separator with a null */
3063  *endp = '\0';
3064 
3065  /* Truncate path if it's overlength */
/* NOTE(review): the cut is at a byte offset; confirm a multibyte character cannot be split here. */
3066  if (strlen(curname) >= MAXPGPATH)
3067  curname[MAXPGPATH - 1] = '\0';
3068 
3069  /*
3070  * Finished isolating current name --- add it to list
3071  */
3072  curname = pstrdup(curname);
3073  canonicalize_path(curname);
3074  *namelist = lappend(*namelist, curname);
3075 
3076  /* Loop back if we didn't reach end of string */
3077  } while (!done);
3078 
3079  return true;
3080 }
3081 
3082 
3083 /*****************************************************************************
3084  * Comparison Functions used for bytea
3085  *
3086  * Note: btree indexes need these routines not to leak memory; therefore,
3087  * be careful to free working copies of toasted datums. Most places don't
3088  * need to be so careful.
3089  *****************************************************************************/
3090 
/*
 * Equality test on two bytea arguments.  The sizes reported by
 * toast_raw_datum_size() are compared first; only when they match are the
 * values fetched and their payloads compared with memcmp().
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 */
3091 Datum
3093 {
3094  Datum arg1 = PG_GETARG_DATUM(0);
3095  Datum arg2 = PG_GETARG_DATUM(1);
3096  bool result;
3097  Size len1,
3098  len2;
3099 
3100  /*
3101  * We can use a fast path for unequal lengths, which might save us from
3102  * having to detoast one or both values.
3103  */
3104  len1 = toast_raw_datum_size(arg1);
3105  len2 = toast_raw_datum_size(arg2);
3106  if (len1 != len2)
3107  result = false;
3108  else
3109  {
3110  bytea *barg1 = DatumGetByteaPP(arg1);
3111  bytea *barg2 = DatumGetByteaPP(arg2);
3112 
/* len1 is reduced by VARHDRSZ to get the number of payload bytes. */
3113  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3114  len1 - VARHDRSZ) == 0);
3115 
3116  PG_FREE_IF_COPY(barg1, 0);
3117  PG_FREE_IF_COPY(barg2, 1);
3118  }
3119 
3120  PG_RETURN_BOOL(result);
3121 }
3122 
/*
 * Inequality test on two bytea arguments.  Values whose sizes (as reported
 * by toast_raw_datum_size()) differ are unequal without further work;
 * otherwise the values are fetched and their payloads compared with
 * memcmp().
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 */
3123 Datum
3125 {
3126  Datum arg1 = PG_GETARG_DATUM(0);
3127  Datum arg2 = PG_GETARG_DATUM(1);
3128  bool result;
3129  Size len1,
3130  len2;
3131 
3132  /*
3133  * We can use a fast path for unequal lengths, which might save us from
3134  * having to detoast one or both values.
3135  */
3136  len1 = toast_raw_datum_size(arg1);
3137  len2 = toast_raw_datum_size(arg2);
3138  if (len1 != len2)
3139  result = true;
3140  else
3141  {
3142  bytea *barg1 = DatumGetByteaPP(arg1);
3143  bytea *barg2 = DatumGetByteaPP(arg2);
3144 
/* len1 is reduced by VARHDRSZ to get the number of payload bytes. */
3145  result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3146  len1 - VARHDRSZ) != 0);
3147 
3148  PG_FREE_IF_COPY(barg1, 0);
3149  PG_FREE_IF_COPY(barg2, 1);
3150  }
3151 
3152  PG_RETURN_BOOL(result);
3153 }
3154 
/*
 * "Less than" test on two bytea arguments: the common-length prefix is
 * compared with memcmp(); on a tie the shorter value sorts first.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 */
3155 Datum
3157 {
3158  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3159  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3160  int len1,
3161  len2;
3162  int cmp;
3163 
3164  len1 = VARSIZE_ANY_EXHDR(arg1);
3165  len2 = VARSIZE_ANY_EXHDR(arg2);
3166 
3167  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3168 
/* Only cmp, len1 and len2 are used below, so the copies can go now. */
3169  PG_FREE_IF_COPY(arg1, 0);
3170  PG_FREE_IF_COPY(arg2, 1);
3171 
3172  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3173 }
3174 
/*
 * "Less than or equal" test on two bytea arguments: the common-length
 * prefix is compared with memcmp(); on a tie the result is true when the
 * first value is not longer than the second.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 */
3175 Datum
3177 {
3178  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3179  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3180  int len1,
3181  len2;
3182  int cmp;
3183 
3184  len1 = VARSIZE_ANY_EXHDR(arg1);
3185  len2 = VARSIZE_ANY_EXHDR(arg2);
3186 
3187  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3188 
/* Only cmp, len1 and len2 are used below, so the copies can go now. */
3189  PG_FREE_IF_COPY(arg1, 0);
3190  PG_FREE_IF_COPY(arg2, 1);
3191 
3192  PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3193 }
3194 
/*
 * "Greater than" test on two bytea arguments: the common-length prefix is
 * compared with memcmp(); on a tie the longer value sorts last.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 */
3195 Datum
3197 {
3198  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3199  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3200  int len1,
3201  len2;
3202  int cmp;
3203 
3204  len1 = VARSIZE_ANY_EXHDR(arg1);
3205  len2 = VARSIZE_ANY_EXHDR(arg2);
3206 
3207  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3208 
/* Only cmp, len1 and len2 are used below, so the copies can go now. */
3209  PG_FREE_IF_COPY(arg1, 0);
3210  PG_FREE_IF_COPY(arg2, 1);
3211 
3212  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3213 }
3214 
/*
 * "Greater than or equal" test on two bytea arguments: the common-length
 * prefix is compared with memcmp(); on a tie the result is true when the
 * first value is not shorter than the second.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 */
3215 Datum
3217 {
3218  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3219  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3220  int len1,
3221  len2;
3222  int cmp;
3223 
3224  len1 = VARSIZE_ANY_EXHDR(arg1);
3225  len2 = VARSIZE_ANY_EXHDR(arg2);
3226 
3227  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3228 
/* Only cmp, len1 and len2 are used below, so the copies can go now. */
3229  PG_FREE_IF_COPY(arg1, 0);
3230  PG_FREE_IF_COPY(arg2, 1);
3231 
3232  PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3233 }
3234 
/*
 * Three-way comparison of two bytea arguments.  Returns memcmp()'s result
 * for the common-length prefix when it is nonzero; on a prefix tie returns
 * -1 or 1 according to which value is shorter, or 0 when the lengths match.
 *
 * NOTE(review): the line carrying this function's name and parameter list
 * is missing from this listing.
 */
3235 Datum
3237 {
3238  bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3239  bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3240  int len1,
3241  len2;
3242  int cmp;
3243 
3244  len1 = VARSIZE_ANY_EXHDR(arg1);
3245  len2 = VARSIZE_ANY_EXHDR(arg2);
3246 
3247  cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
/* Equal prefixes: the shorter value sorts first. */
3248  if ((cmp == 0) && (len1 != len2))
3249  cmp = (len1 < len2) ? -1 : 1;
3250 
3251  PG_FREE_IF_COPY(arg1, 0);
3252  PG_FREE_IF_COPY(arg2, 1);
3253 
3254  PG_RETURN_INT32(cmp);
3255 }
3256 
3257 /*
3258  * appendStringInfoText
3259  *
3260  * Append a text to str.
3261  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3262  */
/*
 * NOTE(review): the signature line and the single body statement are
 * missing from this listing; callers in this file invoke it as
 * appendStringInfoText(&str, to_sub_text).  Restore both lines from the
 * original file before relying on this block.
 */
3263 static void
3265 {
3267 }
3268 
3269 /*
3270  * replace_text
3271  * replace all occurrences of 'old_sub_str' in 'orig_str'
3272  * with 'new_sub_str' to form 'new_str'
3273  *
3274  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
3275  * otherwise returns 'new_str'
3276  */
/*
 * SQL-callable function taking three text arguments (source, substring to
 * find, replacement).  Matches are located with text_position_setup() /
 * text_position_next() and the output is assembled in a StringInfo.  The
 * source argument itself is returned when either the source or the search
 * string is empty, or when no match is found.
 *
 * NOTE(review): the signature line, the declaration of "state", and one
 * statement at the top of the do-loop are missing from this listing.
 */
3277 Datum
3279 {
3280  text *src_text = PG_GETARG_TEXT_PP(0);
3281  text *from_sub_text = PG_GETARG_TEXT_PP(1);
3282  text *to_sub_text = PG_GETARG_TEXT_PP(2);
3283  int src_text_len;
3284  int from_sub_text_len;
3286  text *ret_text;
3287  int start_posn;
3288  int curr_posn;
3289  int chunk_len;
3290  char *start_ptr;
3291  StringInfoData str;
3292 
3293  text_position_setup(src_text, from_sub_text, &state);
3294 
3295  /*
3296  * Note: we check the converted string length, not the original, because
3297  * they could be different if the input contained invalid encoding.
3298  */
3299  src_text_len = state.len1;
3300  from_sub_text_len = state.len2;
3301 
3302  /* Return unmodified source string if empty source or pattern */
3303  if (src_text_len < 1 || from_sub_text_len < 1)
3304  {
3305  text_position_cleanup(&state);
3306  PG_RETURN_TEXT_P(src_text);
3307  }
3308 
3309  start_posn = 1;
3310  curr_posn = text_position_next(1, &state);
3311 
3312  /* When the from_sub_text is not found, there is nothing to do. */
3313  if (curr_posn == 0)
3314  {
3315  text_position_cleanup(&state);
3316  PG_RETURN_TEXT_P(src_text);
3317  }
3318 
3319  /* start_ptr points to the start_posn'th character of src_text */
3320  start_ptr = VARDATA_ANY(src_text);
3321 
3322  initStringInfo(&str);
3323 
3324  do
3325  {
3327 
3328  /* copy the data skipped over by last text_position_next() */
3329  chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn);
3330  appendBinaryStringInfo(&str, start_ptr, chunk_len);
3331 
3332  appendStringInfoText(&str, to_sub_text);
3333 
/*
 * Step both cursors past the unmatched chunk and the match itself:
 * start_posn counts characters, start_ptr counts bytes.
 */
3334  start_posn = curr_posn;
3335  start_ptr += chunk_len;
3336  start_posn += from_sub_text_len;
3337  start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len);
3338 
3339  curr_posn = text_position_next(start_posn, &state);
3340  }
3341  while (curr_posn > 0);
3342 
3343  /* copy trailing data */
3344  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
3345  appendBinaryStringInfo(&str, start_ptr, chunk_len);
3346 
3347  text_position_cleanup(&state);
3348 
3349  ret_text = cstring_to_text_with_len(str.data, str.len);
3350  pfree(str.data);
3351 
3352  PG_RETURN_TEXT_P(ret_text);
3353 }
3354 
3355 /*
3356  * check_replace_text_has_escape_char
3357  *
3358  * check whether replace_text contains escape char.
3359  */
/*
 * Scans the payload of replace_text and returns true as soon as a backslash
 * is found, false if the end is reached without one.  One branch steps a
 * byte at a time, the other a character at a time via pg_mblen().
 *
 * NOTE(review): the signature line and the "if" condition that selects
 * between the two loops are missing from this listing; presumably the
 * byte-wise loop is used for single-byte encodings -- confirm against the
 * original file.
 */
3360 static bool
3362 {
3363  const char *p = VARDATA_ANY(replace_text);
3364  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3365 
3367  {
3368  for (; p < p_end; p++)
3369  {
3370  if (*p == '\\')
3371  return true;
3372  }
3373  }
3374  else
3375  {
3376  for (; p < p_end; p += pg_mblen(p))
3377  {
3378  if (*p == '\\')
3379  return true;
3380  }
3381  }
3382 
3383  return false;
3384 }
3385 
3386 /*
3387  * appendStringInfoRegexpSubstr
3388  *
3389  * Append replace_text to str, substituting regexp back references for
3390  * \n escapes. start_ptr is the start of the match in the source string,
3391  * at logical character position data_pos.
3392  */
/*
 * Escapes recognised in replace_text: \1 .. \9 (the corresponding pmatch[]
 * entry), \& (pmatch[0], the whole match) and \\ (one literal backslash).
 * A backslash followed by anything else, or ending the input, is copied to
 * the output as a literal backslash.
 *
 * NOTE(review): the first line of the parameter list (function name and
 * leading parameters) is missing from this listing; the body uses "str" and
 * "replace_text" in addition to the parameters visible below.
 */
3393 static void
3395  regmatch_t *pmatch,
3396  char *start_ptr, int data_pos)
3397 {
3398  const char *p = VARDATA_ANY(replace_text);
3399  const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3400  int eml = pg_database_encoding_max_length();
3401 
3402  for (;;)
3403  {
3404  const char *chunk_start = p;
3405  int so;
3406  int eo;
3407 
3408  /* Find next escape char. */
3409  if (eml == 1)
3410  {
3411  for (; p < p_end && *p != '\\'; p++)
3412  /* nothing */ ;
3413  }
3414  else
3415  {
3416  for (; p < p_end && *p != '\\'; p += pg_mblen(p))
3417  /* nothing */ ;
3418  }
3419 
3420  /* Copy the text we just scanned over, if any. */
3421  if (p > chunk_start)
3422  appendBinaryStringInfo(str, chunk_start, p - chunk_start);
3423 
3424  /* Done if at end of string, else advance over escape char. */
3425  if (p >= p_end)
3426  break;
3427  p++;
3428 
3429  if (p >= p_end)
3430  {
3431  /* Escape at very end of input. Treat same as unexpected char */
3432  appendStringInfoChar(str, '\\');
3433  break;
3434  }
3435 
3436  if (*p >= '1' && *p <= '9')
3437  {
3438  /* Use the back reference of regexp. */
3439  int idx = *p - '0';
3440 
3441  so = pmatch[idx].rm_so;
3442  eo = pmatch[idx].rm_eo;
3443  p++;
3444  }
3445  else if (*p == '&')
3446  {
3447  /* Use the entire matched string. */
3448  so = pmatch[0].rm_so;
3449  eo = pmatch[0].rm_eo;
3450  p++;
3451  }
3452  else if (*p == '\\')
3453  {
3454  /* \\ means transfer one \ to output. */
3455  appendStringInfoChar(str, '\\');
3456  p++;
3457  continue;
3458  }
3459  else
3460  {
3461  /*
3462  * If escape char is not followed by any expected char, just treat
3463  * it as ordinary data to copy. (XXX would it be better to throw
3464  * an error?)
3465  */
/* p is not advanced: the next pass copies the following char as data. */
3466  appendStringInfoChar(str, '\\');
3467  continue;
3468  }
3469 
/* Both offsets must differ from -1, otherwise nothing is copied. */
3470  if (so != -1 && eo != -1)
3471  {
3472  /*
3473  * Copy the text that is back reference of regexp. Note so and eo
3474  * are counted in characters not bytes.
3475  */
3476  char *chunk_start;
3477  int chunk_len;
3478 
3479  Assert(so >= data_pos);
3480  chunk_start = start_ptr;
3481  chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
3482  chunk_len = charlen_to_bytelen(chunk_start, eo - so);
3483  appendBinaryStringInfo(str, chunk_start, chunk_len);
3484  }
3485  }
3486 }
3487 
/*
 * NOTE(review): no use of this constant is visible in this listing;
 * presumably it sizes the regmatch_t array covering the whole match plus
 * back-references \1 .. \9 -- confirm against the original file.
 */
3488 #define REGEXP_REPLACE_BACKREF_CNT 10
3489 
3490 /*
3491  * replace_text_regexp
3492  *
3493  * replace text that matches to regexp in src_text to replace_text.
3494  *
3495  * Note: to avoid having to include regex.h in builtins.h, we declare
3496  * the regexp argument as void *, but really it's regex_t *.
3497  */
3498 text *
3499 replace_text_regexp(text *src_text, void *regexp,
3500  text *replace_text, bool glob)
3501 {
3502  text *ret_text;
3503  regex_t *re = (regex_t *) regexp;
3504  int src_text_len = VARSIZE_ANY_EXHDR(src_text);
3507  pg_wchar *data;
3508  size_t data_len;
3509  int search_start;
3510  int data_pos;
3511  char *start_ptr;
3512  bool have_escape;
3513 
3514  initStringInfo(&buf);
3515 
3516  /* Convert data string to wide characters. */
3517  data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
3518  data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
3519 
3520  /* Check whether replace_text has escape char. */
3521  have_escape = check_replace_text_has_escape_char(replace_text);
3522 
3523  /* start_ptr points to the data_pos'th character of src_text */
3524  start_ptr = (char *) VARDATA_ANY(src_text);
3525  data_pos = 0;
3526 
3527  search_start = 0;
3528  while (search_start <= data_len)
3529  {
3530  int regexec_result;
3531 
3533 
3534  regexec_result = pg_regexec(re,
3535  data,
3536  data_len,
3537  search_start,
3538  NULL, /* no details */
3540  pmatch,
3541  0);
3542 
3543  if (regexec_result == REG_NOMATCH)
3544  break;
3545 
3546  if (regexec_result != REG_OKAY)
3547  {
3548  char errMsg[100];
3549 
3551  pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
3552  ereport(ERROR,
3553  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
3554  errmsg("regular expression failed: %s", errMsg)));
3555  }
3556 
3557  /*
3558  * Copy the text to the left of the match position. Note we are given
3559  * character not byte indexes.
3560  */
3561  if (pmatch[0].rm_so - data_pos > 0)
3562  {
3563  int chunk_len;
3564 
3565  chunk_len = charlen_to_bytelen(start_ptr,
3566  pmatch[0].rm_so - data_pos);
3567  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
3568 
3569  /*
3570  * Advance start_ptr over that text, to avoid multiple rescans of
3571  * it if the replace_text contains multiple back-references.
3572  */
3573  start_ptr += chunk_len;
3574  data_pos = pmatch[0].rm_so;
3575  }
3576 
3577  /*
3578  * Copy the replace_text. Process back references when the
3579  * replace_text has escape characters.
3580  */
3581  if (have_escape)
3582  appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
3583  start_ptr, data_pos);
3584  else
3585  appendStringInfoText(&buf, replace_text);
3586 
3587  /* Advance start_ptr and data_pos over the matched text. */
3588  start_ptr += charlen_to_bytelen(start_ptr,
3589  pmatch[0].rm_eo - data_pos);
3590  data_pos = pmatch[0].rm_eo;
3591 
3592  /*
3593  * When global option is off, replace the first instance only.
3594  */
3595  if (!glob)
3596  break;
3597 
3598  /*
3599  * Advance search position. Normally we start the next search at the
3600  * end of the previous match; but if the match was of zero length, we
3601  * have to advance by one character, or we'd just find the same match
3602  * again.
3603  */
3604  search_start = data_pos;
3605  if (pmatch[0].rm_so == pmatch[0].rm_eo)
3606  search_start++;
3607  }
3608 
3609  /*
3610  * Copy the text to the right of the last match.
3611  */
3612  if (data_pos < data_len)
3613  {
3614  int chunk_len;
3615 
3616  chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
3617  appendBinaryStringInfo(&buf, start_ptr, chunk_len);
3618  }
3619 
3620  ret_text = cstring_to_text_with_len(buf.data, buf.len);
3621  pfree(buf.data);
3622  pfree(data);
3623 
3624  return ret_text;
3625 }
3626 
3627 /*
3628  * split_text
3629  * parse input string
3630  * return ord item (1 based)
3631  * based on provided field separator
3632  */
3633 Datum
3635 {
3636  text *inputstring = PG_GETARG_TEXT_PP(0);
3637  text *fldsep = PG_GETARG_TEXT_PP(1);
3638  int fldnum = PG_GETARG_INT32(2);
3639  int inputstring_len;
3640  int fldsep_len;
3642  int start_posn;
3643  int end_posn;
3644  text *result_text;
3645 
3646  /* field number is 1 based */
3647  if (fldnum < 1)
3648  ereport(ERROR,
3649  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3650  errmsg("field position must be greater than zero")));
3651 
3652  text_position_setup(inputstring, fldsep, &state);
3653 
3654  /*
3655  * Note: we check the converted string length, not the original, because
3656  * they could be different if the input contained invalid encoding.
3657  */
3658  inputstring_len = state.len1;
3659  fldsep_len = state.len2;
3660 
3661  /* return empty string for empty input string */
3662  if (inputstring_len < 1)
3663  {
3664  text_position_cleanup(&state);
3666  }
3667 
3668  /* empty field separator */
3669  if (fldsep_len < 1)
3670  {
3671  text_position_cleanup(&state);
3672  /* if first field, return input string, else empty string */
3673  if (fldnum == 1)
3674  PG_RETURN_TEXT_P(inputstring);
3675  else
3677  }
3678 
3679  /* identify bounds of first field */
3680  start_posn = 1;
3681  end_posn = text_position_next(1, &state);
3682 
3683  /* special case if fldsep not found at all */
3684  if (end_posn == 0)
3685  {
3686  text_position_cleanup(&state);
3687  /* if field 1 requested, return input string, else empty string */
3688  if (fldnum == 1)
3689  PG_RETURN_TEXT_P(inputstring);
3690  else
3692  }
3693 
3694  while (end_posn > 0 && --fldnum > 0)
3695  {
3696  /* identify bounds of next field */
3697  start_posn = end_posn + fldsep_len;
3698  end_posn = text_position_next(start_posn, &state);
3699  }
3700 
3701  text_position_cleanup(&state);
3702 
3703  if (fldnum > 0)
3704  {
3705  /* N'th field separator not found */
3706  /* if last field requested, return it, else empty string */
3707  if (fldnum == 1)
3708  result_text = text_substring(PointerGetDatum(inputstring),
3709  start_posn,
3710  -1,
3711  true);
3712  else
3713  result_text = cstring_to_text("");
3714  }
3715  else
3716  {
3717  /* non-last field requested */
3718  result_text = text_substring(PointerGetDatum(inputstring),
3719  start_posn,
3720  end_posn - start_posn,
3721  false);
3722  }
3723 
3724  PG_RETURN_TEXT_P(result_text);
3725 }
3726 
3727 /*
3728  * Convenience function to return true when two text params are equal.
3729  */
3730 static bool
3731 text_isequal(text *txt1, text *txt2)
3732 {
3734  PointerGetDatum(txt1),
3735  PointerGetDatum(txt2)));
3736 }
3737 
3738 /*
3739  * text_to_array
3740  * parse input string and return text array of elements,
3741  * based on provided field separator
3742  */
3743 Datum
3745 {
3746  return text_to_array_internal(fcinfo);
3747 }
3748 
3749 /*
3750  * text_to_array_null
3751  * parse input string and return text array of elements,
3752  * based on provided field separator and null string
3753  *
3754  * This is a separate entry point only to prevent the regression tests from
3755  * complaining about different argument sets for the same internal function.
3756  */
3757 Datum
3759 {
3760  return text_to_array_internal(fcinfo);
3761 }
3762 
3763 /*
3764  * common code for text_to_array and text_to_array_null functions
3765  *
3766  * These are not strict so we have to test for null inputs explicitly.
3767  */
3768 static Datum
3770 {
3771  text *inputstring;
3772  text *fldsep;
3773  text *null_string;
3774  int inputstring_len;
3775  int fldsep_len;
3776  char *start_ptr;
3777  text *result_text;
3778  bool is_null;
3779  ArrayBuildState *astate = NULL;
3780 
3781  /* when input string is NULL, then result is NULL too */
3782  if (PG_ARGISNULL(0))
3783  PG_RETURN_NULL();
3784 
3785  inputstring = PG_GETARG_TEXT_PP(0);
3786 
3787  /* fldsep can be NULL */
3788  if (!PG_ARGISNULL(1))
3789  fldsep = PG_GETARG_TEXT_PP(1);
3790  else
3791  fldsep = NULL;
3792 
3793  /* null_string can be NULL or omitted */
3794  if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
3795  null_string = PG_GETARG_TEXT_PP(2);
3796  else
3797  null_string = NULL;
3798 
3799  if (fldsep != NULL)
3800  {
3801  /*
3802  * Normal case with non-null fldsep. Use the text_position machinery
3803  * to search for occurrences of fldsep.
3804  */
3806  int fldnum;
3807  int start_posn;
3808  int end_posn;
3809  int chunk_len;
3810 
3811  text_position_setup(inputstring, fldsep, &state);
3812 
3813  /*
3814  * Note: we check the converted string length, not the original,
3815  * because they could be different if the input contained invalid
3816  * encoding.
3817  */
3818  inputstring_len = state.len1;
3819  fldsep_len = state.len2;
3820 
3821  /* return empty array for empty input string */
3822  if (inputstring_len < 1)
3823  {
3824  text_position_cleanup(&state);
3826  }
3827 
3828  /*
3829  * empty field separator: return the input string as a one-element
3830  * array
3831  */
3832  if (fldsep_len < 1)
3833  {
3834  text_position_cleanup(&state);
3835  /* single element can be a NULL too */
3836  is_null = null_string ? text_isequal(inputstring, null_string) : false;
3838  PointerGetDatum(inputstring),
3839  is_null, 1));
3840  }
3841 
3842  start_posn = 1;
3843  /* start_ptr points to the start_posn'th character of inputstring */
3844  start_ptr = VARDATA_ANY(inputstring);
3845 
3846  for (fldnum = 1;; fldnum++) /* field number is 1 based */
3847  {
3849 
3850  end_posn = text_position_next(start_posn, &state);
3851 
3852  if (end_posn == 0)
3853  {
3854  /* fetch last field */
3855  chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
3856  }
3857  else
3858  {
3859  /* fetch non-last field */
3860  chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
3861  }
3862 
3863  /* must build a temp text datum to pass to accumArrayResult */
3864  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
3865  is_null = null_string ? text_isequal(result_text, null_string) : false;
3866 
3867  /* stash away this field */
3868  astate = accumArrayResult(astate,
3869  PointerGetDatum(result_text),
3870  is_null,
3871  TEXTOID,
3873 
3874  pfree(result_text);
3875 
3876  if (end_posn == 0)
3877  break;
3878 
3879  start_posn = end_posn;
3880  start_ptr += chunk_len;
3881  start_posn += fldsep_len;
3882  start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
3883  }
3884 
3885  text_position_cleanup(&state);
3886  }
3887  else
3888  {
3889  /*
3890  * When fldsep is NULL, each character in the inputstring becomes an
3891  * element in the result array. The separator is effectively the
3892  * space between characters.
3893  */
3894  inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
3895 
3896  /* return empty array for empty input string */
3897  if (inputstring_len < 1)
3899 
3900  start_ptr = VARDATA_ANY(inputstring);
3901 
3902  while (inputstring_len > 0)
3903  {
3904  int chunk_len = pg_mblen(start_ptr);
3905 
3907 
3908  /* must build a temp text datum to pass to accumArrayResult */
3909  result_text = cstring_to_text_with_len(start_ptr, chunk_len);
3910  is_null = null_string ? text_isequal(result_text, null_string) : false;
3911 
3912  /* stash away this field */
3913  astate = accumArrayResult(astate,
3914  PointerGetDatum(result_text),
3915  is_null,
3916  TEXTOID,
3918 
3919  pfree(result_text);
3920 
3921  start_ptr += chunk_len;
3922  inputstring_len -= chunk_len;
3923  }
3924  }
3925 
3928 }
3929 
3930 /*
3931  * array_to_text
3932  * concatenate Cstring representation of input array elements
3933  * using provided field separator
3934  */
3935 Datum
3937 {
3939  char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
3940 
3941  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
3942 }
3943 
3944 /*
3945  * array_to_text_null
3946  * concatenate Cstring representation of input array elements
3947  * using provided field separator and null string
3948  *
3949  * This version is not strict so we have to test for null inputs explicitly.
3950  */
3951 Datum
3953 {
3954  ArrayType *v;
3955  char *fldsep;
3956  char *null_string;
3957 
3958  /* returns NULL when first or second parameter is NULL */
3959  if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
3960  PG_RETURN_NULL();
3961 
3962  v = PG_GETARG_ARRAYTYPE_P(0);
3963  fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
3964 
3965  /* NULL null string is passed through as a null pointer */
3966  if (!PG_ARGISNULL(2))
3967  null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
3968  else
3969  null_string = NULL;
3970 
3971  PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
3972 }
3973 
3974 /*
3975  * common code for array_to_text and array_to_text_null functions
3976  */
3977 static text *
3979  const char *fldsep, const char *null_string)
3980 {
3981  text *result;
3982  int nitems,
3983  *dims,
3984  ndims;
3985  Oid element_type;
3986  int typlen;
3987  bool typbyval;
3988  char typalign;
3990  bool printed = false;
3991  char *p;
3992  bits8 *bitmap;
3993  int bitmask;
3994  int i;
3995  ArrayMetaState *my_extra;
3996 
3997  ndims = ARR_NDIM(v);
3998  dims = ARR_DIMS(v);
3999  nitems = ArrayGetNItems(ndims, dims);
4000 
4001  /* if there are no elements, return an empty string */
4002  if (nitems == 0)
4003  return cstring_to_text_with_len("", 0);
4004 
4005  element_type = ARR_ELEMTYPE(v);
4006  initStringInfo(&buf);
4007 
4008  /*
4009  * We arrange to look up info about element type, including its output
4010  * conversion proc, only once per series of calls, assuming the element
4011  * type doesn't change underneath us.
4012  */
4013  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4014  if (my_extra == NULL)
4015  {
4016  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4017  sizeof(ArrayMetaState));
4018  my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4019  my_extra->element_type = ~element_type;
4020  }
4021 
4022  if (my_extra->element_type != element_type)
4023  {
4024  /*
4025  * Get info about element type, including its output conversion proc
4026  */
4027  get_type_io_data(element_type, IOFunc_output,
4028  &my_extra->typlen, &my_extra->typbyval,
4029  &my_extra->typalign, &my_extra->typdelim,
4030  &my_extra->typioparam, &my_extra->typiofunc);
4031  fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4032  fcinfo->flinfo->fn_mcxt);
4033  my_extra->element_type = element_type;
4034  }
4035  typlen = my_extra->typlen;
4036  typbyval = my_extra->typbyval;
4037  typalign = my_extra->typalign;
4038 
4039  p = ARR_DATA_PTR(v);
4040  bitmap = ARR_NULLBITMAP(v);
4041  bitmask = 1;
4042 
4043  for (i = 0; i < nitems; i++)
4044  {
4045  Datum itemvalue;
4046  char *value;
4047 
4048  /* Get source element, checking for NULL */
4049  if (bitmap && (*bitmap & bitmask) == 0)
4050  {
4051  /* if null_string is NULL, we just ignore null elements */
4052  if (null_string != NULL)
4053  {
4054  if (printed)
4055  appendStringInfo(&buf, "%s%s", fldsep, null_string);
4056  else
4057  appendStringInfoString(&buf, null_string);
4058  printed = true;
4059  }
4060  }
4061  else
4062  {
4063  itemvalue = fetch_att(p, typbyval, typlen);
4064 
4065  value = OutputFunctionCall(&my_extra->proc, itemvalue);
4066 
4067  if (printed)
4068  appendStringInfo(&buf, "%s%s", fldsep, value);
4069  else
4070  appendStringInfoString(&buf, value);
4071  printed = true;
4072 
4073  p = att_addlength_pointer(p, typlen, p);
4074  p = (char *) att_align_nominal(p, typalign);
4075  }
4076 
4077  /* advance bitmap pointer if any */
4078  if (bitmap)
4079  {
4080  bitmask <<= 1;
4081  if (bitmask == 0x100)
4082  {
4083  bitmap++;
4084  bitmask = 1;
4085  }
4086  }
4087  }
4088 
4089  result = cstring_to_text_with_len(buf.data, buf.len);
4090  pfree(buf.data);
4091 
4092  return result;
4093 }
4094 
4095 #define HEXBASE 16
4096 /*
4097  * Convert a int32 to a string containing a base 16 (hex) representation of
4098  * the number.
4099  */
4100 Datum
4102 {
4104  char *ptr;
4105  const char *digits = "0123456789abcdef";
4106  char buf[32]; /* bigger than needed, but reasonable */
4107 
4108  ptr = buf + sizeof(buf) - 1;
4109  *ptr = '\0';
4110 
4111  do
4112  {
4113  *--ptr = digits[value % HEXBASE];
4114  value /= HEXBASE;
4115  } while (ptr > buf && value);
4116 
4118 }
4119 
4120 /*
4121  * Convert a int64 to a string containing a base 16 (hex) representation of
4122  * the number.
4123  */
4124 Datum
4126 {
4127  uint64 value = (uint64) PG_GETARG_INT64(0);
4128  char *ptr;
4129  const char *digits = "0123456789abcdef";
4130  char buf[32]; /* bigger than needed, but reasonable */
4131 
4132  ptr = buf + sizeof(buf) - 1;
4133  *ptr = '\0';
4134 
4135  do
4136  {
4137  *--ptr = digits[value % HEXBASE];
4138  value /= HEXBASE;
4139  } while (ptr > buf && value);
4140 
4142 }
4143 
4144 /*
4145  * Create an md5 hash of a text string and return it as hex
4146  *
4147  * md5 produces a 16 byte (128 bit) hash; double it for hex
4148  */
4149 #define MD5_HASH_LEN 32
4150 
4151 Datum
4153 {
4154  text *in_text = PG_GETARG_TEXT_PP(0);
4155  size_t len;
4156  char hexsum[MD5_HASH_LEN + 1];
4157 
4158  /* Calculate the length of the buffer using varlena metadata */
4159  len = VARSIZE_ANY_EXHDR(in_text);
4160 
4161  /* get the hash result */
4162  if (pg_md5_hash(VARDATA_ANY(in_text), len, hexsum) == false)
4163  ereport(ERROR,
4164  (errcode(ERRCODE_OUT_OF_MEMORY),
4165  errmsg("out of memory")));
4166 
4167  /* convert to text and return it */
4169 }
4170 
4171 /*
4172  * Create an md5 hash of a bytea field and return it as a hex string:
4173  * 16-byte md5 digest is represented in 32 hex characters.
4174  */
4175 Datum
4177 {
4178  bytea *in = PG_GETARG_BYTEA_PP(0);
4179  size_t len;
4180  char hexsum[MD5_HASH_LEN + 1];
4181 
4182  len = VARSIZE_ANY_EXHDR(in);
4183  if (pg_md5_hash(VARDATA_ANY(in), len, hexsum) == false)
4184  ereport(ERROR,
4185  (errcode(ERRCODE_OUT_OF_MEMORY),
4186  errmsg("out of memory")));
4187 
4189 }
4190 
4191 /*
4192  * Return the size of a datum, possibly compressed
4193  *
4194  * Works on any data type
4195  */
4196 Datum
4198 {
4200  int32 result;
4201  int typlen;
4202 
4203  /* On first call, get the input type's typlen, and save at *fn_extra */
4204  if (fcinfo->flinfo->fn_extra == NULL)
4205  {
4206  /* Lookup the datatype of the supplied argument */
4207  Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
4208 
4209  typlen = get_typlen(argtypeid);
4210  if (typlen == 0) /* should not happen */
4211  elog(ERROR, "cache lookup failed for type %u", argtypeid);
4212 
4213  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4214  sizeof(int));
4215  *((int *) fcinfo->flinfo->fn_extra) = typlen;
4216  }
4217  else
4218  typlen = *((int *) fcinfo->flinfo->fn_extra);
4219 
4220  if (typlen == -1)
4221  {
4222  /* varlena type, possibly toasted */
4223  result = toast_datum_size(value);
4224  }
4225  else if (typlen == -2)
4226  {
4227  /* cstring */
4228  result = strlen(DatumGetCString(value)) + 1;
4229  }
4230  else
4231  {
4232  /* ordinary fixed-width type */
4233  result = typlen;
4234  }
4235 
4236  PG_RETURN_INT32(result);
4237 }
4238 
4239 /*
4240  * string_agg - Concatenates values and returns string.
4241  *
4242  * Syntax: string_agg(value text, delimiter text) RETURNS text
4243  *
4244  * Note: Any NULL values are ignored. The first-call delimiter isn't
4245  * actually used at all, and on subsequent calls the delimiter precedes
4246  * the associated value.
4247  */
4248 
4249 /* subroutine to initialize state */
4250 static StringInfo
4252 {
4253  StringInfo state;
4254  MemoryContext aggcontext;
4255  MemoryContext oldcontext;
4256 
4257  if (!AggCheckCallContext(fcinfo, &aggcontext))
4258  {
4259  /* cannot be called directly because of internal-type argument */
4260  elog(ERROR, "string_agg_transfn called in non-aggregate context");
4261  }
4262 
4263  /*
4264  * Create state in aggregate context. It'll stay there across subsequent
4265  * calls.
4266  */
4267  oldcontext = MemoryContextSwitchTo(aggcontext);
4268  state = makeStringInfo();
4269  MemoryContextSwitchTo(oldcontext);
4270 
4271  return state;
4272 }
4273 
4274 Datum
4276 {
4277  StringInfo state;
4278 
4279  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4280 
4281  /* Append the value unless null. */
4282  if (!PG_ARGISNULL(1))
4283  {
4284  /* On the first time through, we ignore the delimiter. */
4285  if (state == NULL)
4286  state = makeStringAggState(fcinfo);
4287  else if (!PG_ARGISNULL(2))
4288  appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
4289 
4290  appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
4291  }
4292 
4293  /*
4294  * The transition type for string_agg() is declared to be "internal",
4295  * which is a pass-by-value type the same size as a pointer.
4296  */
4297  PG_RETURN_POINTER(state);
4298 }
4299 
4300 Datum
4302 {
4303  StringInfo state;
4304 
4305  /* cannot be called directly because of internal-type argument */
4306  Assert(AggCheckCallContext(fcinfo, NULL));
4307 
4308  state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4309 
4310  if (state != NULL)
4312  else
4313  PG_RETURN_NULL();
4314 }
4315 
4316 /*
4317  * Implementation of both concat() and concat_ws().
4318  *
4319  * sepstr is the separator string to place between values.
4320  * argidx identifies the first argument to concatenate (counting from zero).
4321  * Returns NULL if result should be NULL, else text value.
4322  */
4323 static text *
4324 concat_internal(const char *sepstr, int argidx,
4325  FunctionCallInfo fcinfo)
4326 {
4327  text *result;
4328  StringInfoData str;
4329  bool first_arg = true;
4330  int i;
4331 
4332  /*
4333  * concat(VARIADIC some-array) is essentially equivalent to
4334  * array_to_text(), ie concat the array elements with the given separator.
4335  * So we just pass the case off to that code.
4336  */
4337  if (get_fn_expr_variadic(fcinfo->flinfo))
4338  {
4339  ArrayType *arr;
4340 
4341  /* Should have just the one argument */
4342  Assert(argidx == PG_NARGS() - 1);
4343 
4344  /* concat(VARIADIC NULL) is defined as NULL */
4345  if (PG_ARGISNULL(argidx))
4346  return NULL;
4347 
4348  /*
4349  * Non-null argument had better be an array. We assume that any call
4350  * context that could let get_fn_expr_variadic return true will have
4351  * checked that a VARIADIC-labeled parameter actually is an array. So
4352  * it should be okay to just Assert that it's an array rather than
4353  * doing a full-fledged error check.
4354  */
4356 
4357  /* OK, safe to fetch the array value */
4358  arr = PG_GETARG_ARRAYTYPE_P(argidx);
4359 
4360  /*
4361  * And serialize the array. We tell array_to_text to ignore null
4362  * elements, which matches the behavior of the loop below.
4363  */
4364  return array_to_text_internal(fcinfo, arr, sepstr, NULL);
4365  }
4366 
4367  /* Normal case without explicit VARIADIC marker */
4368  initStringInfo(&str);
4369 
4370  for (i = argidx; i < PG_NARGS(); i++)
4371  {
4372  if (!PG_ARGISNULL(i))
4373  {
4375  Oid valtype;
4376  Oid typOutput;
4377  bool typIsVarlena;
4378 
4379  /* add separator if appropriate */
4380  if (first_arg)
4381  first_arg = false;
4382  else
4383  appendStringInfoString(&str, sepstr);
4384 
4385  /* call the appropriate type output function, append the result */
4386  valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
4387  if (!OidIsValid(valtype))
4388  elog(ERROR, "could not determine data type of concat() input");
4389  getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
4391  OidOutputFunctionCall(typOutput, value));
4392  }
4393  }
4394 
4395  result = cstring_to_text_with_len(str.data, str.len);
4396  pfree(str.data);
4397 
4398  return result;
4399 }
4400 
4401 /*
4402  * Concatenate all arguments. NULL arguments are ignored.
4403  */
4404 Datum
4406 {
4407  text *result;
4408 
4409  result = concat_internal("", 0, fcinfo);
4410  if (result == NULL)
4411  PG_RETURN_NULL();
4412  PG_RETURN_TEXT_P(result);
4413 }
4414 
4415 /*
4416  * Concatenate all but first argument value with separators. The first
4417  * parameter is used as the separator. NULL arguments are ignored.
4418  */
4419 Datum
4421 {
4422  char *sep;
4423  text *result;
4424 
4425  /* return NULL when separator is NULL */
4426  if (PG_ARGISNULL(0))
4427  PG_RETURN_NULL();
4429 
4430  result = concat_internal(sep, 1, fcinfo);
4431  if (result == NULL)
4432  PG_RETURN_NULL();
4433  PG_RETURN_TEXT_P(result);
4434 }
4435 
4436 /*
4437  * Return first n characters in the string. When n is negative,
4438  * return all but last |n| characters.
4439  */
4440 Datum
4442 {
4443  text *str = PG_GETARG_TEXT_PP(0);
4444  const char *p = VARDATA_ANY(str);
4445  int len = VARSIZE_ANY_EXHDR(str);
4446  int n = PG_GETARG_INT32(1);
4447  int rlen;
4448 
4449  if (n < 0)
4450  n = pg_mbstrlen_with_len(p, len) + n;
4451  rlen = pg_mbcharcliplen(p, len, n);
4452 
4454 }
4455 
4456 /*
4457  * Return last n characters in the string. When n is negative,
4458  * return all but first |n| characters.
4459  */
4460 Datum
4462 {
4463  text *str = PG_GETARG_TEXT_PP(0);
4464  const char *p = VARDATA_ANY(str);
4465  int len = VARSIZE_ANY_EXHDR(str);
4466  int n = PG_GETARG_INT32(1);
4467  int off;
4468 
4469  if (n < 0)
4470  n = -n;
4471  else
4472  n = pg_mbstrlen_with_len(p, len) - n;
4473  off = pg_mbcharcliplen(p, len, n);
4474 
4475  PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
4476 }
4477 
4478 /*
4479  * Return reversed string
4480  */
4481 Datum
4483 {
4484  text *str = PG_GETARG_TEXT_PP(0);
4485  const char *p = VARDATA_ANY(str);
4486  int len = VARSIZE_ANY_EXHDR(str);
4487  const char *endp = p + len;
4488  text *result;
4489  char *dst;
4490 
4491  result = palloc(len + VARHDRSZ);
4492  dst = (char *) VARDATA(result) + len;
4493  SET_VARSIZE(result, len + VARHDRSZ);
4494 
4496  {
4497  /* multibyte version */
4498  while (p < endp)
4499  {
4500  int sz;
4501 
4502  sz = pg_mblen(p);
4503  dst -= sz;
4504  memcpy(dst, p, sz);
4505  p += sz;
4506  }
4507  }
4508  else
4509  {
4510  /* single byte version */
4511  while (p < endp)
4512  *(--dst) = *p++;
4513  }
4514 
4515  PG_RETURN_TEXT_P(result);
4516 }
4517 
4518 
/*
 * Support macros for text_format()
 */
#define TEXT_FORMAT_FLAG_MINUS  0x0001  /* is minus flag present? */

/*
 * Step ptr to the next character of the format string, raising
 * "unterminated format specifier" if that reaches end_ptr.  Note that ptr
 * is modified in place and is evaluated only once.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
    do { \
        if (++(ptr) >= (end_ptr)) \
            ereport(ERROR, \
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
                     errmsg("unterminated format specifier"))); \
    } while (0)
4531 
4532 /*
4533  * Returns a formatted string
4534  */
4535 Datum
4537 {
4538  text *fmt;
4539  StringInfoData str;
4540  const char *cp;
4541  const char *start_ptr;
4542  const char *end_ptr;
4543  text *result;
4544  int arg;
4545  bool funcvariadic;
4546  int nargs;
4547  Datum *elements = NULL;
4548  bool *nulls = NULL;
4549  Oid element_type = InvalidOid;
4550  Oid prev_type = InvalidOid;
4551  Oid prev_width_type = InvalidOid;
4552  FmgrInfo typoutputfinfo;
4553  FmgrInfo typoutputinfo_width;
4554 
4555  /* When format string is null, immediately return null */
4556  if (PG_ARGISNULL(0))
4557  PG_RETURN_NULL();
4558 
4559  /* If argument is marked VARIADIC, expand array into elements */
4560  if (get_fn_expr_variadic(fcinfo->flinfo))
4561  {
4562  ArrayType *arr;
4563  int16 elmlen;
4564  bool elmbyval;
4565  char elmalign;
4566  int nitems;
4567 
4568  /* Should have just the one argument */
4569  Assert(PG_NARGS() == 2);
4570 
4571  /* If argument is NULL, we treat it as zero-length array */
4572  if (PG_ARGISNULL(1))
4573  nitems = 0;
4574  else
4575  {
4576  /*
4577  * Non-null argument had better be an array. We assume that any
4578  * call context that could let get_fn_expr_variadic return true
4579  * will have checked that a VARIADIC-labeled parameter actually is
4580  * an array. So it should be okay to just Assert that it's an
4581  * array rather than doing a full-fledged error check.
4582  */
4584 
4585  /* OK, safe to fetch the array value */
4586  arr = PG_GETARG_ARRAYTYPE_P(1);
4587 
4588  /* Get info about array element type */
4589  element_type = ARR_ELEMTYPE(arr);
4590  get_typlenbyvalalign(element_type,
4591  &elmlen, &elmbyval, &elmalign);
4592 
4593  /* Extract all array elements */
4594  deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
4595  &elements, &nulls, &nitems);
4596  }
4597 
4598  nargs = nitems + 1;
4599  funcvariadic = true;
4600  }
4601  else
4602  {
4603  /* Non-variadic case, we'll process the arguments individually */
4604  nargs = PG_NARGS();
4605  funcvariadic = false;
4606  }
4607 
4608  /* Setup for main loop. */
4609  fmt = PG_GETARG_TEXT_PP(0);
4610  start_ptr = VARDATA_ANY(fmt);
4611  end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
4612  initStringInfo(&str);
4613  arg = 1; /* next argument position to print */
4614 
4615  /* Scan format string, looking for conversion specifiers. */
4616  for (cp = start_ptr; cp < end_ptr; cp++)
4617  {
4618  int argpos;
4619  int widthpos;
4620  int flags;
4621  int width;
4622  Datum value;
4623  bool isNull;
4624  Oid typid;
4625 
4626  /*
4627  * If it's not the start of a conversion specifier, just copy it to
4628  * the output buffer.
4629  */
4630  if (*cp != '%')
4631  {
4632  appendStringInfoCharMacro(&str, *cp);
4633  continue;
4634  }
4635 
4636  ADVANCE_PARSE_POINTER(cp, end_ptr);
4637 
4638  /* Easy case: %% outputs a single % */
4639  if (*cp == '%')
4640  {
4641  appendStringInfoCharMacro(&str, *cp);
4642  continue;
4643  }
4644 
4645  /* Parse the optional portions of the format specifier */
4646  cp = text_format_parse_format(cp, end_ptr,
4647  &argpos, &widthpos,
4648  &flags, &width);
4649 
4650  /*
4651  * Next we should see the main conversion specifier. Whether or not
4652  * an argument position was present, it's known that at least one
4653  * character remains in the string at this point. Experience suggests
4654  * that it's worth checking that that character is one of the expected
4655  * ones before we try to fetch arguments, so as to produce the least
4656  * confusing response to a mis-formatted specifier.
4657  */
4658  if (strchr("sIL", *cp) == NULL)
4659  ereport(ERROR,
4660  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4661  errmsg("unrecognized conversion type specifier \"%c\"",
4662  *cp)));
4663 
4664  /* If indirect width was specified, get its value */
4665  if (widthpos >= 0)
4666  {
4667  /* Collect the specified or next argument position */
4668  if (widthpos > 0)
4669  arg = widthpos;
4670  if (arg >= nargs)
4671  ereport(ERROR,
4672  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4673  errmsg("too few arguments for format")));
4674 
4675  /* Get the value and type of the selected argument */
4676  if (!funcvariadic)
4677  {
4678  value = PG_GETARG_DATUM(arg);
4679  isNull = PG_ARGISNULL(arg);
4680  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
4681  }
4682  else
4683  {
4684  value = elements[arg - 1];
4685  isNull = nulls[arg - 1];
4686  typid = element_type;
4687  }
4688  if (!OidIsValid(typid))
4689  elog(ERROR, "could not determine data type of format() input");
4690 
4691  arg++;
4692 
4693  /* We can treat NULL width the same as zero */
4694  if (isNull)
4695  width = 0;
4696  else if (typid == INT4OID)
4697  width = DatumGetInt32(value);
4698  else if (typid == INT2OID)
4699  width = DatumGetInt16(value);
4700  else
4701  {
4702  /* For less-usual datatypes, convert to text then to int */
4703  char *str;
4704 
4705  if (typid != prev_width_type)
4706  {
4707  Oid typoutputfunc;
4708  bool typIsVarlena;
4709 
4710  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
4711  fmgr_info(typoutputfunc, &typoutputinfo_width);
4712  prev_width_type = typid;
4713  }
4714 
4715  str = OutputFunctionCall(&typoutputinfo_width, value);
4716 
4717  /* pg_atoi will complain about bad data or overflow */
4718  width = pg_atoi(str, sizeof(int), '\0');
4719 
4720  pfree(str);
4721  }
4722  }
4723 
4724  /* Collect the specified or next argument position */
4725  if (argpos > 0)
4726  arg = argpos;
4727  if (arg >= nargs)
4728  ereport(ERROR,
4729  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4730  errmsg("too few arguments for format")));
4731 
4732  /* Get the value and type of the selected argument */
4733  if (!funcvariadic)
4734  {
4735  value = PG_GETARG_DATUM(arg);
4736  isNull = PG_ARGISNULL(arg);
4737  typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
4738  }
4739  else
4740  {
4741  value = elements[arg - 1];
4742  isNull = nulls[arg - 1];
4743  typid = element_type;
4744  }
4745  if (!OidIsValid(typid))
4746  elog(ERROR, "could not determine data type of format() input");
4747 
4748  arg++;
4749 
4750  /*
4751  * Get the appropriate typOutput function, reusing previous one if
4752  * same type as previous argument. That's particularly useful in the
4753  * variadic-array case, but often saves work even for ordinary calls.
4754  */
4755  if (typid != prev_type)
4756  {
4757  Oid typoutputfunc;
4758  bool typIsVarlena;
4759 
4760  getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
4761  fmgr_info(typoutputfunc, &typoutputfinfo);
4762  prev_type = typid;
4763  }
4764 
4765  /*
4766  * And now we can format the value.
4767  */
4768  switch (*cp)
4769  {
4770  case 's':
4771  case 'I':
4772  case 'L':
4773  text_format_string_conversion(&str, *cp, &typoutputfinfo,
4774  value, isNull,
4775  flags, width);
4776  break;
4777  default:
4778  /* should not get here, because of previous check */
4779  ereport(ERROR,
4780  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4781  errmsg("unrecognized conversion type specifier \"%c\"",
4782  *cp)));
4783  break;
4784  }
4785  }
4786 
4787  /* Don't need deconstruct_array results anymore. */
4788  if (elements != NULL)
4789  pfree(elements);
4790  if (nulls != NULL)
4791  pfree(nulls);
4792 
4793  /* Generate results. */
4794  result = cstring_to_text_with_len(str.data, str.len);
4795  pfree(str.data);
4796 
4797  PG_RETURN_TEXT_P(result);
4798 }
4799 
4800 /*
4801  * Parse contiguous digits as a decimal number.
4802  *
4803  * Returns true if some digits could be parsed.
4804  * The value is returned into *value, and *ptr is advanced to the next
4805  * character to be parsed.
4806  *
4807  * Note parsing invariant: at least one character is known available before
4808  * string end (end_ptr) at entry, and this is still true at exit.
4809  */
4810 static bool
4811 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
4812 {
4813  bool found = false;
4814  const char *cp = *ptr;
4815  int val = 0;
4816 
4817  while (*cp >= '0' && *cp <= '9')
4818  {
4819  int newval = val * 10 + (*cp - '0');
4820 
4821  if (newval / 10 != val) /* overflow? */
4822  ereport(ERROR,
4823  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
4824  errmsg("number is out of range")));
4825  val = newval;
4826  ADVANCE_PARSE_POINTER(cp, end_ptr);
4827  found = true;
4828  }
4829 
4830  *ptr = cp;
4831  *value = val;
4832 
4833  return found;
4834 }
4835 
4836 /*
4837  * Parse a format specifier (generally following the SUS printf spec).
4838  *
4839  * We have already advanced over the initial '%', and we are looking for
4840  * [argpos][flags][width]type (but the type character is not consumed here).
4841  *
4842  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
4843  * Output parameters:
4844  * argpos: argument position for value to be printed. -1 means unspecified.
4845  * widthpos: argument position for width. Zero means the argument position
4846  * was unspecified (ie, take the next arg) and -1 means no width
4847  * argument (width was omitted or specified as a constant).
4848  * flags: bitmask of flags.
4849  * width: directly-specified width value. Zero means the width was omitted
4850  * (note it's not necessary to distinguish this case from an explicit
4851  * zero width value).
4852  *
4853  * The function result is the next character position to be parsed, ie, the
4854  * location where the type character is/should be.
4855  *
4856  * Note parsing invariant: at least one character is known available before
4857  * string end (end_ptr) at entry, and this is still true at exit.
4858  */
4859 static const char *
4860 text_format_parse_format(const char *start_ptr, const char *end_ptr,
4861  int *argpos, int *widthpos,
4862  int *flags, int *width)
4863 {
4864  const char *cp = start_ptr;
4865  int n;
4866 
4867  /* set defaults for output parameters */
4868  *argpos = -1;
4869  *widthpos = -1;
4870  *flags = 0;
4871  *width = 0;
4872 
4873  /* try to identify first number */
4874  if (text_format_parse_digits(&cp, end_ptr, &n))
4875  {
4876  if (*cp != '$')
4877  {
4878  /* Must be just a width and a type, so we're done */
4879  *width = n;
4880  return cp;
4881  }
4882  /* The number was argument position */
4883  *argpos = n;
4884  /* Explicit 0 for argument index is immediately refused */
4885  if (n == 0)
4886  ereport(ERROR,
4887  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4888  errmsg("format specifies argument 0, but arguments are numbered from 1")));
4889  ADVANCE_PARSE_POINTER(cp, end_ptr);
4890  }
4891 
4892  /* Handle flags (only minus is supported now) */
4893  while (*cp == '-')
4894  {
4895  *flags |= TEXT_FORMAT_FLAG_MINUS;
4896  ADVANCE_PARSE_POINTER(cp, end_ptr);
4897  }
4898 
4899  if (*cp == '*')
4900  {
4901  /* Handle indirect width */
4902  ADVANCE_PARSE_POINTER(cp, end_ptr);
4903  if (text_format_parse_digits(&cp, end_ptr, &n))
4904  {
4905  /* number in this position must be closed by $ */
4906  if (*cp != '$')
4907  ereport(ERROR,
4908  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4909  errmsg("width argument position must be ended by \"$\"")));
4910  /* The number was width argument position */
4911  *widthpos = n;
4912  /* Explicit 0 for argument index is immediately refused */
4913  if (n == 0)
4914  ereport(ERROR,
4915  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4916  errmsg("format specifies argument 0, but arguments are numbered from 1")));
4917  ADVANCE_PARSE_POINTER(cp, end_ptr);
4918  }
4919  else
4920  *widthpos = 0; /* width's argument position is unspecified */
4921  }
4922  else
4923  {
4924  /* Check for direct width specification */
4925  if (text_format_parse_digits(&cp, end_ptr, &n))
4926  *width = n;
4927  }
4928 
4929  /* cp should now be pointing at type character */
4930  return cp;
4931 }
4932 
4933 /*
4934  * Format a %s, %I, or %L conversion
4935  */
4936 static void
4938  FmgrInfo *typOutputInfo,
4939  Datum value, bool isNull,
4940  int flags, int width)
4941 {
4942  char *str;
4943 
4944  /* Handle NULL arguments before trying to stringify the value. */
4945  if (isNull)
4946  {
4947  if (conversion == 's')
4948  text_format_append_string(buf, "", flags, width);
4949  else if (conversion == 'L')
4950  text_format_append_string(buf, "NULL", flags, width);
4951  else if (conversion == 'I')
4952  ereport(ERROR,
4953  (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
4954  errmsg("null values cannot be formatted as an SQL identifier")));
4955  return;
4956  }
4957 
4958  /* Stringify. */
4959  str = OutputFunctionCall(typOutputInfo, value);
4960 
4961  /* Escape. */
4962  if (conversion == 'I')
4963  {
4964  /* quote_identifier may or may not allocate a new string. */
4965  text_format_append_string(buf, quote_identifier(str), flags, width);
4966  }
4967  else if (conversion == 'L')
4968  {
4969  char *qstr = quote_literal_cstr(str);
4970 
4971  text_format_append_string(buf, qstr, flags, width);
4972  /* quote_literal_cstr() always allocates a new string */
4973  pfree(qstr);
4974  }
4975  else
4976  text_format_append_string(buf, str, flags, width);
4977 
4978  /* Cleanup. */
4979  pfree(str);
4980 }
4981 
4982 /*
4983  * Append str to buf, padding as directed by flags/width
4984  */
4985 static void
4987  int flags, int width)
4988 {
4989  bool align_to_left = false;
4990  int len;
4991 
4992  /* fast path for typical easy case */
4993  if (width == 0)
4994  {
4995  appendStringInfoString(buf, str);
4996  return;
4997  }
4998 
4999  if (width < 0)
5000  {
5001  /* Negative width: implicit '-' flag, then take absolute value */
5002  align_to_left = true;
5003  /* -INT_MIN is undefined */
5004  if (width <= INT_MIN)
5005  ereport(ERROR,
5006  (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5007  errmsg("number is out of range")));
5008  width = -width;
5009  }
5010  else if (flags & TEXT_FORMAT_FLAG_MINUS)
5011  align_to_left = true;
5012 
5013  len = pg_mbstrlen(str);
5014  if (align_to_left)
5015  {
5016  /* left justify */
5017  appendStringInfoString(buf, str);
5018  if (len < width)
5019  appendStringInfoSpaces(buf, width - len);
5020  }
5021  else
5022  {
5023  /* right justify */
5024  if (len < width)
5025  appendStringInfoSpaces(buf, width - len);
5026  appendStringInfoString(buf, str);
5027  }
5028 }
5029 
5030 /*
5031  * text_format_nv - nonvariadic wrapper for text_format function.
5032  *
5033  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
5034  * which checks that all built-in functions that share the implementing C
5035  * function take the same number of arguments.
5036  */
5037 Datum
5039 {
5040  return text_format(fcinfo);
5041 }
5042 
/*
 * Report whether the first len bytes of s1 and s2 are identical, checking
 * from the last byte back toward the first.  A len of zero or less compares
 * nothing and yields true.
 *
 * Helper for the Levenshtein distance functions, for which this is faster
 * than memcmp().
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			remaining;

	for (remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
5058 
5059 /* Expand each Levenshtein distance variant */
5060 #include "levenshtein.c"
5061 #define LEVENSHTEIN_LESS_EQUAL
5062 #include "levenshtein.c"