PostgreSQL Source Code  git master
jsonapi.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * jsonapi.c
4  * JSON parser and lexer interfaces
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/common/jsonapi.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #ifndef FRONTEND
15 #include "postgres.h"
16 #else
17 #include "postgres_fe.h"
18 #endif
19 
20 #include "common/jsonapi.h"
21 #include "mb/pg_wchar.h"
22 #include "port/pg_lfind.h"
23 
24 #ifndef FRONTEND
25 #include "miscadmin.h"
26 #endif
27 
/*
 * The context of the parser is maintained by the recursive descent
 * mechanism, but is passed explicitly to the error reporting routine
 * for better diagnostics.
 *
 * Each value names what the parser had just seen, and therefore what it
 * was expecting next, at the point where an unexpected token turned up.
 */
typedef enum					/* contexts of JSON parser */
{
	JSON_PARSE_VALUE,			/* expecting a value */
	JSON_PARSE_STRING,			/* expecting a string (for a field name) */
	JSON_PARSE_ARRAY_START,		/* saw '[', expecting value or ']' */
	JSON_PARSE_ARRAY_NEXT,		/* saw array element, expecting ',' or ']' */
	JSON_PARSE_OBJECT_START,	/* saw '{', expecting label or '}' */
	JSON_PARSE_OBJECT_LABEL,	/* saw object label, expecting ':' */
	JSON_PARSE_OBJECT_NEXT,		/* saw object value, expecting ',' or '}' */
	JSON_PARSE_OBJECT_COMMA,	/* saw object ',', expecting next label */
	JSON_PARSE_END,				/* saw the end of a document, expect nothing */
} JsonParseContext;
45 
47 static inline JsonParseErrorType json_lex_number(JsonLexContext *lex, char *s,
48  bool *num_err, int *total_len);
55 
56 /* the null action object used for pure validation */
58 {
59  NULL, NULL, NULL, NULL, NULL,
60  NULL, NULL, NULL, NULL, NULL
61 };
62 
63 /* Recursive Descent parser support routines */
64 
65 /*
66  * lex_peek
67  *
68  * what is the current look_ahead token?
69 */
70 static inline JsonTokenType
72 {
73  return lex->token_type;
74 }
75 
76 /*
77  * lex_expect
78  *
79  * move the lexer to the next token if the current look_ahead token matches
80  * the parameter token. Otherwise, report an error.
81  */
82 static inline JsonParseErrorType
84 {
85  if (lex_peek(lex) == token)
86  return json_lex(lex);
87  else
88  return report_parse_error(ctx, lex);
89 }
90 
/*
 * JSON_ALPHANUMERIC_CHAR
 *
 * Nonzero if c should be treated as part of an alphanumeric token: an
 * ASCII letter, an ASCII digit, an underscore, or any character for which
 * IS_HIGHBIT_SET() is true.
 *
 * The argument is expanded more than once, so it must not have side
 * effects.
 */
#define JSON_ALPHANUMERIC_CHAR(c) \
	(('a' <= (c) && (c) <= 'z') || \
	 ('A' <= (c) && (c) <= 'Z') || \
	 ('0' <= (c) && (c) <= '9') || \
	 '_' == (c) || \
	 IS_HIGHBIT_SET(c))
98 
99 /*
100  * Utility function to check if a string is a valid JSON number.
101  *
102  * str is of length len, and need not be null-terminated.
103  */
104 bool
105 IsValidJsonNumber(const char *str, int len)
106 {
107  bool numeric_error;
108  int total_len;
109  JsonLexContext dummy_lex;
110 
111  if (len <= 0)
112  return false;
113 
114  /*
115  * json_lex_number expects a leading '-' to have been eaten already.
116  *
117  * having to cast away the constness of str is ugly, but there's not much
118  * easy alternative.
119  */
120  if (*str == '-')
121  {
122  dummy_lex.input = unconstify(char *, str) + 1;
123  dummy_lex.input_length = len - 1;
124  }
125  else
126  {
127  dummy_lex.input = unconstify(char *, str);
128  dummy_lex.input_length = len;
129  }
130 
131  json_lex_number(&dummy_lex, dummy_lex.input, &numeric_error, &total_len);
132 
133  return (!numeric_error) && (total_len == dummy_lex.input_length);
134 }
135 
136 /*
137  * makeJsonLexContextCstringLen
138  * Initialize the given JsonLexContext object, or create one
139  *
140  * If a valid 'lex' pointer is given, it is initialized. This can
141  * be used for stack-allocated structs, saving overhead. If NULL is
142  * given, a new struct is allocated.
143  *
144  * If need_escapes is true, ->strval stores the unescaped lexemes.
145  * Unescaping is expensive, so only request it when necessary.
146  *
147  * If need_escapes is true or lex was given as NULL, then caller is
148  * responsible for freeing the returned struct, either by calling
149  * freeJsonLexContext() or (in backend environment) via memory context
150  * cleanup.
151  */
154  int len, int encoding, bool need_escapes)
155 {
156  if (lex == NULL)
157  {
158  lex = palloc0(sizeof(JsonLexContext));
159  lex->flags |= JSONLEX_FREE_STRUCT;
160  }
161  else
162  memset(lex, 0, sizeof(JsonLexContext));
163 
164  lex->input = lex->token_terminator = lex->line_start = json;
165  lex->line_number = 1;
166  lex->input_length = len;
167  lex->input_encoding = encoding;
168  if (need_escapes)
169  {
170  lex->strval = makeStringInfo();
171  lex->flags |= JSONLEX_FREE_STRVAL;
172  }
173 
174  return lex;
175 }
176 
177 /*
178  * Free memory in a JsonLexContext. There's no need for this if a *lex
179  * pointer was given when the object was made and need_escapes was false,
180  * or (in backend environment) a memory context delete/reset is imminent.
181  */
182 void
184 {
185  if (lex->flags & JSONLEX_FREE_STRVAL)
186  {
187  pfree(lex->strval->data);
188  pfree(lex->strval);
189  }
190  if (lex->flags & JSONLEX_FREE_STRUCT)
191  pfree(lex);
192 }
193 
194 /*
195  * pg_parse_json
196  *
197  * Publicly visible entry point for the JSON parser.
198  *
199  * lex is a lexing context, set up for the json to be processed by calling
200  * makeJsonLexContext(). sem is a structure of function pointers to semantic
201  * action routines to be called at appropriate spots during parsing, and a
202  * pointer to a state object to be passed to those routines.
203  */
206 {
207  JsonTokenType tok;
208  JsonParseErrorType result;
209 
210  /* get the initial token */
211  result = json_lex(lex);
212  if (result != JSON_SUCCESS)
213  return result;
214 
215  tok = lex_peek(lex);
216 
217  /* parse by recursive descent */
218  switch (tok)
219  {
221  result = parse_object(lex, sem);
222  break;
224  result = parse_array(lex, sem);
225  break;
226  default:
227  result = parse_scalar(lex, sem); /* json can be a bare scalar */
228  }
229 
230  if (result == JSON_SUCCESS)
231  result = lex_expect(JSON_PARSE_END, lex, JSON_TOKEN_END);
232 
233  return result;
234 }
235 
236 /*
237  * json_count_array_elements
238  *
239  * Returns number of array elements in lex context at start of array token
240  * until end of array token at same nesting level.
241  *
242  * Designed to be called from array_start routines.
243  */
246 {
247  JsonLexContext copylex;
248  int count;
249  JsonParseErrorType result;
250 
251  /*
252  * It's safe to do this with a shallow copy because the lexical routines
253  * don't scribble on the input. They do scribble on the other pointers
254  * etc, so doing this with a copy makes that safe.
255  */
256  memcpy(&copylex, lex, sizeof(JsonLexContext));
257  copylex.strval = NULL; /* not interested in values here */
258  copylex.lex_level++;
259 
260  count = 0;
261  result = lex_expect(JSON_PARSE_ARRAY_START, &copylex,
263  if (result != JSON_SUCCESS)
264  return result;
265  if (lex_peek(&copylex) != JSON_TOKEN_ARRAY_END)
266  {
267  while (1)
268  {
269  count++;
270  result = parse_array_element(&copylex, &nullSemAction);
271  if (result != JSON_SUCCESS)
272  return result;
273  if (copylex.token_type != JSON_TOKEN_COMMA)
274  break;
275  result = json_lex(&copylex);
276  if (result != JSON_SUCCESS)
277  return result;
278  }
279  }
280  result = lex_expect(JSON_PARSE_ARRAY_NEXT, &copylex,
282  if (result != JSON_SUCCESS)
283  return result;
284 
285  *elements = count;
286  return JSON_SUCCESS;
287 }
288 
289 /*
290  * Recursive Descent parse routines. There is one for each structural
291  * element in a json document:
292  * - scalar (string, number, true, false, null)
293  * - array ( [ ] )
294  * - array element
295  * - object ( { } )
296  * - object field
297  */
298 static inline JsonParseErrorType
300 {
301  char *val = NULL;
302  json_scalar_action sfunc = sem->scalar;
303  JsonTokenType tok = lex_peek(lex);
304  JsonParseErrorType result;
305 
306  /* a scalar must be a string, a number, true, false, or null */
307  if (tok != JSON_TOKEN_STRING && tok != JSON_TOKEN_NUMBER &&
308  tok != JSON_TOKEN_TRUE && tok != JSON_TOKEN_FALSE &&
309  tok != JSON_TOKEN_NULL)
311 
312  /* if no semantic function, just consume the token */
313  if (sfunc == NULL)
314  return json_lex(lex);
315 
316  /* extract the de-escaped string value, or the raw lexeme */
317  if (lex_peek(lex) == JSON_TOKEN_STRING)
318  {
319  if (lex->strval != NULL)
320  val = pstrdup(lex->strval->data);
321  }
322  else
323  {
324  int len = (lex->token_terminator - lex->token_start);
325 
326  val = palloc(len + 1);
327  memcpy(val, lex->token_start, len);
328  val[len] = '\0';
329  }
330 
331  /* consume the token */
332  result = json_lex(lex);
333  if (result != JSON_SUCCESS)
334  return result;
335 
336  /* invoke the callback */
337  result = (*sfunc) (sem->semstate, val, tok);
338 
339  return result;
340 }
341 
342 static JsonParseErrorType
344 {
345  /*
346  * An object field is "fieldname" : value where value can be a scalar,
347  * object or array. Note: in user-facing docs and error messages, we
348  * generally call a field name a "key".
349  */
350 
351  char *fname = NULL; /* keep compiler quiet */
354  bool isnull;
355  JsonTokenType tok;
356  JsonParseErrorType result;
357 
358  if (lex_peek(lex) != JSON_TOKEN_STRING)
360  if ((ostart != NULL || oend != NULL) && lex->strval != NULL)
361  fname = pstrdup(lex->strval->data);
362  result = json_lex(lex);
363  if (result != JSON_SUCCESS)
364  return result;
365 
367  if (result != JSON_SUCCESS)
368  return result;
369 
370  tok = lex_peek(lex);
371  isnull = tok == JSON_TOKEN_NULL;
372 
373  if (ostart != NULL)
374  {
375  result = (*ostart) (sem->semstate, fname, isnull);
376  if (result != JSON_SUCCESS)
377  return result;
378  }
379 
380  switch (tok)
381  {
383  result = parse_object(lex, sem);
384  break;
386  result = parse_array(lex, sem);
387  break;
388  default:
389  result = parse_scalar(lex, sem);
390  }
391  if (result != JSON_SUCCESS)
392  return result;
393 
394  if (oend != NULL)
395  {
396  result = (*oend) (sem->semstate, fname, isnull);
397  if (result != JSON_SUCCESS)
398  return result;
399  }
400 
401  return JSON_SUCCESS;
402 }
403 
404 static JsonParseErrorType
406 {
407  /*
408  * an object is a possibly empty sequence of object fields, separated by
409  * commas and surrounded by curly braces.
410  */
411  json_struct_action ostart = sem->object_start;
412  json_struct_action oend = sem->object_end;
413  JsonTokenType tok;
414  JsonParseErrorType result;
415 
416 #ifndef FRONTEND
418 #endif
419 
420  if (ostart != NULL)
421  {
422  result = (*ostart) (sem->semstate);
423  if (result != JSON_SUCCESS)
424  return result;
425  }
426 
427  /*
428  * Data inside an object is at a higher nesting level than the object
429  * itself. Note that we increment this after we call the semantic routine
430  * for the object start and restore it before we call the routine for the
431  * object end.
432  */
433  lex->lex_level++;
434 
436  result = json_lex(lex);
437  if (result != JSON_SUCCESS)
438  return result;
439 
440  tok = lex_peek(lex);
441  switch (tok)
442  {
443  case JSON_TOKEN_STRING:
444  result = parse_object_field(lex, sem);
445  while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA)
446  {
447  result = json_lex(lex);
448  if (result != JSON_SUCCESS)
449  break;
450  result = parse_object_field(lex, sem);
451  }
452  break;
454  break;
455  default:
456  /* case of an invalid initial token inside the object */
458  }
459  if (result != JSON_SUCCESS)
460  return result;
461 
463  if (result != JSON_SUCCESS)
464  return result;
465 
466  lex->lex_level--;
467 
468  if (oend != NULL)
469  {
470  result = (*oend) (sem->semstate);
471  if (result != JSON_SUCCESS)
472  return result;
473  }
474 
475  return JSON_SUCCESS;
476 }
477 
478 static JsonParseErrorType
480 {
483  JsonTokenType tok = lex_peek(lex);
484  JsonParseErrorType result;
485  bool isnull;
486 
487  isnull = tok == JSON_TOKEN_NULL;
488 
489  if (astart != NULL)
490  {
491  result = (*astart) (sem->semstate, isnull);
492  if (result != JSON_SUCCESS)
493  return result;
494  }
495 
496  /* an array element is any object, array or scalar */
497  switch (tok)
498  {
500  result = parse_object(lex, sem);
501  break;
503  result = parse_array(lex, sem);
504  break;
505  default:
506  result = parse_scalar(lex, sem);
507  }
508 
509  if (result != JSON_SUCCESS)
510  return result;
511 
512  if (aend != NULL)
513  {
514  result = (*aend) (sem->semstate, isnull);
515  if (result != JSON_SUCCESS)
516  return result;
517  }
518 
519  return JSON_SUCCESS;
520 }
521 
522 static JsonParseErrorType
524 {
525  /*
526  * an array is a possibly empty sequence of array elements, separated by
527  * commas and surrounded by square brackets.
528  */
529  json_struct_action astart = sem->array_start;
530  json_struct_action aend = sem->array_end;
531  JsonParseErrorType result;
532 
533 #ifndef FRONTEND
535 #endif
536 
537  if (astart != NULL)
538  {
539  result = (*astart) (sem->semstate);
540  if (result != JSON_SUCCESS)
541  return result;
542  }
543 
544  /*
545  * Data inside an array is at a higher nesting level than the array
546  * itself. Note that we increment this after we call the semantic routine
547  * for the array start and restore it before we call the routine for the
548  * array end.
549  */
550  lex->lex_level++;
551 
553  if (result == JSON_SUCCESS && lex_peek(lex) != JSON_TOKEN_ARRAY_END)
554  {
555  result = parse_array_element(lex, sem);
556 
557  while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA)
558  {
559  result = json_lex(lex);
560  if (result != JSON_SUCCESS)
561  break;
562  result = parse_array_element(lex, sem);
563  }
564  }
565  if (result != JSON_SUCCESS)
566  return result;
567 
569  if (result != JSON_SUCCESS)
570  return result;
571 
572  lex->lex_level--;
573 
574  if (aend != NULL)
575  {
576  result = (*aend) (sem->semstate);
577  if (result != JSON_SUCCESS)
578  return result;
579  }
580 
581  return JSON_SUCCESS;
582 }
583 
584 /*
585  * Lex one token from the input stream.
586  */
589 {
590  char *s;
591  char *const end = lex->input + lex->input_length;
592  JsonParseErrorType result;
593 
594  /* Skip leading whitespace. */
595  s = lex->token_terminator;
596  while (s < end && (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
597  {
598  if (*s++ == '\n')
599  {
600  ++lex->line_number;
601  lex->line_start = s;
602  }
603  }
604  lex->token_start = s;
605 
606  /* Determine token type. */
607  if (s >= end)
608  {
609  lex->token_start = NULL;
611  lex->token_terminator = s;
612  lex->token_type = JSON_TOKEN_END;
613  }
614  else
615  {
616  switch (*s)
617  {
618  /* Single-character token, some kind of punctuation mark. */
619  case '{':
621  lex->token_terminator = s + 1;
623  break;
624  case '}':
626  lex->token_terminator = s + 1;
628  break;
629  case '[':
631  lex->token_terminator = s + 1;
633  break;
634  case ']':
636  lex->token_terminator = s + 1;
638  break;
639  case ',':
641  lex->token_terminator = s + 1;
643  break;
644  case ':':
646  lex->token_terminator = s + 1;
648  break;
649  case '"':
650  /* string */
651  result = json_lex_string(lex);
652  if (result != JSON_SUCCESS)
653  return result;
655  break;
656  case '-':
657  /* Negative number. */
658  result = json_lex_number(lex, s + 1, NULL, NULL);
659  if (result != JSON_SUCCESS)
660  return result;
662  break;
663  case '0':
664  case '1':
665  case '2':
666  case '3':
667  case '4':
668  case '5':
669  case '6':
670  case '7':
671  case '8':
672  case '9':
673  /* Positive number. */
674  result = json_lex_number(lex, s, NULL, NULL);
675  if (result != JSON_SUCCESS)
676  return result;
678  break;
679  default:
680  {
681  char *p;
682 
683  /*
684  * We're not dealing with a string, number, legal
685  * punctuation mark, or end of string. The only legal
686  * tokens we might find here are true, false, and null,
687  * but for error reporting purposes we scan until we see a
688  * non-alphanumeric character. That way, we can report
689  * the whole word as an unexpected token, rather than just
690  * some unintuitive prefix thereof.
691  */
692  for (p = s; p < end && JSON_ALPHANUMERIC_CHAR(*p); p++)
693  /* skip */ ;
694 
695  /*
696  * We got some sort of unexpected punctuation or an
697  * otherwise unexpected character, so just complain about
698  * that one character.
699  */
700  if (p == s)
701  {
703  lex->token_terminator = s + 1;
704  return JSON_INVALID_TOKEN;
705  }
706 
707  /*
708  * We've got a real alphanumeric token here. If it
709  * happens to be true, false, or null, all is well. If
710  * not, error out.
711  */
713  lex->token_terminator = p;
714  if (p - s == 4)
715  {
716  if (memcmp(s, "true", 4) == 0)
718  else if (memcmp(s, "null", 4) == 0)
720  else
721  return JSON_INVALID_TOKEN;
722  }
723  else if (p - s == 5 && memcmp(s, "false", 5) == 0)
725  else
726  return JSON_INVALID_TOKEN;
727  }
728  } /* end of switch */
729  }
730 
731  return JSON_SUCCESS;
732 }
733 
734 /*
735  * The next token in the input stream is known to be a string; lex it.
736  *
737  * If lex->strval isn't NULL, fill it with the decoded string.
738  * Set lex->token_terminator to the end of the decoded input, and in
739  * success cases, transfer its previous value to lex->prev_token_terminator.
740  * Return JSON_SUCCESS or an error code.
741  *
742  * Note: be careful that all error exits advance lex->token_terminator
743  * to the point after the character we detected the error on.
744  */
745 static inline JsonParseErrorType
747 {
748  char *s;
749  char *const end = lex->input + lex->input_length;
750  int hi_surrogate = -1;
751 
752  /* Convenience macros for error exits */
753 #define FAIL_AT_CHAR_START(code) \
754  do { \
755  lex->token_terminator = s; \
756  return code; \
757  } while (0)
758 #define FAIL_AT_CHAR_END(code) \
759  do { \
760  lex->token_terminator = \
761  s + pg_encoding_mblen_bounded(lex->input_encoding, s); \
762  return code; \
763  } while (0)
764 
765  if (lex->strval != NULL)
766  resetStringInfo(lex->strval);
767 
768  Assert(lex->input_length > 0);
769  s = lex->token_start;
770  for (;;)
771  {
772  s++;
773  /* Premature end of the string. */
774  if (s >= end)
776  else if (*s == '"')
777  break;
778  else if (*s == '\\')
779  {
780  /* OK, we have an escape character. */
781  s++;
782  if (s >= end)
784  else if (*s == 'u')
785  {
786  int i;
787  int ch = 0;
788 
789  for (i = 1; i <= 4; i++)
790  {
791  s++;
792  if (s >= end)
794  else if (*s >= '0' && *s <= '9')
795  ch = (ch * 16) + (*s - '0');
796  else if (*s >= 'a' && *s <= 'f')
797  ch = (ch * 16) + (*s - 'a') + 10;
798  else if (*s >= 'A' && *s <= 'F')
799  ch = (ch * 16) + (*s - 'A') + 10;
800  else
802  }
803  if (lex->strval != NULL)
804  {
805  /*
806  * Combine surrogate pairs.
807  */
808  if (is_utf16_surrogate_first(ch))
809  {
810  if (hi_surrogate != -1)
812  hi_surrogate = ch;
813  continue;
814  }
815  else if (is_utf16_surrogate_second(ch))
816  {
817  if (hi_surrogate == -1)
819  ch = surrogate_pair_to_codepoint(hi_surrogate, ch);
820  hi_surrogate = -1;
821  }
822 
823  if (hi_surrogate != -1)
825 
826  /*
827  * Reject invalid cases. We can't have a value above
828  * 0xFFFF here (since we only accepted 4 hex digits
829  * above), so no need to test for out-of-range chars.
830  */
831  if (ch == 0)
832  {
833  /* We can't allow this, since our TEXT type doesn't */
835  }
836 
837  /*
838  * Add the represented character to lex->strval. In the
839  * backend, we can let pg_unicode_to_server_noerror()
840  * handle any required character set conversion; in
841  * frontend, we can only deal with trivial conversions.
842  */
843 #ifndef FRONTEND
844  {
845  char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
846 
847  if (!pg_unicode_to_server_noerror(ch, (unsigned char *) cbuf))
849  appendStringInfoString(lex->strval, cbuf);
850  }
851 #else
852  if (lex->input_encoding == PG_UTF8)
853  {
854  /* OK, we can map the code point to UTF8 easily */
855  char utf8str[5];
856  int utf8len;
857 
858  unicode_to_utf8(ch, (unsigned char *) utf8str);
859  utf8len = pg_utf_mblen((unsigned char *) utf8str);
860  appendBinaryStringInfo(lex->strval, utf8str, utf8len);
861  }
862  else if (ch <= 0x007f)
863  {
864  /* The ASCII range is the same in all encodings */
865  appendStringInfoChar(lex->strval, (char) ch);
866  }
867  else
869 #endif /* FRONTEND */
870  }
871  }
872  else if (lex->strval != NULL)
873  {
874  if (hi_surrogate != -1)
876 
877  switch (*s)
878  {
879  case '"':
880  case '\\':
881  case '/':
882  appendStringInfoChar(lex->strval, *s);
883  break;
884  case 'b':
885  appendStringInfoChar(lex->strval, '\b');
886  break;
887  case 'f':
888  appendStringInfoChar(lex->strval, '\f');
889  break;
890  case 'n':
891  appendStringInfoChar(lex->strval, '\n');
892  break;
893  case 'r':
894  appendStringInfoChar(lex->strval, '\r');
895  break;
896  case 't':
897  appendStringInfoChar(lex->strval, '\t');
898  break;
899  default:
900 
901  /*
902  * Not a valid string escape, so signal error. We
903  * adjust token_start so that just the escape sequence
904  * is reported, not the whole string.
905  */
906  lex->token_start = s;
908  }
909  }
910  else if (strchr("\"\\/bfnrt", *s) == NULL)
911  {
912  /*
913  * Simpler processing if we're not bothered about de-escaping
914  *
915  * It's very tempting to remove the strchr() call here and
916  * replace it with a switch statement, but testing so far has
917  * shown it's not a performance win.
918  */
919  lex->token_start = s;
921  }
922  }
923  else
924  {
925  char *p = s;
926 
927  if (hi_surrogate != -1)
929 
930  /*
931  * Skip to the first byte that requires special handling, so we
932  * can batch calls to appendBinaryStringInfo.
933  */
934  while (p < end - sizeof(Vector8) &&
935  !pg_lfind8('\\', (uint8 *) p, sizeof(Vector8)) &&
936  !pg_lfind8('"', (uint8 *) p, sizeof(Vector8)) &&
937  !pg_lfind8_le(31, (uint8 *) p, sizeof(Vector8)))
938  p += sizeof(Vector8);
939 
940  for (; p < end; p++)
941  {
942  if (*p == '\\' || *p == '"')
943  break;
944  else if ((unsigned char) *p <= 31)
945  {
946  /* Per RFC4627, these characters MUST be escaped. */
947  /*
948  * Since *p isn't printable, exclude it from the context
949  * string
950  */
951  lex->token_terminator = p;
952  return JSON_ESCAPING_REQUIRED;
953  }
954  }
955 
956  if (lex->strval != NULL)
957  appendBinaryStringInfo(lex->strval, s, p - s);
958 
959  /*
960  * s will be incremented at the top of the loop, so set it to just
961  * behind our lookahead position
962  */
963  s = p - 1;
964  }
965  }
966 
967  if (hi_surrogate != -1)
968  {
969  lex->token_terminator = s + 1;
971  }
972 
973  /* Hooray, we found the end of the string! */
975  lex->token_terminator = s + 1;
976  return JSON_SUCCESS;
977 
978 #undef FAIL_AT_CHAR_START
979 #undef FAIL_AT_CHAR_END
980 }
981 
982 /*
983  * The next token in the input stream is known to be a number; lex it.
984  *
985  * In JSON, a number consists of four parts:
986  *
987  * (1) An optional minus sign ('-').
988  *
989  * (2) Either a single '0', or a string of one or more digits that does not
990  * begin with a '0'.
991  *
992  * (3) An optional decimal part, consisting of a period ('.') followed by
993  * one or more digits. (Note: While this part can be omitted
994  * completely, it's not OK to have only the decimal point without
995  * any digits afterwards.)
996  *
997  * (4) An optional exponent part, consisting of 'e' or 'E', optionally
998  * followed by '+' or '-', followed by one or more digits. (Note:
999  * As with the decimal part, if 'e' or 'E' is present, it must be
1000  * followed by at least one digit.)
1001  *
1002  * The 's' argument to this function points to the ostensible beginning
1003  * of part 2 - i.e. the character after any optional minus sign, or the
1004  * first character of the string if there is none.
1005  *
1006  * If num_err is not NULL, we return an error flag to *num_err rather than
1007  * raising an error for a badly-formed number. Also, if total_len is not NULL
1008  * the distance from lex->input to the token end+1 is returned to *total_len.
1009  */
1010 static inline JsonParseErrorType
1012  bool *num_err, int *total_len)
1013 {
1014  bool error = false;
1015  int len = s - lex->input;
1016 
1017  /* Part (1): leading sign indicator. */
1018  /* Caller already did this for us; so do nothing. */
1019 
1020  /* Part (2): parse main digit string. */
1021  if (len < lex->input_length && *s == '0')
1022  {
1023  s++;
1024  len++;
1025  }
1026  else if (len < lex->input_length && *s >= '1' && *s <= '9')
1027  {
1028  do
1029  {
1030  s++;
1031  len++;
1032  } while (len < lex->input_length && *s >= '0' && *s <= '9');
1033  }
1034  else
1035  error = true;
1036 
1037  /* Part (3): parse optional decimal portion. */
1038  if (len < lex->input_length && *s == '.')
1039  {
1040  s++;
1041  len++;
1042  if (len == lex->input_length || *s < '0' || *s > '9')
1043  error = true;
1044  else
1045  {
1046  do
1047  {
1048  s++;
1049  len++;
1050  } while (len < lex->input_length && *s >= '0' && *s <= '9');
1051  }
1052  }
1053 
1054  /* Part (4): parse optional exponent. */
1055  if (len < lex->input_length && (*s == 'e' || *s == 'E'))
1056  {
1057  s++;
1058  len++;
1059  if (len < lex->input_length && (*s == '+' || *s == '-'))
1060  {
1061  s++;
1062  len++;
1063  }
1064  if (len == lex->input_length || *s < '0' || *s > '9')
1065  error = true;
1066  else
1067  {
1068  do
1069  {
1070  s++;
1071  len++;
1072  } while (len < lex->input_length && *s >= '0' && *s <= '9');
1073  }
1074  }
1075 
1076  /*
1077  * Check for trailing garbage. As in json_lex(), any alphanumeric stuff
1078  * here should be considered part of the token for error-reporting
1079  * purposes.
1080  */
1081  for (; len < lex->input_length && JSON_ALPHANUMERIC_CHAR(*s); s++, len++)
1082  error = true;
1083 
1084  if (total_len != NULL)
1085  *total_len = len;
1086 
1087  if (num_err != NULL)
1088  {
1089  /* let the caller handle any error */
1090  *num_err = error;
1091  }
1092  else
1093  {
1094  /* return token endpoint */
1096  lex->token_terminator = s;
1097  /* handle error if any */
1098  if (error)
1099  return JSON_INVALID_TOKEN;
1100  }
1101 
1102  return JSON_SUCCESS;
1103 }
1104 
1105 /*
1106  * Report a parse error.
1107  *
1108  * lex->token_start and lex->token_terminator must identify the current token.
1109  */
1110 static JsonParseErrorType
1112 {
1113  /* Handle case where the input ended prematurely. */
1114  if (lex->token_start == NULL || lex->token_type == JSON_TOKEN_END)
1115  return JSON_EXPECTED_MORE;
1116 
1117  /* Otherwise choose the error type based on the parsing context. */
1118  switch (ctx)
1119  {
1120  case JSON_PARSE_END:
1121  return JSON_EXPECTED_END;
1122  case JSON_PARSE_VALUE:
1123  return JSON_EXPECTED_JSON;
1124  case JSON_PARSE_STRING:
1125  return JSON_EXPECTED_STRING;
1128  case JSON_PARSE_ARRAY_NEXT:
1129  return JSON_EXPECTED_ARRAY_NEXT;
1133  return JSON_EXPECTED_COLON;
1137  return JSON_EXPECTED_STRING;
1138  }
1139 
1140  /*
1141  * We don't use a default: case, so that the compiler will warn about
1142  * unhandled enum values.
1143  */
1144  Assert(false);
1145  return JSON_SUCCESS; /* silence stupider compilers */
1146 }
1147 
1148 
1149 #ifndef FRONTEND
1150 /*
1151  * Extract the current token from a lexing context, for error reporting.
1152  */
1153 static char *
1155 {
1156  int toklen = lex->token_terminator - lex->token_start;
1157  char *token = palloc(toklen + 1);
1158 
1159  memcpy(token, lex->token_start, toklen);
1160  token[toklen] = '\0';
1161  return token;
1162 }
1163 
1164 /*
1165  * Construct an (already translated) detail message for a JSON error.
1166  *
1167  * Note that the error message generated by this routine may not be
1168  * palloc'd, making it unsafe for frontend code as there is no way to
1169  * know if this can be safely pfree'd or not.
1170  */
1171 char *
1173 {
1174  switch (error)
1175  {
1176  case JSON_SUCCESS:
1177  /* fall through to the error code after switch */
1178  break;
1179  case JSON_ESCAPING_INVALID:
1180  return psprintf(_("Escape sequence \"\\%s\" is invalid."),
1181  extract_token(lex));
1183  return psprintf(_("Character with value 0x%02x must be escaped."),
1184  (unsigned char) *(lex->token_terminator));
1185  case JSON_EXPECTED_END:
1186  return psprintf(_("Expected end of input, but found \"%s\"."),
1187  extract_token(lex));
1189  return psprintf(_("Expected array element or \"]\", but found \"%s\"."),
1190  extract_token(lex));
1192  return psprintf(_("Expected \",\" or \"]\", but found \"%s\"."),
1193  extract_token(lex));
1194  case JSON_EXPECTED_COLON:
1195  return psprintf(_("Expected \":\", but found \"%s\"."),
1196  extract_token(lex));
1197  case JSON_EXPECTED_JSON:
1198  return psprintf(_("Expected JSON value, but found \"%s\"."),
1199  extract_token(lex));
1200  case JSON_EXPECTED_MORE:
1201  return _("The input string ended unexpectedly.");
1203  return psprintf(_("Expected string or \"}\", but found \"%s\"."),
1204  extract_token(lex));
1206  return psprintf(_("Expected \",\" or \"}\", but found \"%s\"."),
1207  extract_token(lex));
1208  case JSON_EXPECTED_STRING:
1209  return psprintf(_("Expected string, but found \"%s\"."),
1210  extract_token(lex));
1211  case JSON_INVALID_TOKEN:
1212  return psprintf(_("Token \"%s\" is invalid."),
1213  extract_token(lex));
1215  return _("\\u0000 cannot be converted to text.");
1217  return _("\"\\u\" must be followed by four hexadecimal digits.");
1219  /* note: this case is only reachable in frontend not backend */
1220  return _("Unicode escape values cannot be used for code point values above 007F when the encoding is not UTF8.");
1222  /* note: this case is only reachable in backend not frontend */
1223  return psprintf(_("Unicode escape value could not be translated to the server's encoding %s."),
1226  return _("Unicode high surrogate must not follow a high surrogate.");
1228  return _("Unicode low surrogate must follow a high surrogate.");
1230  /* fall through to the error code after switch */
1231  break;
1232  }
1233 
1234  /*
1235  * We don't use a default: case, so that the compiler will warn about
1236  * unhandled enum values. But this needs to be here anyway to cover the
1237  * possibility of an incorrect input.
1238  */
1239  elog(ERROR, "unexpected json parse error type: %d", (int) error);
1240  return NULL;
1241 }
1242 #endif
#define unconstify(underlying_type, expr)
Definition: c.h:1234
unsigned char uint8
Definition: c.h:493
#define _(x)
Definition: elog.c:91
#define ERROR
Definition: elog.h:39
#define token
Definition: indent_globs.h:126
long val
Definition: informix.c:664
int i
Definition: isn.c:73
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
JsonParseContext
Definition: jsonapi.c:34
@ JSON_PARSE_OBJECT_LABEL
Definition: jsonapi.c:40
@ JSON_PARSE_VALUE
Definition: jsonapi.c:35
@ JSON_PARSE_OBJECT_START
Definition: jsonapi.c:39
@ JSON_PARSE_ARRAY_START
Definition: jsonapi.c:37
@ JSON_PARSE_END
Definition: jsonapi.c:43
@ JSON_PARSE_OBJECT_NEXT
Definition: jsonapi.c:41
@ JSON_PARSE_ARRAY_NEXT
Definition: jsonapi.c:38
@ JSON_PARSE_OBJECT_COMMA
Definition: jsonapi.c:42
@ JSON_PARSE_STRING
Definition: jsonapi.c:36
static JsonParseErrorType json_lex_string(JsonLexContext *lex)
Definition: jsonapi.c:746
#define JSON_ALPHANUMERIC_CHAR(c)
Definition: jsonapi.c:92
static JsonParseErrorType parse_object_field(JsonLexContext *lex, JsonSemAction *sem)
Definition: jsonapi.c:343
#define FAIL_AT_CHAR_START(code)
static JsonParseErrorType parse_array_element(JsonLexContext *lex, JsonSemAction *sem)
Definition: jsonapi.c:479
static char * extract_token(JsonLexContext *lex)
Definition: jsonapi.c:1154
static JsonTokenType lex_peek(JsonLexContext *lex)
Definition: jsonapi.c:71
static JsonParseErrorType parse_object(JsonLexContext *lex, JsonSemAction *sem)
Definition: jsonapi.c:405
char * json_errdetail(JsonParseErrorType error, JsonLexContext *lex)
Definition: jsonapi.c:1172
bool IsValidJsonNumber(const char *str, int len)
Definition: jsonapi.c:105
static JsonParseErrorType report_parse_error(JsonParseContext ctx, JsonLexContext *lex)
Definition: jsonapi.c:1111
JsonSemAction nullSemAction
Definition: jsonapi.c:57
static JsonParseErrorType lex_expect(JsonParseContext ctx, JsonLexContext *lex, JsonTokenType token)
Definition: jsonapi.c:83
JsonParseErrorType pg_parse_json(JsonLexContext *lex, JsonSemAction *sem)
Definition: jsonapi.c:205
static JsonParseErrorType parse_array(JsonLexContext *lex, JsonSemAction *sem)
Definition: jsonapi.c:523
static JsonParseErrorType parse_scalar(JsonLexContext *lex, JsonSemAction *sem)
Definition: jsonapi.c:299
JsonParseErrorType json_lex(JsonLexContext *lex)
Definition: jsonapi.c:588
JsonLexContext * makeJsonLexContextCstringLen(JsonLexContext *lex, char *json, int len, int encoding, bool need_escapes)
Definition: jsonapi.c:153
JsonParseErrorType json_count_array_elements(JsonLexContext *lex, int *elements)
Definition: jsonapi.c:245
void freeJsonLexContext(JsonLexContext *lex)
Definition: jsonapi.c:183
static JsonParseErrorType json_lex_number(JsonLexContext *lex, char *s, bool *num_err, int *total_len)
Definition: jsonapi.c:1011
#define FAIL_AT_CHAR_END(code)
JsonParseErrorType(* json_struct_action)(void *state)
Definition: jsonapi.h:94
JsonParseErrorType(* json_aelem_action)(void *state, bool isnull)
Definition: jsonapi.h:96
#define JSONLEX_FREE_STRVAL
Definition: jsonapi.h:77
JsonParseErrorType
Definition: jsonapi.h:37
@ JSON_SEM_ACTION_FAILED
Definition: jsonapi.h:57
@ JSON_EXPECTED_ARRAY_FIRST
Definition: jsonapi.h:41
@ JSON_EXPECTED_MORE
Definition: jsonapi.h:46
@ JSON_UNICODE_HIGH_SURROGATE
Definition: jsonapi.h:55
@ JSON_EXPECTED_COLON
Definition: jsonapi.h:43
@ JSON_EXPECTED_OBJECT_FIRST
Definition: jsonapi.h:47
@ JSON_UNICODE_CODE_POINT_ZERO
Definition: jsonapi.h:51
@ JSON_EXPECTED_STRING
Definition: jsonapi.h:49
@ JSON_UNICODE_ESCAPE_FORMAT
Definition: jsonapi.h:52
@ JSON_SUCCESS
Definition: jsonapi.h:38
@ JSON_UNICODE_UNTRANSLATABLE
Definition: jsonapi.h:54
@ JSON_EXPECTED_OBJECT_NEXT
Definition: jsonapi.h:48
@ JSON_ESCAPING_REQUIRED
Definition: jsonapi.h:40
@ JSON_EXPECTED_JSON
Definition: jsonapi.h:45
@ JSON_INVALID_TOKEN
Definition: jsonapi.h:50
@ JSON_ESCAPING_INVALID
Definition: jsonapi.h:39
@ JSON_EXPECTED_END
Definition: jsonapi.h:44
@ JSON_EXPECTED_ARRAY_NEXT
Definition: jsonapi.h:42
@ JSON_UNICODE_HIGH_ESCAPE
Definition: jsonapi.h:53
@ JSON_UNICODE_LOW_SURROGATE
Definition: jsonapi.h:56
JsonParseErrorType(* json_ofield_action)(void *state, char *fname, bool isnull)
Definition: jsonapi.h:95
#define JSONLEX_FREE_STRUCT
Definition: jsonapi.h:76
JsonTokenType
Definition: jsonapi.h:20
@ JSON_TOKEN_COMMA
Definition: jsonapi.h:28
@ JSON_TOKEN_FALSE
Definition: jsonapi.h:31
@ JSON_TOKEN_END
Definition: jsonapi.h:33
@ JSON_TOKEN_TRUE
Definition: jsonapi.h:30
@ JSON_TOKEN_OBJECT_END
Definition: jsonapi.h:25
@ JSON_TOKEN_NULL
Definition: jsonapi.h:32
@ JSON_TOKEN_ARRAY_END
Definition: jsonapi.h:27
@ JSON_TOKEN_OBJECT_START
Definition: jsonapi.h:24
@ JSON_TOKEN_NUMBER
Definition: jsonapi.h:23
@ JSON_TOKEN_STRING
Definition: jsonapi.h:22
@ JSON_TOKEN_COLON
Definition: jsonapi.h:29
@ JSON_TOKEN_ARRAY_START
Definition: jsonapi.h:26
JsonParseErrorType(* json_scalar_action)(void *state, char *token, JsonTokenType tokentype)
Definition: jsonapi.h:97
Assert(fmt[strlen(fmt) - 1] !='\n')
bool pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
Definition: mbutils.c:927
const char * GetDatabaseEncodingName(void)
Definition: mbutils.c:1274
char * pstrdup(const char *in)
Definition: mcxt.c:1619
void pfree(void *pointer)
Definition: mcxt.c:1431
void * palloc0(Size size)
Definition: mcxt.c:1232
void * palloc(Size size)
Definition: mcxt.c:1201
const void size_t len
int32 encoding
Definition: pg_database.h:41
static bool pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
Definition: pg_lfind.h:58
static bool pg_lfind8(uint8 key, uint8 *base, uint32 nelem)
Definition: pg_lfind.h:26
#define pg_utf_mblen
Definition: pg_wchar.h:564
@ PG_UTF8
Definition: pg_wchar.h:233
#define MAX_UNICODE_EQUIVALENT_STRING
Definition: pg_wchar.h:330
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
Definition: pg_wchar.h:544
static bool is_utf16_surrogate_first(pg_wchar c)
Definition: pg_wchar.h:532
static bool is_utf16_surrogate_second(pg_wchar c)
Definition: pg_wchar.h:538
void check_stack_depth(void)
Definition: postgres.c:3523
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
uint64 Vector8
Definition: simd.h:60
static void error(void)
Definition: sql-dyntest.c:147
StringInfo makeStringInfo(void)
Definition: stringinfo.c:41
void resetStringInfo(StringInfo str)
Definition: stringinfo.c:78
void appendBinaryStringInfo(StringInfo str, const void *data, int datalen)
Definition: stringinfo.c:233
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:182
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:194
char * input
Definition: jsonapi.h:80
bits32 flags
Definition: jsonapi.h:88
char * token_start
Definition: jsonapi.h:83
int input_encoding
Definition: jsonapi.h:82
StringInfo strval
Definition: jsonapi.h:91
char * token_terminator
Definition: jsonapi.h:84
char * prev_token_terminator
Definition: jsonapi.h:85
char * line_start
Definition: jsonapi.h:90
int lex_level
Definition: jsonapi.h:87
int input_length
Definition: jsonapi.h:81
int line_number
Definition: jsonapi.h:89
JsonTokenType token_type
Definition: jsonapi.h:86
json_struct_action array_end
Definition: jsonapi.h:123
json_struct_action object_start
Definition: jsonapi.h:120
json_ofield_action object_field_start
Definition: jsonapi.h:124
json_aelem_action array_element_start
Definition: jsonapi.h:126
json_scalar_action scalar
Definition: jsonapi.h:128
void * semstate
Definition: jsonapi.h:119
json_aelem_action array_element_end
Definition: jsonapi.h:127
json_struct_action array_start
Definition: jsonapi.h:122
json_struct_action object_end
Definition: jsonapi.h:121
json_ofield_action object_field_end
Definition: jsonapi.h:125
unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: wchar.c:484