PostgreSQL Source Code  git master
jsonapi.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * jsonapi.c
4  * JSON parser and lexer interfaces
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/common/jsonapi.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #ifndef FRONTEND
15 #include "postgres.h"
16 #else
17 #include "postgres_fe.h"
18 #endif
19 
20 #include "common/jsonapi.h"
21 #include "mb/pg_wchar.h"
22 
23 #ifdef FRONTEND
24 #include "common/logging.h"
25 #else
26 #include "miscadmin.h"
27 #endif
28 
29 #ifdef FRONTEND
30 #define check_stack_depth()
31 #define json_log_and_abort(...) \
32  do { pg_log_fatal(__VA_ARGS__); exit(1); } while(0)
33 #else
34 #define json_log_and_abort(...) elog(ERROR, __VA_ARGS__)
35 #endif
36 
/*
 * The context of the parser is maintained by the recursive descent
 * mechanism, but is passed explicitly to the error reporting routine
 * for better diagnostics.
 *
 * Each value names what the parser had just seen / was expecting when it
 * called report_parse_error(), which maps it to a JsonParseErrorType.
 */
typedef enum					/* contexts of JSON parser */
{
	JSON_PARSE_VALUE,			/* expecting a value */
	JSON_PARSE_STRING,			/* expecting a string (for a field name) */
	JSON_PARSE_ARRAY_START,		/* saw '[', expecting value or ']' */
	JSON_PARSE_ARRAY_NEXT,		/* saw array element, expecting ',' or ']' */
	JSON_PARSE_OBJECT_START,	/* saw '{', expecting label or '}' */
	JSON_PARSE_OBJECT_LABEL,	/* saw object label, expecting ':' */
	JSON_PARSE_OBJECT_NEXT,		/* saw object value, expecting ',' or '}' */
	JSON_PARSE_OBJECT_COMMA,	/* saw object ',', expecting next label */
	JSON_PARSE_END				/* saw the end of a document, expect nothing */
} JsonParseContext;
54 
56 static inline JsonParseErrorType json_lex_number(JsonLexContext *lex, char *s,
57  bool *num_err, int *total_len);
64 static char *extract_token(JsonLexContext *lex);
65 
66 /* the null action object used for pure validation */
68 {
69  NULL, NULL, NULL, NULL, NULL,
70  NULL, NULL, NULL, NULL, NULL
71 };
72 
73 /* Recursive Descent parser support routines */
74 
75 /*
76  * lex_peek
77  *
78  * what is the current look_ahead token?
79 */
80 static inline JsonTokenType
82 {
83  return lex->token_type;
84 }
85 
86 /*
87  * lex_expect
88  *
89  * move the lexer to the next token if the current look_ahead token matches
90  * the parameter token. Otherwise, report an error.
91  */
92 static inline JsonParseErrorType
94 {
95  if (lex_peek(lex) == token)
96  return json_lex(lex);
97  else
98  return report_parse_error(ctx, lex);
99 }
100 
/*
 * Does character c belong to an alphanumeric token?
 *
 * True for ASCII letters, ASCII digits, underscore, and any byte with the
 * high bit set.  Note that the argument is evaluated more than once.
 */
#define JSON_ALPHANUMERIC_CHAR(c)  \
	(IS_HIGHBIT_SET(c) || \
	 (c) == '_' || \
	 ((c) >= '0' && (c) <= '9') || \
	 ((c) >= 'A' && (c) <= 'Z') || \
	 ((c) >= 'a' && (c) <= 'z'))
108 
109 /*
110  * Utility function to check if a string is a valid JSON number.
111  *
112  * str is of length len, and need not be null-terminated.
113  */
114 bool
115 IsValidJsonNumber(const char *str, int len)
116 {
117  bool numeric_error;
118  int total_len;
119  JsonLexContext dummy_lex;
120 
121  if (len <= 0)
122  return false;
123 
124  /*
125  * json_lex_number expects a leading '-' to have been eaten already.
126  *
127  * having to cast away the constness of str is ugly, but there's not much
128  * easy alternative.
129  */
130  if (*str == '-')
131  {
132  dummy_lex.input = unconstify(char *, str) + 1;
133  dummy_lex.input_length = len - 1;
134  }
135  else
136  {
137  dummy_lex.input = unconstify(char *, str);
138  dummy_lex.input_length = len;
139  }
140 
141  json_lex_number(&dummy_lex, dummy_lex.input, &numeric_error, &total_len);
142 
143  return (!numeric_error) && (total_len == dummy_lex.input_length);
144 }
145 
146 /*
147  * makeJsonLexContextCstringLen
148  *
149  * lex constructor, with or without StringInfo object for de-escaped lexemes.
150  *
151  * Without is better as it makes the processing faster, so only make one
152  * if really required.
153  */
155 makeJsonLexContextCstringLen(char *json, int len, int encoding, bool need_escapes)
156 {
157  JsonLexContext *lex = palloc0(sizeof(JsonLexContext));
158 
159  lex->input = lex->token_terminator = lex->line_start = json;
160  lex->line_number = 1;
161  lex->input_length = len;
162  lex->input_encoding = encoding;
163  if (need_escapes)
164  lex->strval = makeStringInfo();
165  return lex;
166 }
167 
168 /*
169  * pg_parse_json
170  *
171  * Publicly visible entry point for the JSON parser.
172  *
173  * lex is a lexing context, set up for the json to be processed by calling
174  * makeJsonLexContext(). sem is a structure of function pointers to semantic
175  * action routines to be called at appropriate spots during parsing, and a
176  * pointer to a state object to be passed to those routines.
177  */
180 {
181  JsonTokenType tok;
182  JsonParseErrorType result;
183 
184  /* get the initial token */
185  result = json_lex(lex);
186  if (result != JSON_SUCCESS)
187  return result;
188 
189  tok = lex_peek(lex);
190 
191  /* parse by recursive descent */
192  switch (tok)
193  {
195  result = parse_object(lex, sem);
196  break;
198  result = parse_array(lex, sem);
199  break;
200  default:
201  result = parse_scalar(lex, sem); /* json can be a bare scalar */
202  }
203 
204  if (result == JSON_SUCCESS)
205  result = lex_expect(JSON_PARSE_END, lex, JSON_TOKEN_END);
206 
207  return result;
208 }
209 
210 /*
211  * json_count_array_elements
212  *
213  * Returns number of array elements in lex context at start of array token
214  * until end of array token at same nesting level.
215  *
216  * Designed to be called from array_start routines.
217  */
220 {
221  JsonLexContext copylex;
222  int count;
223  JsonParseErrorType result;
224 
225  /*
226  * It's safe to do this with a shallow copy because the lexical routines
227  * don't scribble on the input. They do scribble on the other pointers
228  * etc, so doing this with a copy makes that safe.
229  */
230  memcpy(&copylex, lex, sizeof(JsonLexContext));
231  copylex.strval = NULL; /* not interested in values here */
232  copylex.lex_level++;
233 
234  count = 0;
235  result = lex_expect(JSON_PARSE_ARRAY_START, &copylex,
237  if (result != JSON_SUCCESS)
238  return result;
239  if (lex_peek(&copylex) != JSON_TOKEN_ARRAY_END)
240  {
241  while (1)
242  {
243  count++;
244  result = parse_array_element(&copylex, &nullSemAction);
245  if (result != JSON_SUCCESS)
246  return result;
247  if (copylex.token_type != JSON_TOKEN_COMMA)
248  break;
249  result = json_lex(&copylex);
250  if (result != JSON_SUCCESS)
251  return result;
252  }
253  }
254  result = lex_expect(JSON_PARSE_ARRAY_NEXT, &copylex,
256  if (result != JSON_SUCCESS)
257  return result;
258 
259  *elements = count;
260  return JSON_SUCCESS;
261 }
262 
263 /*
264  * Recursive Descent parse routines. There is one for each structural
265  * element in a json document:
266  * - scalar (string, number, true, false, null)
267  * - array ( [ ] )
268  * - array element
269  * - object ( { } )
270  * - object field
271  */
272 static inline JsonParseErrorType
274 {
275  char *val = NULL;
276  json_scalar_action sfunc = sem->scalar;
277  JsonTokenType tok = lex_peek(lex);
278  JsonParseErrorType result;
279 
280  /* a scalar must be a string, a number, true, false, or null */
281  if (tok != JSON_TOKEN_STRING && tok != JSON_TOKEN_NUMBER &&
282  tok != JSON_TOKEN_TRUE && tok != JSON_TOKEN_FALSE &&
283  tok != JSON_TOKEN_NULL)
285 
286  /* if no semantic function, just consume the token */
287  if (sfunc == NULL)
288  return json_lex(lex);
289 
290  /* extract the de-escaped string value, or the raw lexeme */
291  if (lex_peek(lex) == JSON_TOKEN_STRING)
292  {
293  if (lex->strval != NULL)
294  val = pstrdup(lex->strval->data);
295  }
296  else
297  {
298  int len = (lex->token_terminator - lex->token_start);
299 
300  val = palloc(len + 1);
301  memcpy(val, lex->token_start, len);
302  val[len] = '\0';
303  }
304 
305  /* consume the token */
306  result = json_lex(lex);
307  if (result != JSON_SUCCESS)
308  return result;
309 
310  /* invoke the callback */
311  (*sfunc) (sem->semstate, val, tok);
312 
313  return JSON_SUCCESS;
314 }
315 
316 static JsonParseErrorType
318 {
319  /*
320  * An object field is "fieldname" : value where value can be a scalar,
321  * object or array. Note: in user-facing docs and error messages, we
322  * generally call a field name a "key".
323  */
324 
325  char *fname = NULL; /* keep compiler quiet */
328  bool isnull;
329  JsonTokenType tok;
330  JsonParseErrorType result;
331 
332  if (lex_peek(lex) != JSON_TOKEN_STRING)
334  if ((ostart != NULL || oend != NULL) && lex->strval != NULL)
335  fname = pstrdup(lex->strval->data);
336  result = json_lex(lex);
337  if (result != JSON_SUCCESS)
338  return result;
339 
341  if (result != JSON_SUCCESS)
342  return result;
343 
344  tok = lex_peek(lex);
345  isnull = tok == JSON_TOKEN_NULL;
346 
347  if (ostart != NULL)
348  (*ostart) (sem->semstate, fname, isnull);
349 
350  switch (tok)
351  {
353  result = parse_object(lex, sem);
354  break;
356  result = parse_array(lex, sem);
357  break;
358  default:
359  result = parse_scalar(lex, sem);
360  }
361  if (result != JSON_SUCCESS)
362  return result;
363 
364  if (oend != NULL)
365  (*oend) (sem->semstate, fname, isnull);
366  return JSON_SUCCESS;
367 }
368 
369 static JsonParseErrorType
371 {
372  /*
373  * an object is a possibly empty sequence of object fields, separated by
374  * commas and surrounded by curly braces.
375  */
376  json_struct_action ostart = sem->object_start;
377  json_struct_action oend = sem->object_end;
378  JsonTokenType tok;
379  JsonParseErrorType result;
380 
382 
383  if (ostart != NULL)
384  (*ostart) (sem->semstate);
385 
386  /*
387  * Data inside an object is at a higher nesting level than the object
388  * itself. Note that we increment this after we call the semantic routine
389  * for the object start and restore it before we call the routine for the
390  * object end.
391  */
392  lex->lex_level++;
393 
395  result = json_lex(lex);
396  if (result != JSON_SUCCESS)
397  return result;
398 
399  tok = lex_peek(lex);
400  switch (tok)
401  {
402  case JSON_TOKEN_STRING:
403  result = parse_object_field(lex, sem);
404  while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA)
405  {
406  result = json_lex(lex);
407  if (result != JSON_SUCCESS)
408  break;
409  result = parse_object_field(lex, sem);
410  }
411  break;
413  break;
414  default:
415  /* case of an invalid initial token inside the object */
417  }
418  if (result != JSON_SUCCESS)
419  return result;
420 
422  if (result != JSON_SUCCESS)
423  return result;
424 
425  lex->lex_level--;
426 
427  if (oend != NULL)
428  (*oend) (sem->semstate);
429 
430  return JSON_SUCCESS;
431 }
432 
433 static JsonParseErrorType
435 {
438  JsonTokenType tok = lex_peek(lex);
439  JsonParseErrorType result;
440 
441  bool isnull;
442 
443  isnull = tok == JSON_TOKEN_NULL;
444 
445  if (astart != NULL)
446  (*astart) (sem->semstate, isnull);
447 
448  /* an array element is any object, array or scalar */
449  switch (tok)
450  {
452  result = parse_object(lex, sem);
453  break;
455  result = parse_array(lex, sem);
456  break;
457  default:
458  result = parse_scalar(lex, sem);
459  }
460 
461  if (result != JSON_SUCCESS)
462  return result;
463 
464  if (aend != NULL)
465  (*aend) (sem->semstate, isnull);
466 
467  return JSON_SUCCESS;
468 }
469 
470 static JsonParseErrorType
472 {
473  /*
474  * an array is a possibly empty sequence of array elements, separated by
475  * commas and surrounded by square brackets.
476  */
477  json_struct_action astart = sem->array_start;
478  json_struct_action aend = sem->array_end;
479  JsonParseErrorType result;
480 
482 
483  if (astart != NULL)
484  (*astart) (sem->semstate);
485 
486  /*
487  * Data inside an array is at a higher nesting level than the array
488  * itself. Note that we increment this after we call the semantic routine
489  * for the array start and restore it before we call the routine for the
490  * array end.
491  */
492  lex->lex_level++;
493 
495  if (result == JSON_SUCCESS && lex_peek(lex) != JSON_TOKEN_ARRAY_END)
496  {
497  result = parse_array_element(lex, sem);
498 
499  while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA)
500  {
501  result = json_lex(lex);
502  if (result != JSON_SUCCESS)
503  break;
504  result = parse_array_element(lex, sem);
505  }
506  }
507  if (result != JSON_SUCCESS)
508  return result;
509 
511  if (result != JSON_SUCCESS)
512  return result;
513 
514  lex->lex_level--;
515 
516  if (aend != NULL)
517  (*aend) (sem->semstate);
518 
519  return JSON_SUCCESS;
520 }
521 
522 /*
523  * Lex one token from the input stream.
524  */
527 {
528  char *s;
529  int len;
530  JsonParseErrorType result;
531 
532  /* Skip leading whitespace. */
533  s = lex->token_terminator;
534  len = s - lex->input;
535  while (len < lex->input_length &&
536  (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
537  {
538  if (*s == '\n')
539  ++lex->line_number;
540  ++s;
541  ++len;
542  }
543  lex->token_start = s;
544 
545  /* Determine token type. */
546  if (len >= lex->input_length)
547  {
548  lex->token_start = NULL;
550  lex->token_terminator = s;
551  lex->token_type = JSON_TOKEN_END;
552  }
553  else
554  {
555  switch (*s)
556  {
557  /* Single-character token, some kind of punctuation mark. */
558  case '{':
560  lex->token_terminator = s + 1;
562  break;
563  case '}':
565  lex->token_terminator = s + 1;
567  break;
568  case '[':
570  lex->token_terminator = s + 1;
572  break;
573  case ']':
575  lex->token_terminator = s + 1;
577  break;
578  case ',':
580  lex->token_terminator = s + 1;
582  break;
583  case ':':
585  lex->token_terminator = s + 1;
587  break;
588  case '"':
589  /* string */
590  result = json_lex_string(lex);
591  if (result != JSON_SUCCESS)
592  return result;
594  break;
595  case '-':
596  /* Negative number. */
597  result = json_lex_number(lex, s + 1, NULL, NULL);
598  if (result != JSON_SUCCESS)
599  return result;
601  break;
602  case '0':
603  case '1':
604  case '2':
605  case '3':
606  case '4':
607  case '5':
608  case '6':
609  case '7':
610  case '8':
611  case '9':
612  /* Positive number. */
613  result = json_lex_number(lex, s, NULL, NULL);
614  if (result != JSON_SUCCESS)
615  return result;
617  break;
618  default:
619  {
620  char *p;
621 
622  /*
623  * We're not dealing with a string, number, legal
624  * punctuation mark, or end of string. The only legal
625  * tokens we might find here are true, false, and null,
626  * but for error reporting purposes we scan until we see a
627  * non-alphanumeric character. That way, we can report
628  * the whole word as an unexpected token, rather than just
629  * some unintuitive prefix thereof.
630  */
631  for (p = s; p - s < lex->input_length - len && JSON_ALPHANUMERIC_CHAR(*p); p++)
632  /* skip */ ;
633 
634  /*
635  * We got some sort of unexpected punctuation or an
636  * otherwise unexpected character, so just complain about
637  * that one character.
638  */
639  if (p == s)
640  {
642  lex->token_terminator = s + 1;
643  return JSON_INVALID_TOKEN;
644  }
645 
646  /*
647  * We've got a real alphanumeric token here. If it
648  * happens to be true, false, or null, all is well. If
649  * not, error out.
650  */
652  lex->token_terminator = p;
653  if (p - s == 4)
654  {
655  if (memcmp(s, "true", 4) == 0)
657  else if (memcmp(s, "null", 4) == 0)
659  else
660  return JSON_INVALID_TOKEN;
661  }
662  else if (p - s == 5 && memcmp(s, "false", 5) == 0)
664  else
665  return JSON_INVALID_TOKEN;
666 
667  }
668  } /* end of switch */
669  }
670 
671  return JSON_SUCCESS;
672 }
673 
674 /*
675  * The next token in the input stream is known to be a string; lex it.
676  */
677 static inline JsonParseErrorType
679 {
680  char *s;
681  int len;
682  int hi_surrogate = -1;
683 
684  if (lex->strval != NULL)
685  resetStringInfo(lex->strval);
686 
687  Assert(lex->input_length > 0);
688  s = lex->token_start;
689  len = lex->token_start - lex->input;
690  for (;;)
691  {
692  s++;
693  len++;
694  /* Premature end of the string. */
695  if (len >= lex->input_length)
696  {
697  lex->token_terminator = s;
698  return JSON_INVALID_TOKEN;
699  }
700  else if (*s == '"')
701  break;
702  else if ((unsigned char) *s < 32)
703  {
704  /* Per RFC4627, these characters MUST be escaped. */
705  /* Since *s isn't printable, exclude it from the context string */
706  lex->token_terminator = s;
707  return JSON_ESCAPING_REQUIRED;
708  }
709  else if (*s == '\\')
710  {
711  /* OK, we have an escape character. */
712  s++;
713  len++;
714  if (len >= lex->input_length)
715  {
716  lex->token_terminator = s;
717  return JSON_INVALID_TOKEN;
718  }
719  else if (*s == 'u')
720  {
721  int i;
722  int ch = 0;
723 
724  for (i = 1; i <= 4; i++)
725  {
726  s++;
727  len++;
728  if (len >= lex->input_length)
729  {
730  lex->token_terminator = s;
731  return JSON_INVALID_TOKEN;
732  }
733  else if (*s >= '0' && *s <= '9')
734  ch = (ch * 16) + (*s - '0');
735  else if (*s >= 'a' && *s <= 'f')
736  ch = (ch * 16) + (*s - 'a') + 10;
737  else if (*s >= 'A' && *s <= 'F')
738  ch = (ch * 16) + (*s - 'A') + 10;
739  else
740  {
743  }
744  }
745  if (lex->strval != NULL)
746  {
747  /*
748  * Combine surrogate pairs.
749  */
750  if (is_utf16_surrogate_first(ch))
751  {
752  if (hi_surrogate != -1)
754  hi_surrogate = ch;
755  continue;
756  }
757  else if (is_utf16_surrogate_second(ch))
758  {
759  if (hi_surrogate == -1)
761  ch = surrogate_pair_to_codepoint(hi_surrogate, ch);
762  hi_surrogate = -1;
763  }
764 
765  if (hi_surrogate != -1)
767 
768  /*
769  * Reject invalid cases. We can't have a value above
770  * 0xFFFF here (since we only accepted 4 hex digits
771  * above), so no need to test for out-of-range chars.
772  */
773  if (ch == 0)
774  {
775  /* We can't allow this, since our TEXT type doesn't */
777  }
778 
779  /*
780  * Add the represented character to lex->strval. In the
781  * backend, we can let pg_unicode_to_server() handle any
782  * required character set conversion; in frontend, we can
783  * only deal with trivial conversions.
784  *
785  * Note: pg_unicode_to_server() will throw an error for a
786  * conversion failure, rather than returning a failure
787  * indication. That seems OK.
788  */
789 #ifndef FRONTEND
790  {
791  char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
792 
793  pg_unicode_to_server(ch, (unsigned char *) cbuf);
794  appendStringInfoString(lex->strval, cbuf);
795  }
796 #else
797  if (lex->input_encoding == PG_UTF8)
798  {
799  /* OK, we can map the code point to UTF8 easily */
800  char utf8str[5];
801  int utf8len;
802 
803  unicode_to_utf8(ch, (unsigned char *) utf8str);
804  utf8len = pg_utf_mblen((unsigned char *) utf8str);
805  appendBinaryStringInfo(lex->strval, utf8str, utf8len);
806  }
807  else if (ch <= 0x007f)
808  {
809  /* The ASCII range is the same in all encodings */
810  appendStringInfoChar(lex->strval, (char) ch);
811  }
812  else
814 #endif /* FRONTEND */
815  }
816  }
817  else if (lex->strval != NULL)
818  {
819  if (hi_surrogate != -1)
821 
822  switch (*s)
823  {
824  case '"':
825  case '\\':
826  case '/':
827  appendStringInfoChar(lex->strval, *s);
828  break;
829  case 'b':
830  appendStringInfoChar(lex->strval, '\b');
831  break;
832  case 'f':
833  appendStringInfoChar(lex->strval, '\f');
834  break;
835  case 'n':
836  appendStringInfoChar(lex->strval, '\n');
837  break;
838  case 'r':
839  appendStringInfoChar(lex->strval, '\r');
840  break;
841  case 't':
842  appendStringInfoChar(lex->strval, '\t');
843  break;
844  default:
845  /* Not a valid string escape, so signal error. */
846  lex->token_start = s;
848  return JSON_ESCAPING_INVALID;
849  }
850  }
851  else if (strchr("\"\\/bfnrt", *s) == NULL)
852  {
853  /*
854  * Simpler processing if we're not bothered about de-escaping
855  *
856  * It's very tempting to remove the strchr() call here and
857  * replace it with a switch statement, but testing so far has
858  * shown it's not a performance win.
859  */
860  lex->token_start = s;
862  return JSON_ESCAPING_INVALID;
863  }
864 
865  }
866  else if (lex->strval != NULL)
867  {
868  if (hi_surrogate != -1)
870 
871  appendStringInfoChar(lex->strval, *s);
872  }
873 
874  }
875 
876  if (hi_surrogate != -1)
878 
879  /* Hooray, we found the end of the string! */
881  lex->token_terminator = s + 1;
882  return JSON_SUCCESS;
883 }
884 
885 /*
886  * The next token in the input stream is known to be a number; lex it.
887  *
888  * In JSON, a number consists of four parts:
889  *
890  * (1) An optional minus sign ('-').
891  *
892  * (2) Either a single '0', or a string of one or more digits that does not
893  * begin with a '0'.
894  *
895  * (3) An optional decimal part, consisting of a period ('.') followed by
896  * one or more digits. (Note: While this part can be omitted
897  * completely, it's not OK to have only the decimal point without
898  * any digits afterwards.)
899  *
900  * (4) An optional exponent part, consisting of 'e' or 'E', optionally
901  * followed by '+' or '-', followed by one or more digits. (Note:
902  * As with the decimal part, if 'e' or 'E' is present, it must be
903  * followed by at least one digit.)
904  *
905  * The 's' argument to this function points to the ostensible beginning
906  * of part 2 - i.e. the character after any optional minus sign, or the
907  * first character of the string if there is none.
908  *
909  * If num_err is not NULL, we return an error flag to *num_err rather than
910  * raising an error for a badly-formed number. Also, if total_len is not NULL
911  * the distance from lex->input to the token end+1 is returned to *total_len.
912  */
913 static inline JsonParseErrorType
915  bool *num_err, int *total_len)
916 {
917  bool error = false;
918  int len = s - lex->input;
919 
920  /* Part (1): leading sign indicator. */
921  /* Caller already did this for us; so do nothing. */
922 
923  /* Part (2): parse main digit string. */
924  if (len < lex->input_length && *s == '0')
925  {
926  s++;
927  len++;
928  }
929  else if (len < lex->input_length && *s >= '1' && *s <= '9')
930  {
931  do
932  {
933  s++;
934  len++;
935  } while (len < lex->input_length && *s >= '0' && *s <= '9');
936  }
937  else
938  error = true;
939 
940  /* Part (3): parse optional decimal portion. */
941  if (len < lex->input_length && *s == '.')
942  {
943  s++;
944  len++;
945  if (len == lex->input_length || *s < '0' || *s > '9')
946  error = true;
947  else
948  {
949  do
950  {
951  s++;
952  len++;
953  } while (len < lex->input_length && *s >= '0' && *s <= '9');
954  }
955  }
956 
957  /* Part (4): parse optional exponent. */
958  if (len < lex->input_length && (*s == 'e' || *s == 'E'))
959  {
960  s++;
961  len++;
962  if (len < lex->input_length && (*s == '+' || *s == '-'))
963  {
964  s++;
965  len++;
966  }
967  if (len == lex->input_length || *s < '0' || *s > '9')
968  error = true;
969  else
970  {
971  do
972  {
973  s++;
974  len++;
975  } while (len < lex->input_length && *s >= '0' && *s <= '9');
976  }
977  }
978 
979  /*
980  * Check for trailing garbage. As in json_lex(), any alphanumeric stuff
981  * here should be considered part of the token for error-reporting
982  * purposes.
983  */
984  for (; len < lex->input_length && JSON_ALPHANUMERIC_CHAR(*s); s++, len++)
985  error = true;
986 
987  if (total_len != NULL)
988  *total_len = len;
989 
990  if (num_err != NULL)
991  {
992  /* let the caller handle any error */
993  *num_err = error;
994  }
995  else
996  {
997  /* return token endpoint */
999  lex->token_terminator = s;
1000  /* handle error if any */
1001  if (error)
1002  return JSON_INVALID_TOKEN;
1003  }
1004 
1005  return JSON_SUCCESS;
1006 }
1007 
1008 /*
1009  * Report a parse error.
1010  *
1011  * lex->token_start and lex->token_terminator must identify the current token.
1012  */
1013 static JsonParseErrorType
1015 {
1016  /* Handle case where the input ended prematurely. */
1017  if (lex->token_start == NULL || lex->token_type == JSON_TOKEN_END)
1018  return JSON_EXPECTED_MORE;
1019 
1020  /* Otherwise choose the error type based on the parsing context. */
1021  switch (ctx)
1022  {
1023  case JSON_PARSE_END:
1024  return JSON_EXPECTED_END;
1025  case JSON_PARSE_VALUE:
1026  return JSON_EXPECTED_JSON;
1027  case JSON_PARSE_STRING:
1028  return JSON_EXPECTED_STRING;
1031  case JSON_PARSE_ARRAY_NEXT:
1032  return JSON_EXPECTED_ARRAY_NEXT;
1036  return JSON_EXPECTED_COLON;
1040  return JSON_EXPECTED_STRING;
1041  }
1042 
1043  /*
1044  * We don't use a default: case, so that the compiler will warn about
1045  * unhandled enum values. But this needs to be here anyway to cover the
1046  * possibility of an incorrect input.
1047  */
1048  json_log_and_abort("unexpected json parse state: %d", (int) ctx);
1049  return JSON_SUCCESS; /* silence stupider compilers */
1050 }
1051 
1052 /*
1053  * Construct a detail message for a JSON error.
1054  */
1055 char *
1057 {
1058  switch (error)
1059  {
1060  case JSON_SUCCESS:
1061  /* fall through to the error code after switch */
1062  break;
1063  case JSON_ESCAPING_INVALID:
1064  return psprintf(_("Escape sequence \"\\%s\" is invalid."),
1065  extract_token(lex));
1067  return psprintf(_("Character with value 0x%02x must be escaped."),
1068  (unsigned char) *(lex->token_terminator));
1069  case JSON_EXPECTED_END:
1070  return psprintf(_("Expected end of input, but found \"%s\"."),
1071  extract_token(lex));
1073  return psprintf(_("Expected array element or \"]\", but found \"%s\"."),
1074  extract_token(lex));
1076  return psprintf(_("Expected \",\" or \"]\", but found \"%s\"."),
1077  extract_token(lex));
1078  case JSON_EXPECTED_COLON:
1079  return psprintf(_("Expected \":\", but found \"%s\"."),
1080  extract_token(lex));
1081  case JSON_EXPECTED_JSON:
1082  return psprintf(_("Expected JSON value, but found \"%s\"."),
1083  extract_token(lex));
1084  case JSON_EXPECTED_MORE:
1085  return _("The input string ended unexpectedly.");
1087  return psprintf(_("Expected string or \"}\", but found \"%s\"."),
1088  extract_token(lex));
1090  return psprintf(_("Expected \",\" or \"}\", but found \"%s\"."),
1091  extract_token(lex));
1092  case JSON_EXPECTED_STRING:
1093  return psprintf(_("Expected string, but found \"%s\"."),
1094  extract_token(lex));
1095  case JSON_INVALID_TOKEN:
1096  return psprintf(_("Token \"%s\" is invalid."),
1097  extract_token(lex));
1099  return _("\\u0000 cannot be converted to text.");
1101  return _("\"\\u\" must be followed by four hexadecimal digits.");
1103  /* note: this case is only reachable in frontend not backend */
1104  return _("Unicode escape values cannot be used for code point values above 007F when the encoding is not UTF8.");
1106  return _("Unicode high surrogate must not follow a high surrogate.");
1108  return _("Unicode low surrogate must follow a high surrogate.");
1109  }
1110 
1111  /*
1112  * We don't use a default: case, so that the compiler will warn about
1113  * unhandled enum values. But this needs to be here anyway to cover the
1114  * possibility of an incorrect input.
1115  */
1116  json_log_and_abort("unexpected json parse error type: %d", (int) error);
1117  return NULL; /* silence stupider compilers */
1118 }
1119 
1120 /*
1121  * Extract the current token from a lexing context, for error reporting.
1122  */
1123 static char *
1125 {
1126  int toklen = lex->token_terminator - lex->token_start;
1127  char *token = palloc(toklen + 1);
1128 
1129  memcpy(token, lex->token_start, toklen);
1130  token[toklen] = '\0';
1131  return token;
1132 }
json_struct_action array_end
Definition: jsonapi.h:110
int line_number
Definition: jsonapi.h:82
void(* json_scalar_action)(void *state, char *token, JsonTokenType tokentype)
Definition: jsonapi.h:90
unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: wchar.c:483
static JsonParseErrorType parse_scalar(JsonLexContext *lex, JsonSemAction *sem)
Definition: jsonapi.c:273
static JsonParseErrorType json_lex_string(JsonLexContext *lex)
Definition: jsonapi.c:678
static void error(void)
Definition: sql-dyntest.c:147
json_struct_action object_end
Definition: jsonapi.h:108
char * pstrdup(const char *in)
Definition: mcxt.c:1187
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
StringInfo makeStringInfo(void)
Definition: stringinfo.c:41
JsonTokenType token_type
Definition: jsonapi.h:80
json_struct_action object_start
Definition: jsonapi.h:107
char * json_errdetail(JsonParseErrorType error, JsonLexContext *lex)
Definition: jsonapi.c:1056
char * prev_token_terminator
Definition: jsonapi.h:79
static bool is_utf16_surrogate_second(pg_wchar c)
Definition: pg_wchar.h:530
json_scalar_action scalar
Definition: jsonapi.h:115
JsonSemAction nullSemAction
Definition: jsonapi.c:67
JsonParseContext
Definition: jsonapi.c:42
int lex_level
Definition: jsonapi.h:81
void(* json_ofield_action)(void *state, char *fname, bool isnull)
Definition: jsonapi.h:88
static JsonParseErrorType report_parse_error(JsonParseContext ctx, JsonLexContext *lex)
Definition: jsonapi.c:1014
char * line_start
Definition: jsonapi.h:83
#define MAX_UNICODE_EQUIVALENT_STRING
Definition: pg_wchar.h:325
int input_length
Definition: jsonapi.h:75
int input_encoding
Definition: jsonapi.h:76
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:176
JsonLexContext * makeJsonLexContextCstringLen(char *json, int len, int encoding, bool need_escapes)
Definition: jsonapi.c:155
void check_stack_depth(void)
Definition: postgres.c:3312
int pg_encoding_mblen(int encoding, const char *mbstr)
Definition: wchar.c:1554
StringInfo strval
Definition: jsonapi.h:84
void resetStringInfo(StringInfo str)
Definition: stringinfo.c:75
bool IsValidJsonNumber(const char *str, int len)
Definition: jsonapi.c:115
static char * extract_token(JsonLexContext *lex)
Definition: jsonapi.c:1124
JsonParseErrorType pg_parse_json(JsonLexContext *lex, JsonSemAction *sem)
Definition: jsonapi.c:179
json_ofield_action object_field_end
Definition: jsonapi.h:112
char * token_start
Definition: jsonapi.h:77
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
Definition: pg_wchar.h:536
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:188
#define JSON_ALPHANUMERIC_CHAR(c)
Definition: jsonapi.c:102
#define unconstify(underlying_type, expr)
Definition: c.h:1185
void * palloc0(Size size)
Definition: mcxt.c:981
char * token_terminator
Definition: jsonapi.h:78
JsonParseErrorType
Definition: jsonapi.h:36
void pg_unicode_to_server(pg_wchar c, unsigned char *s)
Definition: mbutils.c:806
json_aelem_action array_element_start
Definition: jsonapi.h:113
static JsonParseErrorType parse_array_element(JsonLexContext *lex, JsonSemAction *sem)
Definition: jsonapi.c:434
#define json_log_and_abort(...)
Definition: jsonapi.c:34
static JsonParseErrorType parse_object_field(JsonLexContext *lex, JsonSemAction *sem)
Definition: jsonapi.c:317
JsonParseErrorType json_lex(JsonLexContext *lex)
Definition: jsonapi.c:526
#define Assert(condition)
Definition: c.h:746
char * input
Definition: jsonapi.h:74
json_struct_action array_start
Definition: jsonapi.h:109
JsonParseErrorType json_count_array_elements(JsonLexContext *lex, int *elements)
Definition: jsonapi.c:219
static JsonParseErrorType parse_object(JsonLexContext *lex, JsonSemAction *sem)
Definition: jsonapi.c:370
void(* json_struct_action)(void *state)
Definition: jsonapi.h:87
int32 encoding
Definition: pg_database.h:41
static JsonTokenType lex_peek(JsonLexContext *lex)
Definition: jsonapi.c:81
static JsonParseErrorType json_lex_number(JsonLexContext *lex, char *s, bool *num_err, int *total_len)
Definition: jsonapi.c:914
int pg_utf_mblen(const unsigned char *s)
Definition: wchar.c:549
void(* json_aelem_action)(void *state, bool isnull)
Definition: jsonapi.h:89
void * palloc(Size size)
Definition: mcxt.c:950
int i
json_ofield_action object_field_start
Definition: jsonapi.h:111
static bool is_utf16_surrogate_first(pg_wchar c)
Definition: pg_wchar.h:524
static JsonParseErrorType lex_expect(JsonParseContext ctx, JsonLexContext *lex, JsonTokenType token)
Definition: jsonapi.c:93
void * semstate
Definition: jsonapi.h:106
#define _(x)
Definition: elog.c:88
void appendBinaryStringInfo(StringInfo str, const char *data, int datalen)
Definition: stringinfo.c:227
long val
Definition: informix.c:664
json_aelem_action array_element_end
Definition: jsonapi.h:114
static JsonParseErrorType parse_array(JsonLexContext *lex, JsonSemAction *sem)
Definition: jsonapi.c:471
JsonTokenType
Definition: jsonapi.h:19