2 Copyright (C) 2009 Georgia Public Library Service
3 Scott McKellar <scott@esilibrary.com>
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 as published by the Free Software Foundation; either version 2
8 of the License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
17 @file osrf_parse_json.c
18 @brief Recursive descent parser for JSON.
25 #include <opensrf/osrf_json.h>
26 #include <opensrf/osrf_json_utils.h>
29 @brief A collection of things the parser uses to keep track of what it's doing.
32 growing_buffer* str_buf; /**< for building strings */
33 size_t index; /**< index into input buffer */
34 const char* buff; /**< client's buffer holding current chunk of input */
38 @brief A small buffer for building Unicode byte sequences.
40 Because we pass a Unibuff* instead of a bare char*, the receiving function doesn't
41 have to worry about the size of the supplied buffer. The size is known.
44 /** @brief A small working buffer.
46 We fill the buffer with four hex characters, and then transform them into a byte
47 sequence up to three bytes long (plus terminal nul) encoding a UTF-8 character.
49 unsigned char buff[ 4 ];
52 static jsonObject* parse( Parser* parser );
54 static jsonObject* get_json_thing( Parser* parser, char firstc );
55 static const char* get_string( Parser* parser );
56 static jsonObject* get_number( Parser* parser, char firstc );
57 static jsonObject* get_array( Parser* parser );
58 static jsonObject* get_hash( Parser* parser );
59 static jsonObject* get_null( Parser* parser );
60 static jsonObject* get_true( Parser* parser );
61 static jsonObject* get_false( Parser* parser );
62 static int get_utf8( Parser* parser, Unibuff* unibuff );
64 static char skip_white_space( Parser* parser );
65 static inline void parser_ungetc( Parser* parser );
66 static inline char parser_nextc( Parser* parser );
67 static void report_error( Parser* parser, char badchar, const char* err );
69 /* ------------------------------------- */
72 @brief Parse a JSON string, with translation to classname hints.
73 @param str Pointer to the JSON string to parse.
74 @return A pointer to the resulting JSON object, or NULL on error.
76 If any node in the jsonObject tree is of type JSON_HASH, with a tag of JSON_CLASS_KEY
77 and another tag of JSON_DATA_KEY, the parser will collapse a level. The subobject
78 tagged with JSON_DATA_KEY will replace the JSON_HASH, and the string tagged as
79 JSON_CLASS_KEY will be stored as its classname.
81 The calling code is responsible for freeing the resulting jsonObject.
83 jsonObject* jsonParse( const char* str ) {
87 jsonObject* obj = jsonParseRaw( str );
89 jsonObject* obj2 = NULL;
91 obj2 = jsonObjectDecodeClass( obj );
93 jsonObjectFree( obj );
99 @brief Parse a JSON string received as a printf-style format string.
100 @param str A printf-style format string. Subsequent arguments, if any, are formatted
101 and inserted into the JSON string before parsing.
102 @return A pointer to the resulting JSON object, or NULL on error.
104 Unlike jsonParse(), this function does not give any special treatment to a JSON_HASH
105 with tags JSON_CLASS_KEY or JSON_DATA_KEY.
107 The calling code is responsible for freeing the resulting jsonObject.
109 jsonObject* jsonParseFmt( const char* str, ... ) {
112 VA_LIST_TO_STRING(str);
113 return jsonParseRaw( VA_BUF );
117 @brief Parse a JSON string, with no translation to classname hints.
118 @param s Pointer to the JSON string to parse.
119 @return A pointer to the resulting JSON object, or NULL on error.
121 This function is similar to jsonParse(), except that it does not give any special
122 treatment to a JSON_HASH with tags JSON_CLASS_KEY or JSON_DATA_KEY.
124 The calling code is responsible for freeing the resulting jsonObject.
126 jsonObject* jsonParseRaw( const char* s ) {
129 return NULL; // Nothing to parse
133 parser.str_buf = NULL;
137 jsonObject* obj = parse( &parser );
139 buffer_free( parser.str_buf );
144 @brief Parse a JSON string into a jsonObject.
145 @param parser Pointer to a Parser.
146 @return Pointer to the newly created jsonObject.
148 Call get_json_thing() to do the real work, then make sure that there's nothing but
149 white space at the end.
151 Currently we call this function only from jsonParseRaw(), and its code could have been
152 incorporated there in-line. Having it in a separate function is intended to make
153 certain future developments easier.
155 static jsonObject* parse( Parser* parser ) {
157 if( ! parser->buff ) {
158 osrfLogError( OSRF_LOG_MARK, "Internal error; no input buffer available" );
159 return NULL; // Should never happen
162 jsonObject* obj = get_json_thing( parser, skip_white_space( parser ) );
165 if( obj && (c = skip_white_space( parser )) ) {
166 report_error( parser, c, "Extra material follows JSON string" );
167 jsonObjectFree( obj );
175 @brief Get the next JSON node -- be it string, number, hash, or whatever.
176 @param parser Pointer to a Parser.
177 @param firstc The first character in the part that we're parsing.
178 @return Pointer to the next JSON node, or NULL upon error.
180 The first character tells us what kind of thing we're parsing next: a string, an array,
181 a hash, a number, a boolean, or a null. Branch accordingly.
183 In the case of an array or a hash, this function indirectly calls itself in order to
184 parse subordinate nodes.
186 static jsonObject* get_json_thing( Parser* parser, char firstc ) {
188 jsonObject* obj = NULL;
190 // Branch on the first character
191 if( '"' == firstc ) {
192 const char* str = get_string( parser );
194 obj = jsonNewObject( NULL );
195 obj->type = JSON_STRING;
196 obj->value.s = strdup( str );
198 } else if( '[' == firstc ) {
199 obj = get_array( parser );
200 } else if( '{' == firstc ) {
201 obj = get_hash( parser );
202 } else if( 'n' == firstc ) {
203 obj = get_null( parser );
204 } else if( 't' == firstc ) {
205 obj = get_true( parser );
206 } else if( 'f' == firstc ) {
207 obj = get_false( parser );
209 else if( isdigit( (unsigned char) firstc ) ||
215 obj = get_number( parser, firstc );
217 report_error( parser, firstc, "Unexpected character" );
224 @brief Collect characters into a character string.
225 @param parser Pointer to a Parser.
226 @return Pointer to the string built in parser->str_buf if successful, or NULL upon error.
228 Translate the usual escape sequences. In particular, "\u" escapes a sequence of four
229 hex characters; turn the hex into the corresponding UTF-8 byte sequence.
231 Return the string we have built, without the enclosing quotation marks, in
232 parser->str_buf. In case of error, log an error message.
234 static const char* get_string( Parser* parser ) {
236 if( parser->str_buf )
237 buffer_reset( parser->str_buf );
239 parser->str_buf = buffer_init( 64 );
241 growing_buffer* gb = parser->str_buf;
243 // Collect the characters.
245 char c = parser_nextc( parser );
249 report_error( parser, parser->buff[ parser->index - 1 ],
250 "Quoted string not terminated" );
252 } else if( '\\' == c ) {
253 c = parser_nextc( parser );
255 case '"' : OSRF_BUFFER_ADD_CHAR( gb, '"' ); break;
256 case '\\' : OSRF_BUFFER_ADD_CHAR( gb, '\\' ); break;
257 case '/' : OSRF_BUFFER_ADD_CHAR( gb, '/' ); break;
258 case 'b' : OSRF_BUFFER_ADD_CHAR( gb, '\b' ); break;
259 case 'f' : OSRF_BUFFER_ADD_CHAR( gb, '\f' ); break;
260 case 'n' : OSRF_BUFFER_ADD_CHAR( gb, '\n' ); break;
261 case 'r' : OSRF_BUFFER_ADD_CHAR( gb, '\r' ); break;
262 case 't' : OSRF_BUFFER_ADD_CHAR( gb, '\t' ); break;
265 if( get_utf8( parser, &unibuff ) ) {
266 return NULL; // bad UTF-8
267 } else if( unibuff.buff[0] ) {
268 OSRF_BUFFER_ADD( gb, (char*) unibuff.buff );
270 report_error( parser, 'u', "Unicode sequence encodes a nul byte" );
275 default : OSRF_BUFFER_ADD_CHAR( gb, c ); break;
279 OSRF_BUFFER_ADD_CHAR( gb, c );
282 return OSRF_BUFFER_C_STR( gb );
286 @brief Collect characters into a number, and create a JSON_NUMBER for it.
287 @param parser Pointer to a parser.
288 @param firstc The first character in the number.
289 @return Pointer to a newly created jsonObject of type JSON_NUMBER, or NULL upon error.
291 Collect digits, signs, decimal points, and 'E' or 'e' (for scientific notation) into
292 a buffer. Make sure that the result is numeric. If it's not numeric by strict JSON
293 rules, try to make it numeric by some judicious massaging (we aren't quite as strict
294 as the official JSON rules).
296 If successful, construct a jsonObject of type JSON_NUMBER containing the resulting
297 numeric string. Otherwise log an error message and return NULL.
299 static jsonObject* get_number( Parser* parser, char firstc ) {
301 growing_buffer* gb = buffer_init( 32 );
302 OSRF_BUFFER_ADD_CHAR( gb, firstc );
307 c = parser_nextc( parser );
308 if( isdigit( (unsigned char) c ) ||
314 OSRF_BUFFER_ADD_CHAR( gb, c );
316 if( ! isspace( (unsigned char) c ) )
317 parser_ungetc( parser );
322 char* s = buffer_release( gb );
323 if( ! jsonIsNumeric( s ) ) {
324 char* temp = jsonScrubNumber( s );
328 report_error( parser, parser->buff[ parser->index - 1 ],
329 "Invalid numeric format" );
334 jsonObject* obj = jsonNewObject( NULL );
335 obj->type = JSON_NUMBER;
342 @brief Parse an array, and create a JSON_ARRAY for it.
343 @param parser Pointer to a Parser.
344 @return Pointer to a newly created jsonObject of type JSON_ARRAY, or NULL upon error.
346 Look for a series of JSON nodes, separated by commas and terminated by a right square
347 bracket. Parse each node recursively, collect them all into a newly created jsonObject
348 of type JSON_ARRAY, and return a pointer to the result.
350 Upon error, log an error message and return NULL.
352 static jsonObject* get_array( Parser* parser ) {
354 jsonObject* array = jsonNewObjectType( JSON_ARRAY );
356 char c = skip_white_space( parser );
358 return array; // Empty array
361 jsonObject* obj = get_json_thing( parser, c );
363 jsonObjectFree( array );
364 return NULL; // Failed to get anything
367 // Add the entry to the array
368 jsonObjectPush( array, obj );
370 // Look for a comma or right bracket
371 c = skip_white_space( parser );
374 else if( c != ',' ) {
375 report_error( parser, c, "Expected comma or bracket in array; didn't find it\n" );
376 jsonObjectFree( array );
379 c = skip_white_space( parser );
386 @brief Parse a hash (JSON object), and create a JSON_HASH for it.
387 @param parser Pointer to a Parser.
388 @return Pointer to a newly created jsonObject of type JSON_HASH, or NULL upon error.
390 Look for a series of name/value pairs, separated by commas and terminated by a right
391 curly brace. Each name/value pair consists of a quoted string, followed by a colon,
392 followed by a JSON node of any sort. Parse the value recursively.
394 Collect the name/value pairs into a newly created jsonObject of type JSON_HASH, and
395 return a pointer to it.
397 Upon error, log an error message and return NULL.
399 static jsonObject* get_hash( Parser* parser ) {
400 jsonObject* hash = jsonNewObjectType( JSON_HASH );
402 char c = skip_white_space( parser );
404 return hash; // Empty hash
408 // Get the key string
410 report_error( parser, c,
411 "Expected quotation mark to begin hash key; didn't find it\n" );
412 jsonObjectFree( hash );
416 const char* key = get_string( parser );
418 jsonObjectFree( hash );
421 char* key_copy = strdup( key );
423 if( jsonObjectGetKey( hash, key_copy ) ) {
424 report_error( parser, '"', "Duplicate key in JSON object" );
425 jsonObjectFree( hash );
430 c = skip_white_space( parser );
432 report_error( parser, c,
433 "Expected colon after hash key; didn't find it\n" );
435 jsonObjectFree( hash );
439 // Get the associated value
440 jsonObject* obj = get_json_thing( parser, skip_white_space( parser ) );
443 jsonObjectFree( hash );
447 // Add a new entry to the hash
448 jsonObjectSetKey( hash, key_copy, obj );
451 // Look for comma or right brace
452 c = skip_white_space( parser );
455 else if( c != ',' ) {
456 report_error( parser, c,
457 "Expected comma or brace in hash, didn't find it" );
458 jsonObjectFree( hash );
461 c = skip_white_space( parser );
468 @brief Parse the JSON keyword "null", and create a JSON_NULL for it.
469 @param parser Pointer to a Parser.
470 @return Pointer to a newly created jsonObject of type JSON_NULL, or NULL upon error.
472 We already saw an 'n', or we wouldn't be here. Make sure that the next three characters
473 are 'u', 'l', and 'l', and that the character after that is not a letter or a digit.
475 If all goes well, create a jsonObject of type JSON_NULL, and return a pointer to it.
476 Otherwise log an error message and return NULL.
478 static jsonObject* get_null( Parser* parser ) {
480 if( parser_nextc( parser ) != 'u' ||
481 parser_nextc( parser ) != 'l' ||
482 parser_nextc( parser ) != 'l' ) {
483 report_error( parser, parser->buff[ parser->index - 1 ],
484 "Expected \"ull\" to follow \"n\"; didn't find it" );
488 // Peek at the next character to make sure that it's kosher
489 char c = parser_nextc( parser );
490 if( ! isspace( (unsigned char) c ) )
491 parser_ungetc( parser );
493 if( isalnum( (unsigned char) c ) ) {
494 report_error( parser, c, "Found letter or number after \"null\"" );
498 // Everything's okay. Return a JSON_NULL.
499 return jsonNewObject( NULL );
503 @brief Parse the JSON keyword "true", and create a JSON_BOOL for it.
504 @param parser Pointer to a Parser.
505 @return Pointer to a newly created jsonObject of type JSON_BOOL, or NULL upon error.
507 We already saw a 't', or we wouldn't be here. Make sure that the next three characters
508 are 'r', 'u', and 'e', and that the character after that is not a letter or a digit.
510 If all goes well, create a jsonObject of type JSON_BOOL, and return a pointer to it.
511 Otherwise log an error message and return NULL.
513 static jsonObject* get_true( Parser* parser ) {
515 if( parser_nextc( parser ) != 'r' ||
516 parser_nextc( parser ) != 'u' ||
517 parser_nextc( parser ) != 'e' ) {
518 report_error( parser, parser->buff[ parser->index - 1 ],
519 "Expected \"rue\" to follow \"t\"; didn't find it" );
523 // Peek at the next character to make sure that it's kosher
524 char c = parser_nextc( parser );
525 if( ! isspace( (unsigned char) c ) )
526 parser_ungetc( parser );
528 if( isalnum( (unsigned char) c ) ) {
529 report_error( parser, c, "Found letter or number after \"true\"" );
533 // Everything's okay. Return a JSON_BOOL.
534 return jsonNewBoolObject( 1 );
538 @brief Parse the JSON keyword "false", and create a JSON_BOOL for it.
539 @param parser Pointer to a Parser.
540 @return Pointer to a newly created jsonObject of type JSON_BOOL, or NULL upon error.
542 We already saw a 'f', or we wouldn't be here. Make sure that the next four characters
543 are 'a', 'l', 's', and 'e', and that the character after that is not a letter or a digit.
545 If all goes well, create a jsonObject of type JSON_BOOL, and return a pointer to it.
546 Otherwise log an error message and return NULL.
548 static jsonObject* get_false( Parser* parser ) {
550 if( parser_nextc( parser ) != 'a' ||
551 parser_nextc( parser ) != 'l' ||
552 parser_nextc( parser ) != 's' ||
553 parser_nextc( parser ) != 'e' ) {
554 report_error( parser, parser->buff[ parser->index - 1 ],
555 "Expected \"alse\" to follow \"f\"; didn't find it" );
559 // Peek at the next character to make sure that it's kosher
560 char c = parser_nextc( parser );
561 if( ! isspace( (unsigned char) c ) )
562 parser_ungetc( parser );
564 if( isalnum( (unsigned char) c ) ) {
565 report_error( parser, c, "Found letter or number after \"false\"" );
569 // Everything's okay. Return a JSON_BOOL.
570 return jsonNewBoolObject( 0 );
/**
	@brief Convert a hex digit to the corresponding numeric value.
	@param x A hex digit character: '0'-'9', 'a'-'f', or 'A'-'F'.
	@return The corresponding numeric value, in the range 0 through 15.

	This is a static inline function rather than a function-like macro, so the argument
	is evaluated exactly once.  An expression with side effects, such as
	hexdigit( *p++ ), is therefore safe.

	Warning #1: The calling code must ensure that the character to be converted is, in fact,
	a hex character.  Otherwise the results will be strange.

	Warning #2: This code assumes that the characters [a-f] and [A-F] are contiguous in the
	execution character set, and that the low three bits of 'a' and 'A' are 001.  Those
	assumptions are true for ASCII and EBCDIC, but there may be some character sets for
	which they are not true.
*/
static inline int hexdigit( int x ) {
	// Decimal digits: offset from '0'.
	if( x <= '9' )
		return x - '0';

	// Letters: the low three bits of 'a'..'f' and 'A'..'F' are 1..6, so adding 9
	// yields 10..15 for either case.
	return ( x & 7 ) + 9;
}
592 @brief Translate the next four characters into a UTF-8 character.
593 @param parser Pointer to a Parser.
594 @param unibuff Pointer to a small buffer in which to return the results.
595 @return 0 if successful, or 1 if not.
597 Collect the next four characters into @a unibuff, and make sure that they're all hex.
598 Translate them into a nul-terminated UTF-8 byte sequence, and return the result via
601 (Note that a UTF-8 byte sequence is guaranteed not to contain a nul byte. Hence using
602 a nul as a terminator creates no ambiguity.)
604 static int get_utf8( Parser* parser, Unibuff* unibuff ) {
608 // Accumulate four characters into a buffer. Make sure that
609 // there are four of them, and that they're all hex.
610 for( i = 0; i < 4; ++i ) {
611 int c = parser_nextc( parser );
613 report_error( parser, 'u', "Incomplete Unicode sequence" );
614 unibuff->buff[ 0 ] = '\0';
616 } else if( ! isxdigit( (unsigned char) c ) ) {
617 report_error( parser, c, "Non-hex byte found in Unicode sequence" );
618 unibuff->buff[ 0 ] = '\0';
625 /* The following code is adapted with permission from
626 * json-c http://oss.metaparadigm.com/json-c/
629 // Convert the hex sequence to a single integer
630 unsigned int ucs_char =
631 (hexdigit(ubuff[ 0 ]) << 12) +
632 (hexdigit(ubuff[ 1 ]) << 8) +
633 (hexdigit(ubuff[ 2 ]) << 4) +
634 hexdigit(ubuff[ 3 ]);
636 unsigned char* utf_out = unibuff->buff;
638 if (ucs_char < 0x80) {
639 utf_out[0] = ucs_char;
642 } else if (ucs_char < 0x800) {
643 utf_out[0] = 0xc0 | (ucs_char >> 6);
644 utf_out[1] = 0x80 | (ucs_char & 0x3f);
648 utf_out[0] = 0xe0 | (ucs_char >> 12);
649 utf_out[1] = 0x80 | ((ucs_char >> 6) & 0x3f);
650 utf_out[2] = 0x80 | (ucs_char & 0x3f);
658 @brief Skip over white space.
659 @param parser Pointer to a Parser.
660 @return The next non-whitespace character.
662 static char skip_white_space( Parser* parser ) {
665 c = parser_nextc( parser );
666 } while( isspace( (unsigned char) c ) );
672 @brief Back up by one character.
673 @param parser Pointer to a Parser.
675 Decrement an index into the input string. We don't guard against a negative index, so
676 the calling code should make sure that it doesn't do anything stupid.
678 static inline void parser_ungetc( Parser* parser ) {
683 @brief Get the next character
684 @param parser Pointer to a Parser.
685 @return The next character.
687 Increment an index into the input string and return the corresponding character.
688 The calling code should make sure that it doesn't try to read past the terminal nul.
690 static inline char parser_nextc( Parser* parser ) {
691 return parser->buff[ parser->index++ ];
695 @brief Report a syntax error to the log.
696 @param parser Pointer to a Parser.
697 @param badchar The character at the position where the error was detected.
698 @param err Pointer to a descriptive error message.
700 Format and log an error message. Identify the location of the error and
701 the character at that location. Show the neighborhood of the error within
704 static void report_error( Parser* parser, char badchar, const char* err ) {
706 // Determine the beginning and ending points of a JSON
707 // fragment to display, from the vicinity of the error
709 const int max_margin = 15; // How many characters to show
710 // on either side of the error
711 int pre = parser->index - max_margin;
715 int post = parser->index + 15;
716 if( '\0' == parser->buff[ parser->index ] ) {
717 post = parser->index - 1;
719 int remaining = strlen(parser->buff + parser->index);
720 if( remaining < max_margin )
721 post = parser->index + remaining;
724 // Copy the fragment into a buffer
725 int len = post - pre + 1; // length of fragment
727 memcpy( buf, parser->buff + pre, len );
730 // Replace newlines and tabs with spaces
733 if( '\n' == *p || '\t' == *p )
738 // Avoid trying to display a nul character
739 if( '\0' == badchar )
743 osrfLogError( OSRF_LOG_MARK,
744 "*JSON Parser Error\n - char = %c\n "
745 "- index = %d\n - near => %s\n - %s",
746 badchar, parser->index, buf, err );