2 Copyright (C) 2009 Georgia Public Library Service
3 Scott McKellar <scott@esilibrary.com>
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 as published by the Free Software Foundation; either version 2
8 of the License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
17 @file osrf_parse_json.c
18 @brief Recursive descent parser for JSON.
25 #include <opensrf/osrf_json.h>
26 #include <opensrf/osrf_json_utils.h>
29 @brief A collection of things the parser uses to keep track of what it's doing.
32 growing_buffer* str_buf; /**< for building strings */
33 size_t index; /**< index into input buffer */
34 const char* buff; /**< client's buffer holding current chunk of input */
35 int decode; /**< boolean; true if we are decoding class hints */
39 @brief A small buffer for building Unicode byte sequences.
41 Because we pass a Unibuff* instead of a bare char*, the receiving function doesn't
42 have to worry about the size of the supplied buffer. The size is known.
45 /** @brief A small working buffer.
47 We fill the buffer with four hex characters, and then transform them into a byte
48 sequence up to three bytes long (plus terminal nul) encoding a UTF-8 character.
50 unsigned char buff[ 4 ];
53 static jsonObject* parse_it( const char* s, int decode );
55 static jsonObject* get_json_node( Parser* parser, char firstc );
56 static const char* get_string( Parser* parser );
57 static jsonObject* get_number( Parser* parser, char firstc );
58 static jsonObject* get_array( Parser* parser );
59 static jsonObject* get_hash( Parser* parser );
60 static jsonObject* get_decoded_hash( Parser* parser );
61 static jsonObject* get_null( Parser* parser );
62 static jsonObject* get_true( Parser* parser );
63 static jsonObject* get_false( Parser* parser );
64 static int get_utf8( Parser* parser, Unibuff* unibuff );
66 static char skip_white_space( Parser* parser );
67 static inline void parser_ungetc( Parser* parser );
68 static inline char parser_nextc( Parser* parser );
69 static void report_error( Parser* parser, char badchar, const char* err );
71 /* ------------------------------------- */
74 @brief Parse a JSON string, with decoding of classname hints.
75 @param str Pointer to the JSON string to parse.
76 @return A pointer to the resulting JSON object, or NULL on error.
78 If any node in the jsonObject tree is of type JSON_HASH, with a tag of JSON_CLASS_KEY
79 and another tag of JSON_DATA_KEY, the parser will collapse a level. The subobject
80 tagged with JSON_DATA_KEY will replace the JSON_HASH, and the string tagged as
81 JSON_CLASS_KEY will be stored as its classname. If there is no tag of JSON_DATA_KEY,
82 the hash will be replaced by a jsonObject of type JSON_NULL.
84 The calling code is responsible for freeing the resulting jsonObject.
86 jsonObject* jsonParse( const char* str ) {
87 return parse_it( str, 1 );
91 @brief Parse a JSON string, with no decoding of classname hints.
92 @param s Pointer to the JSON string to parse.
93 @return A pointer to the resulting JSON object, or NULL on error.
95 This function is similar to jsonParse(), except that it does not give any special
96 treatment to a JSON_HASH with the JSON_CLASS_KEY tag.
98 The calling code is responsible for freeing the resulting jsonObject.
100 jsonObject* jsonParseRaw( const char* s ) {
101 return parse_it( s, 0 );
105 @brief Parse a JSON string received as a printf-style format string.
106 @param str A printf-style format string. Subsequent arguments, if any, are formatted
107 and inserted into the JSON string before parsing.
108 @return A pointer to the resulting JSON object, or NULL on error.
110 Unlike jsonParse(), this function does not give any special treatment to a JSON_HASH
111 with tags JSON_CLASS_KEY or JSON_DATA_KEY.
113 The calling code is responsible for freeing the resulting jsonObject.
115 jsonObject* jsonParseFmt( const char* str, ... ) {
118 VA_LIST_TO_STRING( str );
119 return parse_it( VA_BUF, 0 );
123 @brief Parse a JSON string into a jsonObject.
124 @param s Pointer to the string to be parsed.
125 @param decode A boolean; true means decode class hints, false means don't.
126 @return Pointer to the newly created jsonObject, or NULL if there is nothing to parse or an error occurs.
128 Set up a Parser. Call get_json_node() to do the real work, then make sure that there's
129 nothing but white space at the end.
131 static jsonObject* parse_it( const char* s, int decode ) {
134 return NULL; // Nothing to parse
138 parser.str_buf = NULL;
141 parser.decode = decode;
143 jsonObject* obj = get_json_node( &parser, skip_white_space( &parser ) );
145 // Make sure there's nothing but white space at the end
147 if( obj && (c = skip_white_space( &parser )) ) {
148 report_error( &parser, c, "Extra material follows JSON string" );
149 jsonObjectFree( obj );
153 buffer_free( parser.str_buf );
158 @brief Get the next JSON node -- be it string, number, hash, or whatever.
159 @param parser Pointer to a Parser.
160 @param firstc The first character in the part that we're parsing.
161 @return Pointer to the next JSON node, or NULL upon error.
163 The first character tells us what kind of thing we're parsing next: a string, an array,
164 a hash, a number, a boolean, or a null. Branch accordingly.
166 In the case of an array or a hash, this function indirectly calls itself in order to
167 parse subordinate nodes.
169 static jsonObject* get_json_node( Parser* parser, char firstc ) {
171 jsonObject* obj = NULL;
173 // Branch on the first character
174 if( '"' == firstc ) {
175 const char* str = get_string( parser );
177 obj = jsonNewObject( NULL );
178 obj->type = JSON_STRING;
179 obj->value.s = strdup( str );
181 } else if( '[' == firstc ) {
182 obj = get_array( parser );
183 } else if( '{' == firstc ) {
185 obj = get_decoded_hash( parser );
187 obj = get_hash( parser );
188 } else if( 'n' == firstc ) {
189 obj = get_null( parser );
190 } else if( 't' == firstc ) {
191 obj = get_true( parser );
192 } else if( 'f' == firstc ) {
193 obj = get_false( parser );
195 else if( isdigit( (unsigned char) firstc ) ||
201 obj = get_number( parser, firstc );
203 report_error( parser, firstc, "Unexpected character" );
210 @brief Collect characters into a character string.
211 @param parser Pointer to a Parser.
212 @return Pointer to the character string built in parser->str_buf if successful, or NULL upon error.
214 Translate the usual escape sequences. In particular, "\u" escapes a sequence of four
215 hex characters; turn the hex into the corresponding UTF-8 byte sequence.
217 Return the string we have built, without the enclosing quotation marks, in
218 parser->str_buf. In case of error, log an error message.
220 static const char* get_string( Parser* parser ) {
222 if( parser->str_buf )
223 buffer_reset( parser->str_buf );
225 parser->str_buf = buffer_init( 64 );
227 growing_buffer* gb = parser->str_buf;
229 // Collect the characters.
231 char c = parser_nextc( parser );
235 report_error( parser, parser->buff[ parser->index - 1 ],
236 "Quoted string not terminated" );
238 } else if( '\\' == c ) {
239 c = parser_nextc( parser );
241 case '"' : OSRF_BUFFER_ADD_CHAR( gb, '"' ); break;
242 case '\\' : OSRF_BUFFER_ADD_CHAR( gb, '\\' ); break;
243 case '/' : OSRF_BUFFER_ADD_CHAR( gb, '/' ); break;
244 case 'b' : OSRF_BUFFER_ADD_CHAR( gb, '\b' ); break;
245 case 'f' : OSRF_BUFFER_ADD_CHAR( gb, '\f' ); break;
246 case 'n' : OSRF_BUFFER_ADD_CHAR( gb, '\n' ); break;
247 case 'r' : OSRF_BUFFER_ADD_CHAR( gb, '\r' ); break;
248 case 't' : OSRF_BUFFER_ADD_CHAR( gb, '\t' ); break;
251 if( get_utf8( parser, &unibuff ) ) {
252 return NULL; // bad UTF-8
253 } else if( unibuff.buff[0] ) {
254 OSRF_BUFFER_ADD( gb, (char*) unibuff.buff );
256 report_error( parser, 'u', "Unicode sequence encodes a nul byte" );
261 default : OSRF_BUFFER_ADD_CHAR( gb, c ); break;
265 OSRF_BUFFER_ADD_CHAR( gb, c );
268 return OSRF_BUFFER_C_STR( gb );
272 @brief Collect characters into a number, and create a JSON_NUMBER for it.
273 @param parser Pointer to a parser.
274 @param firstc The first character in the number.
275 @return Pointer to a newly created jsonObject of type JSON_NUMBER, or NULL upon error.
277 Collect digits, signs, decimal points, and 'E' or 'e' (for scientific notation) into
278 a buffer. Make sure that the result is numeric. If it's not numeric by strict JSON
279 rules, try to make it numeric by some judicious massaging (we aren't quite as strict
280 as the official JSON rules).
282 If successful, construct a jsonObject of type JSON_NUMBER containing the resulting
283 numeric string. Otherwise log an error message and return NULL.
285 static jsonObject* get_number( Parser* parser, char firstc ) {
287 if( parser->str_buf )
288 buffer_reset( parser->str_buf );
290 parser->str_buf = buffer_init( 64 );
292 growing_buffer* gb = parser->str_buf;
293 OSRF_BUFFER_ADD_CHAR( gb, firstc );
298 c = parser_nextc( parser );
299 if( isdigit( (unsigned char) c ) ||
305 OSRF_BUFFER_ADD_CHAR( gb, c );
307 if( ! isspace( (unsigned char) c ) )
308 parser_ungetc( parser );
313 char* s = buffer_data( gb );
314 if( ! jsonIsNumeric( s ) ) {
315 char* temp = jsonScrubNumber( s );
319 report_error( parser, parser->buff[ parser->index - 1 ],
320 "Invalid numeric format" );
325 jsonObject* obj = jsonNewObject( NULL );
326 obj->type = JSON_NUMBER;
333 @brief Parse an array, and create a JSON_ARRAY for it.
334 @param parser Pointer to a Parser.
335 @return Pointer to a newly created jsonObject of type JSON_ARRAY, or NULL upon error.
337 Look for a series of JSON nodes, separated by commas and terminated by a right square
338 bracket. Parse each node recursively, collect them all into a newly created jsonObject
339 of type JSON_ARRAY, and return a pointer to the result.
341 Upon error, log an error message and return NULL.
343 static jsonObject* get_array( Parser* parser ) {
345 jsonObject* array = jsonNewObjectType( JSON_ARRAY );
347 char c = skip_white_space( parser );
349 return array; // Empty array
352 jsonObject* obj = get_json_node( parser, c );
354 jsonObjectFree( array );
355 return NULL; // Failed to get anything
358 // Add the entry to the array
359 jsonObjectPush( array, obj );
361 // Look for a comma or right bracket
362 c = skip_white_space( parser );
365 else if( c != ',' ) {
366 report_error( parser, c, "Expected comma or bracket in array; didn't find it\n" );
367 jsonObjectFree( array );
370 c = skip_white_space( parser );
377 @brief Parse a hash (JSON object), and create a JSON_HASH for it.
378 @param parser Pointer to a Parser.
379 @return Pointer to a newly created jsonObject of type JSON_HASH, or NULL upon error.
381 Look for a series of name/value pairs, separated by commas and terminated by a right
382 curly brace. Each name/value pair consists of a quoted string, followed by a colon,
383 followed by a JSON node of any sort. Parse the value recursively.
385 Collect the name/value pairs into a newly created jsonObject of type JSON_HASH, and
386 return a pointer to it.
388 Upon error, log an error message and return NULL.
390 static jsonObject* get_hash( Parser* parser ) {
391 jsonObject* hash = jsonNewObjectType( JSON_HASH );
393 char c = skip_white_space( parser );
395 return hash; // Empty hash
399 // Get the key string
401 report_error( parser, c,
402 "Expected quotation mark to begin hash key; didn't find it\n" );
403 jsonObjectFree( hash );
407 const char* key = get_string( parser );
409 jsonObjectFree( hash );
412 char* key_copy = strdup( key );
414 if( jsonObjectGetKey( hash, key_copy ) ) {
415 report_error( parser, '"', "Duplicate key in JSON object" );
416 jsonObjectFree( hash );
421 c = skip_white_space( parser );
423 report_error( parser, c,
424 "Expected colon after hash key; didn't find it\n" );
426 jsonObjectFree( hash );
430 // Get the associated value
431 jsonObject* obj = get_json_node( parser, skip_white_space( parser ) );
434 jsonObjectFree( hash );
438 // Add a new entry to the hash
439 jsonObjectSetKey( hash, key_copy, obj );
442 // Look for comma or right brace
443 c = skip_white_space( parser );
446 else if( c != ',' ) {
447 report_error( parser, c,
448 "Expected comma or brace in hash, didn't find it" );
449 jsonObjectFree( hash );
452 c = skip_white_space( parser );
459 @brief Parse a hash (JSON object), and create a JSON_HASH for it; decode class hints.
460 @param parser Pointer to a Parser.
461 @return Pointer to a newly created jsonObject, or NULL upon error.
463 This function is similar to get_hash(), @em except:
465 If the hash includes a member with a key equal to JSON_CLASS_KEY ("__c" by default),
466 then look for a member whose key is JSON_DATA_KEY ("__p" by default). If you find one,
467 return the data associated with it; otherwise return a jsonObject of type JSON_NULL.
469 If there is no member with a key equal to JSON_CLASS_KEY, then return the same sort of
470 jsonObject as get_hash() would return (except of course that lower levels may be
471 decoded as described above).
473 static jsonObject* get_decoded_hash( Parser* parser ) {
474 jsonObject* hash = jsonNewObjectType( JSON_HASH );
476 char c = skip_white_space( parser );
478 return hash; // Empty hash
480 char* class_name = NULL;
484 // Get the key string
486 report_error( parser, c,
487 "Expected quotation mark to begin hash key; didn't find it\n" );
488 jsonObjectFree( hash );
492 const char* key = get_string( parser );
494 jsonObjectFree( hash );
497 char* key_copy = strdup( key );
499 if( jsonObjectGetKey( hash, key_copy ) ) {
500 report_error( parser, '"', "Duplicate key in JSON object" );
501 jsonObjectFree( hash );
506 c = skip_white_space( parser );
508 report_error( parser, c,
509 "Expected colon after hash key; didn't find it\n" );
511 jsonObjectFree( hash );
515 // Get the associated value
516 jsonObject* obj = get_json_node( parser, skip_white_space( parser ) );
519 jsonObjectFree( hash );
523 // Add a new entry to the hash
524 jsonObjectSetKey( hash, key_copy, obj );
526 // Save info for class hint, if present
527 if( !strcmp( key_copy, JSON_CLASS_KEY ) )
528 class_name = jsonObjectToSimpleString( obj );
532 // Look for comma or right brace
533 c = skip_white_space( parser );
536 else if( c != ',' ) {
537 report_error( parser, c,
538 "Expected comma or brace in hash, didn't find it" );
539 jsonObjectFree( hash );
542 c = skip_white_space( parser );
546 // We found a class hint. Extract the data node and return it.
547 jsonObject* class_data = osrfHashExtract( hash->value.h, JSON_DATA_KEY );
549 class_data->parent = NULL;
550 jsonObjectFree( hash );
553 hash->classname = class_name;
555 // Huh? We have a class name but no data for it.
556 // Throw away what we have and return a JSON_NULL.
557 jsonObjectFree( hash );
558 hash = jsonNewObjectType( JSON_NULL );
570 @brief Parse the JSON keyword "null", and create a JSON_NULL for it.
571 @param parser Pointer to a Parser.
572 @return Pointer to a newly created jsonObject of type JSON_NULL, or NULL upon error.
574 We already saw an 'n', or we wouldn't be here. Make sure that the next three characters
575 are 'u', 'l', and 'l', and that the character after that is not a letter or a digit.
577 If all goes well, create a jsonObject of type JSON_NULL, and return a pointer to it.
578 Otherwise log an error message and return NULL.
580 static jsonObject* get_null( Parser* parser ) {
582 if( parser_nextc( parser ) != 'u' ||
583 parser_nextc( parser ) != 'l' ||
584 parser_nextc( parser ) != 'l' ) {
585 report_error( parser, parser->buff[ parser->index - 1 ],
586 "Expected \"ull\" to follow \"n\"; didn't find it" );
590 // Peek at the next character to make sure that it's kosher
591 char c = parser_nextc( parser );
592 if( ! isspace( (unsigned char) c ) )
593 parser_ungetc( parser );
595 if( isalnum( (unsigned char) c ) ) {
596 report_error( parser, c, "Found letter or number after \"null\"" );
600 // Everything's okay. Return a JSON_NULL.
601 return jsonNewObject( NULL );
605 @brief Parse the JSON keyword "true", and create a JSON_BOOL for it.
606 @param parser Pointer to a Parser.
607 @return Pointer to a newly created jsonObject of type JSON_BOOL, or NULL upon error.
609 We already saw a 't', or we wouldn't be here. Make sure that the next three characters
610 are 'r', 'u', and 'e', and that the character after that is not a letter or a digit.
612 If all goes well, create a jsonObject of type JSON_BOOL, and return a pointer to it.
613 Otherwise log an error message and return NULL.
615 static jsonObject* get_true( Parser* parser ) {
617 if( parser_nextc( parser ) != 'r' ||
618 parser_nextc( parser ) != 'u' ||
619 parser_nextc( parser ) != 'e' ) {
620 report_error( parser, parser->buff[ parser->index - 1 ],
621 "Expected \"rue\" to follow \"t\"; didn't find it" );
625 // Peek at the next character to make sure that it's kosher
626 char c = parser_nextc( parser );
627 if( ! isspace( (unsigned char) c ) )
628 parser_ungetc( parser );
630 if( isalnum( (unsigned char) c ) ) {
631 report_error( parser, c, "Found letter or number after \"true\"" );
635 // Everything's okay. Return a JSON_BOOL.
636 return jsonNewBoolObject( 1 );
640 @brief Parse the JSON keyword "false", and create a JSON_BOOL for it.
641 @param parser Pointer to a Parser.
642 @return Pointer to a newly created jsonObject of type JSON_BOOL, or NULL upon error.
644 We already saw a 'f', or we wouldn't be here. Make sure that the next four characters
645 are 'a', 'l', 's', and 'e', and that the character after that is not a letter or a digit.
647 If all goes well, create a jsonObject of type JSON_BOOL, and return a pointer to it.
648 Otherwise log an error message and return NULL.
650 static jsonObject* get_false( Parser* parser ) {
652 if( parser_nextc( parser ) != 'a' ||
653 parser_nextc( parser ) != 'l' ||
654 parser_nextc( parser ) != 's' ||
655 parser_nextc( parser ) != 'e' ) {
656 report_error( parser, parser->buff[ parser->index - 1 ],
657 "Expected \"alse\" to follow \"f\"; didn't find it" );
661 // Peek at the next character to make sure that it's kosher
662 char c = parser_nextc( parser );
663 if( ! isspace( (unsigned char) c ) )
664 parser_ungetc( parser );
666 if( isalnum( (unsigned char) c ) ) {
667 report_error( parser, c, "Found letter or number after \"false\"" );
671 // Everything's okay. Return a JSON_BOOL.
672 return jsonNewBoolObject( 0 );
676 @brief Convert a hex digit to the corresponding numeric value.
678 @return The corresponding numeric value.
680 Warning #1: The calling code must ensure that the character to be converted is, in fact,
681 a hex character. Otherwise the results will be strange.
683 Warning #2: This macro evaluates its argument twice (once in the test, once in the selected branch). Beware of side effects.
684 (It might make sense to convert this macro to a static inline function.)
686 Warning #3: This code assumes that the characters [a-f] and [A-F] are contiguous in the
687 execution character set, and that the lower 4 bits for 'a' and 'A' are 0001. Those
688 assumptions are true for ASCII and EBCDIC, but there may be some character sets for
689 which it is not true.
/* Map a hex digit character to its numeric value: characters up to '9' by
   subtracting '0', anything else by taking its low three bits and adding 9
   (so 'a'/'A' give 10 and 'f'/'F' give 15 in ASCII). The argument is not
   validated, and letters must compare greater than '9' for the second branch
   to be chosen. */
691 #define hexdigit(x) ( ((x) <= '9') ? (x) - '0' : ((x) & 7) + 9)
694 @brief Translate the next four characters into a UTF-8 character.
695 @param parser Pointer to a Parser.
696 @param unibuff Pointer to a small buffer in which to return the results.
697 @return 0 if successful, or 1 if not.
699 Collect the next four characters into @a unibuff, and make sure that they're all hex.
700 Translate them into a nul-terminated UTF-8 byte sequence, and return the result via @a unibuff.
703 (Note that a UTF-8 byte sequence contains a nul byte only when it encodes U+0000, which
704 the calling code rejects as an error. Hence using a nul as a terminator creates no ambiguity.)
706 static int get_utf8( Parser* parser, Unibuff* unibuff ) {
710 // Accumulate four characters into a buffer. Make sure that
711 // there are four of them, and that they're all hex.
712 for( i = 0; i < 4; ++i ) {
713 int c = parser_nextc( parser );
715 report_error( parser, 'u', "Incomplete Unicode sequence" );
716 unibuff->buff[ 0 ] = '\0';
718 } else if( ! isxdigit( (unsigned char) c ) ) {
719 report_error( parser, c, "Non-hex byte found in Unicode sequence" );
720 unibuff->buff[ 0 ] = '\0';
727 /* The following code is adapted with permission from
728 * json-c http://oss.metaparadigm.com/json-c/
731 // Convert the hex sequence to a single integer
732 unsigned int ucs_char =
733 (hexdigit(ubuff[ 0 ]) << 12) +
734 (hexdigit(ubuff[ 1 ]) << 8) +
735 (hexdigit(ubuff[ 2 ]) << 4) +
736 hexdigit(ubuff[ 3 ]);
738 unsigned char* utf_out = unibuff->buff;
740 if (ucs_char < 0x80) {
741 utf_out[0] = ucs_char;
744 } else if (ucs_char < 0x800) {
745 utf_out[0] = 0xc0 | (ucs_char >> 6);
746 utf_out[1] = 0x80 | (ucs_char & 0x3f);
750 utf_out[0] = 0xe0 | (ucs_char >> 12);
751 utf_out[1] = 0x80 | ((ucs_char >> 6) & 0x3f);
752 utf_out[2] = 0x80 | (ucs_char & 0x3f);
760 @brief Skip over white space.
761 @param parser Pointer to a Parser.
762 @return The next non-whitespace character.
764 static char skip_white_space( Parser* parser ) {
767 c = parser_nextc( parser );
768 } while( isspace( (unsigned char) c ) );
774 @brief Back up by one character.
775 @param parser Pointer to a Parser.
777 Decrement an index into the input string. The index is an unsigned size_t, so backing up
778 from zero would wrap around; the calling code should make sure that it never does so.
780 static inline void parser_ungetc( Parser* parser ) {
785 @brief Get the next character
786 @param parser Pointer to a Parser.
787 @return The next character.
789 Return the character at the current index into the input string, then increment the index.
790 The calling code should make sure that it doesn't try to read past the terminal nul.
792 static inline char parser_nextc( Parser* parser ) {
793 return parser->buff[ parser->index++ ];
797 @brief Report a syntax error to the log.
798 @param parser Pointer to a Parser.
799 @param badchar The character at the position where the error was detected.
800 @param err Pointer to a descriptive error message.
802 Format and log an error message. Identify the location of the error and
803 the character at that location. Show the neighborhood of the error within the input buffer.
806 static void report_error( Parser* parser, char badchar, const char* err ) {
808 // Determine the beginning and ending points of a JSON
809 // fragment to display, from the vicinity of the error
811 const int max_margin = 15; // How many characters to show
812 // on either side of the error
813 int pre = parser->index - max_margin;
817 int post = parser->index + 15;
818 if( '\0' == parser->buff[ parser->index ] ) {
819 post = parser->index - 1;
821 int remaining = strlen(parser->buff + parser->index);
822 if( remaining < max_margin )
823 post = parser->index + remaining;
826 // Copy the fragment into a buffer
827 int len = post - pre + 1; // length of fragment
829 memcpy( buf, parser->buff + pre, len );
832 // Replace newlines and tabs with spaces
835 if( '\n' == *p || '\t' == *p )
840 // Avoid trying to display a nul character
841 if( '\0' == badchar )
845 osrfLogError( OSRF_LOG_MARK,
846 "*JSON Parser Error\n - char = %c\n "
847 "- index = %d\n - near => %s\n - %s",
848 badchar, parser->index, buf, err );