2 Copyright (C) 2009 Georgia Public Library Service
3 Scott McKellar <scott@esilibrary.com>
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 as published by the Free Software Foundation; either version 2
8 of the License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
17 @file osrf_parse_json.c
18 @brief Recursive descent parser for JSON.
25 #include <opensrf/osrf_json.h>
28 @brief A collection of things the parser uses to keep track of what it's doing.
31 growing_buffer* str_buf; /**< for building strings */
32 size_t index; /**< index into input buffer */
33 const char* buff; /**< client's buffer holding current chunk of input */
34 int decode; /**< boolean; true if we are decoding class hints */
/**
	@brief A small fixed-size buffer for one UTF-8 byte sequence.

	Passing a Unibuff* rather than a bare char* means that the receiving function
	knows the capacity of the buffer without being told separately.
*/
typedef struct {
	/** @brief The working bytes.

		Receives a UTF-8 sequence of up to three bytes, followed by a terminal nul.
	*/
	unsigned char buff[ 4 ];
} Unibuff;
52 static jsonObject* parse_it( const char* s, int decode );
54 static jsonObject* get_json_node( Parser* parser, char firstc );
55 static const char* get_string( Parser* parser );
56 static jsonObject* get_number( Parser* parser, char firstc );
57 static jsonObject* get_array( Parser* parser );
58 static jsonObject* get_hash( Parser* parser );
59 static jsonObject* get_decoded_hash( Parser* parser );
60 static jsonObject* get_null( Parser* parser );
61 static jsonObject* get_true( Parser* parser );
62 static jsonObject* get_false( Parser* parser );
63 static int get_utf8( Parser* parser, Unibuff* unibuff );
65 static char skip_white_space( Parser* parser );
66 static inline void parser_ungetc( Parser* parser );
67 static inline char parser_nextc( Parser* parser );
68 static void report_error( Parser* parser, char badchar, const char* err );
70 /* ------------------------------------- */
73 @brief Parse a JSON string, with decoding of classname hints.
74 @param str Pointer to the JSON string to parse.
75 @return A pointer to the resulting JSON object, or NULL on error.
77 If any node in the jsonObject tree is of type JSON_HASH, with a tag of JSON_CLASS_KEY
78 and another tag of JSON_DATA_KEY, the parser will collapse a level. The subobject
79 tagged with JSON_DATA_KEY will replace the JSON_HASH, and the string tagged as
80 JSON_CLASS_KEY will be stored as its classname. If there is no tag of JSON_DATA_KEY,
81 the hash will be replaced by a jsonObject of type JSON_NULL.
83 The calling code is responsible for freeing the resulting jsonObject.
85 jsonObject* jsonParse( const char* str ) {
86 return parse_it( str, 1 );
90 @brief Parse a JSON string, with no decoding of classname hints.
91 @param s Pointer to the JSON string to parse.
92 @return A pointer to the resulting JSON object, or NULL on error.
94 This function is similar to jsonParse(), except that it does not give any special
95 treatment to a JSON_HASH with the JSON_CLASS_KEY tag.
97 The calling code is responsible for freeing the resulting jsonObject.
99 jsonObject* jsonParseRaw( const char* s ) {
100 return parse_it( s, 0 );
104 @brief Parse a JSON string received as a printf-style format string.
105 @param str A printf-style format string. Subsequent arguments, if any, are formatted
106 and inserted into the JSON string before parsing.
107 @return A pointer to the resulting JSON object, or NULL on error.
109 Unlike jsonParse(), this function does not give any special treatment to a JSON_HASH
110 with tags JSON_CLASS_KEY or JSON_DATA_KEY.
112 The calling code is responsible for freeing the resulting jsonObject.
114 jsonObject* jsonParseFmt( const char* str, ... ) {
117 VA_LIST_TO_STRING( str );
118 return parse_it( VA_BUF, 0 );
122 @brief Parse a JSON string into a jsonObject.
123 @param s Pointer to the string to be parsed.
124 @param decode A boolean; true means decode class hints, false means don't.
125 @return Pointer to the newly created jsonObject.
127 Set up a Parser. Call get_json_node() to do the real work, then make sure that there's
128 nothing but white space at the end.
130 static jsonObject* parse_it( const char* s, int decode ) {
133 return NULL; // Nothing to parse
137 parser.str_buf = NULL;
140 parser.decode = decode;
142 jsonObject* obj = get_json_node( &parser, skip_white_space( &parser ) );
144 // Make sure there's nothing but white space at the end
146 if( obj && (c = skip_white_space( &parser )) ) {
147 report_error( &parser, c, "Extra material follows JSON string" );
148 jsonObjectFree( obj );
152 buffer_free( parser.str_buf );
157 @brief Get the next JSON node -- be it string, number, hash, or whatever.
158 @param parser Pointer to a Parser.
159 @param firstc The first character in the part that we're parsing.
160 @return Pointer to the next JSON node, or NULL upon error.
162 The first character tells us what kind of thing we're parsing next: a string, an array,
163 a hash, a number, a boolean, or a null. Branch accordingly.
165 In the case of an array or a hash, this function indirectly calls itself in order to
166 parse subordinate nodes.
168 static jsonObject* get_json_node( Parser* parser, char firstc ) {
170 jsonObject* obj = NULL;
172 // Branch on the first character
173 if( '"' == firstc ) {
174 const char* str = get_string( parser );
176 obj = jsonNewObject( NULL );
177 obj->type = JSON_STRING;
178 obj->value.s = strdup( str );
180 } else if( '[' == firstc ) {
181 obj = get_array( parser );
182 } else if( '{' == firstc ) {
184 obj = get_decoded_hash( parser );
186 obj = get_hash( parser );
187 } else if( 'n' == firstc ) {
188 obj = get_null( parser );
189 } else if( 't' == firstc ) {
190 obj = get_true( parser );
191 } else if( 'f' == firstc ) {
192 obj = get_false( parser );
194 else if( isdigit( (unsigned char) firstc ) ||
200 obj = get_number( parser, firstc );
202 report_error( parser, firstc, "Unexpected character" );
209 @brief Collect characters into a character string.
210 @param parser Pointer to a Parser.
211 @return Pointer to parser->str_buf if successful, or NULL upon error.
213 Translate the usual escape sequences. In particular, "\u" escapes a sequence of four
214 hex characters; turn the hex into the corresponding UTF-8 byte sequence.
216 Return the string we have built, without the enclosing quotation marks, in
217 parser->str_buf. In case of error, log an error message.
219 static const char* get_string( Parser* parser ) {
221 if( parser->str_buf )
222 buffer_reset( parser->str_buf );
224 parser->str_buf = buffer_init( 64 );
226 growing_buffer* gb = parser->str_buf;
228 // Collect the characters.
230 char c = parser_nextc( parser );
234 report_error( parser, parser->buff[ parser->index - 1 ],
235 "Quoted string not terminated" );
237 } else if( '\\' == c ) {
238 c = parser_nextc( parser );
240 case '"' : OSRF_BUFFER_ADD_CHAR( gb, '"' ); break;
241 case '\\' : OSRF_BUFFER_ADD_CHAR( gb, '\\' ); break;
242 case '/' : OSRF_BUFFER_ADD_CHAR( gb, '/' ); break;
243 case 'b' : OSRF_BUFFER_ADD_CHAR( gb, '\b' ); break;
244 case 'f' : OSRF_BUFFER_ADD_CHAR( gb, '\f' ); break;
245 case 'n' : OSRF_BUFFER_ADD_CHAR( gb, '\n' ); break;
246 case 'r' : OSRF_BUFFER_ADD_CHAR( gb, '\r' ); break;
247 case 't' : OSRF_BUFFER_ADD_CHAR( gb, '\t' ); break;
250 if( get_utf8( parser, &unibuff ) ) {
251 return NULL; // bad UTF-8
252 } else if( unibuff.buff[0] ) {
253 OSRF_BUFFER_ADD( gb, (char*) unibuff.buff );
255 report_error( parser, 'u', "Unicode sequence encodes a nul byte" );
260 default : OSRF_BUFFER_ADD_CHAR( gb, c ); break;
264 OSRF_BUFFER_ADD_CHAR( gb, c );
267 return OSRF_BUFFER_C_STR( gb );
271 @brief Collect characters into a number, and create a JSON_NUMBER for it.
272 @param parser Pointer to a parser.
273 @param firstc The first character in the number.
274 @return Pointer to a newly created jsonObject of type JSON_NUMBER, or NULL upon error.
276 Collect digits, signs, decimal points, and 'E' or 'e' (for scientific notation) into
277 a buffer. Make sure that the result is numeric. If it's not numeric by strict JSON
278 rules, try to make it numeric by some judicious massaging (we aren't quite as strict
279 as the official JSON rules).
281 If successful, construct a jsonObject of type JSON_NUMBER containing the resulting
282 numeric string. Otherwise log an error message and return NULL.
284 static jsonObject* get_number( Parser* parser, char firstc ) {
286 if( parser->str_buf )
287 buffer_reset( parser->str_buf );
289 parser->str_buf = buffer_init( 64 );
291 growing_buffer* gb = parser->str_buf;
292 OSRF_BUFFER_ADD_CHAR( gb, firstc );
297 c = parser_nextc( parser );
298 if( isdigit( (unsigned char) c ) ||
304 OSRF_BUFFER_ADD_CHAR( gb, c );
306 if( ! isspace( (unsigned char) c ) )
307 parser_ungetc( parser );
312 char* s = buffer_data( gb );
313 if( ! jsonIsNumeric( s ) ) {
314 char* temp = jsonScrubNumber( s );
318 report_error( parser, parser->buff[ parser->index - 1 ],
319 "Invalid numeric format" );
324 jsonObject* obj = jsonNewObject( NULL );
325 obj->type = JSON_NUMBER;
332 @brief Parse an array, and create a JSON_ARRAY for it.
333 @param parser Pointer to a Parser.
334 @return Pointer to a newly created jsonObject of type JSON_ARRAY, or NULL upon error.
336 Look for a series of JSON nodes, separated by commas and terminated by a right square
337 bracket. Parse each node recursively, collect them all into a newly created jsonObject
338 of type JSON_ARRAY, and return a pointer to the result.
340 Upon error, log an error message and return NULL.
342 static jsonObject* get_array( Parser* parser ) {
344 jsonObject* array = jsonNewObjectType( JSON_ARRAY );
346 char c = skip_white_space( parser );
348 return array; // Empty array
351 jsonObject* obj = get_json_node( parser, c );
353 jsonObjectFree( array );
354 return NULL; // Failed to get anything
357 // Add the entry to the array
358 jsonObjectPush( array, obj );
360 // Look for a comma or right bracket
361 c = skip_white_space( parser );
364 else if( c != ',' ) {
365 report_error( parser, c, "Expected comma or bracket in array; didn't find it\n" );
366 jsonObjectFree( array );
369 c = skip_white_space( parser );
376 @brief Parse a hash (JSON object), and create a JSON_HASH for it.
377 @param parser Pointer to a Parser.
378 @return Pointer to a newly created jsonObject of type JSON_HASH, or NULL upon error.
380 Look for a series of name/value pairs, separated by commas and terminated by a right
381 curly brace. Each name/value pair consists of a quoted string, followed by a colon,
382 followed a JSON node of any sort. Parse the value recursively.
384 Collect the name/value pairs into a newly created jsonObject of type JSON_ARRAY, and
385 return a pointer to it.
387 Upon error, log an error message and return NULL.
389 static jsonObject* get_hash( Parser* parser ) {
390 jsonObject* hash = jsonNewObjectType( JSON_HASH );
392 char c = skip_white_space( parser );
394 return hash; // Empty hash
398 // Get the key string
400 report_error( parser, c,
401 "Expected quotation mark to begin hash key; didn't find it\n" );
402 jsonObjectFree( hash );
406 const char* key = get_string( parser );
408 jsonObjectFree( hash );
411 char* key_copy = strdup( key );
413 if( jsonObjectGetKeyConst( hash, key_copy ) ) {
414 report_error( parser, '"', "Duplicate key in JSON object" );
415 jsonObjectFree( hash );
420 c = skip_white_space( parser );
422 report_error( parser, c,
423 "Expected colon after hash key; didn't find it\n" );
425 jsonObjectFree( hash );
429 // Get the associated value
430 jsonObject* obj = get_json_node( parser, skip_white_space( parser ) );
433 jsonObjectFree( hash );
437 // Add a new entry to the hash
438 jsonObjectSetKey( hash, key_copy, obj );
441 // Look for comma or right brace
442 c = skip_white_space( parser );
445 else if( c != ',' ) {
446 report_error( parser, c,
447 "Expected comma or brace in hash, didn't find it" );
448 jsonObjectFree( hash );
451 c = skip_white_space( parser );
458 @brief Parse a hash (JSON object), and create a JSON_HASH for it; decode class hints.
459 @param parser Pointer to a Parser.
460 @return Pointer to a newly created jsonObject, or NULL upon error.
462 This function is similar to get_hash(), @em except:
464 If the hash includes a member with a key equal to JSON_CLASS_KEY ("__c" by default),
465 then look for a member whose key is JSON_DATA_KEY ("__p" by default). If you find one,
466 return the data associated with it; otherwise return a jsonObject of type JSON_NULL.
468 If there is no member with a key equal to JSON_CLASS_KEY, then return the same sort of
469 jsonObject as get_hash() would return (except of course that lower levels may be
470 decoded as described above).
472 static jsonObject* get_decoded_hash( Parser* parser ) {
473 jsonObject* hash = jsonNewObjectType( JSON_HASH );
475 char c = skip_white_space( parser );
477 return hash; // Empty hash
479 char* class_name = NULL;
483 // Get the key string
485 report_error( parser, c,
486 "Expected quotation mark to begin hash key; didn't find it\n" );
487 jsonObjectFree( hash );
491 const char* key = get_string( parser );
493 jsonObjectFree( hash );
496 char* key_copy = strdup( key );
498 if( jsonObjectGetKeyConst( hash, key_copy ) ) {
499 report_error( parser, '"', "Duplicate key in JSON object" );
500 jsonObjectFree( hash );
505 c = skip_white_space( parser );
507 report_error( parser, c,
508 "Expected colon after hash key; didn't find it\n" );
510 jsonObjectFree( hash );
514 // Get the associated value
515 jsonObject* obj = get_json_node( parser, skip_white_space( parser ) );
518 jsonObjectFree( hash );
522 // Add a new entry to the hash
523 jsonObjectSetKey( hash, key_copy, obj );
525 // Save info for class hint, if present
526 if( !strcmp( key_copy, JSON_CLASS_KEY ) )
527 class_name = jsonObjectToSimpleString( obj );
531 // Look for comma or right brace
532 c = skip_white_space( parser );
535 else if( c != ',' ) {
536 report_error( parser, c,
537 "Expected comma or brace in hash, didn't find it" );
538 jsonObjectFree( hash );
541 c = skip_white_space( parser );
545 // We found a class hint. Extract the data node and return it.
546 jsonObject* class_data = osrfHashExtract( hash->value.h, JSON_DATA_KEY );
548 class_data->parent = NULL;
549 jsonObjectFree( hash );
552 hash->classname = class_name;
554 // Huh? We have a class name but no data for it.
555 // Throw away what we have and return a JSON_NULL.
556 jsonObjectFree( hash );
557 hash = jsonNewObjectType( JSON_NULL );
569 @brief Parse the JSON keyword "null", and create a JSON_NULL for it.
570 @param parser Pointer to a Parser.
571 @return Pointer to a newly created jsonObject of type JSON_NULL, or NULL upon error.
573 We already saw an 'n', or we wouldn't be here. Make sure that the next three characters
574 are 'u', 'l', and 'l', and that the character after that is not a letter or a digit.
576 If all goes well, create a jsonObject of type JSON_NULL, and return a pointer to it.
577 Otherwise log an error message and return NULL.
579 static jsonObject* get_null( Parser* parser ) {
581 if( parser_nextc( parser ) != 'u' ||
582 parser_nextc( parser ) != 'l' ||
583 parser_nextc( parser ) != 'l' ) {
584 report_error( parser, parser->buff[ parser->index - 1 ],
585 "Expected \"ull\" to follow \"n\"; didn't find it" );
589 // Peek at the next character to make sure that it's kosher
590 char c = parser_nextc( parser );
591 if( ! isspace( (unsigned char) c ) )
592 parser_ungetc( parser );
594 if( isalnum( (unsigned char) c ) ) {
595 report_error( parser, c, "Found letter or number after \"null\"" );
599 // Everything's okay. Return a JSON_NULL.
600 return jsonNewObject( NULL );
604 @brief Parse the JSON keyword "true", and create a JSON_BOOL for it.
605 @param parser Pointer to a Parser.
606 @return Pointer to a newly created jsonObject of type JSON_BOOL, or NULL upon error.
608 We already saw a 't', or we wouldn't be here. Make sure that the next three characters
609 are 'r', 'u', and 'e', and that the character after that is not a letter or a digit.
611 If all goes well, create a jsonObject of type JSON_BOOL, and return a pointer to it.
612 Otherwise log an error message and return NULL.
614 static jsonObject* get_true( Parser* parser ) {
616 if( parser_nextc( parser ) != 'r' ||
617 parser_nextc( parser ) != 'u' ||
618 parser_nextc( parser ) != 'e' ) {
619 report_error( parser, parser->buff[ parser->index - 1 ],
620 "Expected \"rue\" to follow \"t\"; didn't find it" );
624 // Peek at the next character to make sure that it's kosher
625 char c = parser_nextc( parser );
626 if( ! isspace( (unsigned char) c ) )
627 parser_ungetc( parser );
629 if( isalnum( (unsigned char) c ) ) {
630 report_error( parser, c, "Found letter or number after \"true\"" );
634 // Everything's okay. Return a JSON_BOOL.
635 return jsonNewBoolObject( 1 );
639 @brief Parse the JSON keyword "false", and create a JSON_BOOL for it.
640 @param parser Pointer to a Parser.
641 @return Pointer to a newly created jsonObject of type JSON_BOOL, or NULL upon error.
643 We already saw a 'f', or we wouldn't be here. Make sure that the next four characters
644 are 'a', 'l', 's', and 'e', and that the character after that is not a letter or a digit.
646 If all goes well, create a jsonObject of type JSON_BOOL, and return a pointer to it.
647 Otherwise log an error message and return NULL.
649 static jsonObject* get_false( Parser* parser ) {
651 if( parser_nextc( parser ) != 'a' ||
652 parser_nextc( parser ) != 'l' ||
653 parser_nextc( parser ) != 's' ||
654 parser_nextc( parser ) != 'e' ) {
655 report_error( parser, parser->buff[ parser->index - 1 ],
656 "Expected \"alse\" to follow \"f\"; didn't find it" );
660 // Peek at the next character to make sure that it's kosher
661 char c = parser_nextc( parser );
662 if( ! isspace( (unsigned char) c ) )
663 parser_ungetc( parser );
665 if( isalnum( (unsigned char) c ) ) {
666 report_error( parser, c, "Found letter or number after \"false\"" );
670 // Everything's okay. Return a JSON_BOOL.
671 return jsonNewBoolObject( 0 );
/**
	@brief Convert a hex digit to the corresponding numeric value.
	@param x A hex digit: one of [0-9], [a-f], or [A-F].
	@return The corresponding numeric value, in the range 0 through 15.

	This is a function rather than a macro so that the argument is evaluated exactly
	once; an argument with side effects is therefore safe.

	Warning #1: The calling code must ensure that the character to be converted is, in fact,
	a hex character.  Otherwise the results will be strange.

	Warning #2: This code assumes that the characters [a-f] and [A-F] are contiguous in the
	execution character set, and that the lower 4 bits for 'a' and 'A' are 0001.  Those
	assumptions are true for ASCII and EBCDIC, but there may be some character sets for
	which they are not true.
*/
static inline int hexdigit( int x ) {
	// Decimal digits sort at or below '9'; for letters, the low three bits
	// count from 1 for 'a' or 'A'
	return ( x <= '9' ) ? x - '0' : ( x & 7 ) + 9;
}
693 @brief Translate the next four characters into a UTF-8 character.
694 @param parser Pointer to a Parser.
695 @param unibuff Pointer to a small buffer in which to return the results.
696 @return 0 if successful, or 1 if not.
698 Collect the next four characters into @a unibuff, and make sure that they're all hex.
699 Translate them into a nul-terminated UTF-8 byte sequence, and return the result via
702 (Note that a UTF-8 byte sequence is guaranteed not to contain a nul byte. Hence using
703 a nul as a terminator creates no ambiguity.)
705 static int get_utf8( Parser* parser, Unibuff* unibuff ) {
709 // Accumulate four characters into a buffer. Make sure that
710 // there are four of them, and that they're all hex.
711 for( i = 0; i < 4; ++i ) {
712 int c = parser_nextc( parser );
714 report_error( parser, 'u', "Incomplete Unicode sequence" );
715 unibuff->buff[ 0 ] = '\0';
717 } else if( ! isxdigit( (unsigned char) c ) ) {
718 report_error( parser, c, "Non-hex byte found in Unicode sequence" );
719 unibuff->buff[ 0 ] = '\0';
726 /* The following code is adapted with permission from
727 * json-c http://oss.metaparadigm.com/json-c/
730 // Convert the hex sequence to a single integer
731 unsigned int ucs_char =
732 (hexdigit(ubuff[ 0 ]) << 12) +
733 (hexdigit(ubuff[ 1 ]) << 8) +
734 (hexdigit(ubuff[ 2 ]) << 4) +
735 hexdigit(ubuff[ 3 ]);
737 unsigned char* utf_out = unibuff->buff;
739 if (ucs_char < 0x80) {
740 utf_out[0] = ucs_char;
743 } else if (ucs_char < 0x800) {
744 utf_out[0] = 0xc0 | (ucs_char >> 6);
745 utf_out[1] = 0x80 | (ucs_char & 0x3f);
749 utf_out[0] = 0xe0 | (ucs_char >> 12);
750 utf_out[1] = 0x80 | ((ucs_char >> 6) & 0x3f);
751 utf_out[2] = 0x80 | (ucs_char & 0x3f);
759 @brief Skip over white space.
760 @param parser Pointer to a Parser.
761 @return The next non-whitespace character.
763 static char skip_white_space( Parser* parser ) {
766 c = parser_nextc( parser );
767 } while( isspace( (unsigned char) c ) );
773 @brief Back up by one character.
774 @param parser Pointer to a Parser.
776 Decrement an index into the input string. We don't guard against a negative index, so
777 the calling code should make sure that it doesn't do anything stupid.
779 static inline void parser_ungetc( Parser* parser ) {
784 @brief Get the next character
785 @param parser Pointer to a Parser.
786 @return The next character.
788 Increment an index into the input string and return the corresponding character.
789 The calling code should make sure that it doesn't try to read past the terminal nul.
791 static inline char parser_nextc( Parser* parser ) {
792 return parser->buff[ parser->index++ ];
796 @brief Report a syntax error to the log.
797 @param parser Pointer to a Parser.
798 @param badchar The character at the position where the error was detected.
799 @param err Pointer to a descriptive error message.
801 Format and log an error message. Identify the location of the error and
802 the character at that location. Show the neighborhood of the error within
805 static void report_error( Parser* parser, char badchar, const char* err ) {
807 // Determine the beginning and ending points of a JSON
808 // fragment to display, from the vicinity of the error
810 const int max_margin = 15; // How many characters to show
811 // on either side of the error
812 int pre = parser->index - max_margin;
816 int post = parser->index + 15;
817 if( '\0' == parser->buff[ parser->index ] ) {
818 post = parser->index - 1;
820 int remaining = strlen(parser->buff + parser->index);
821 if( remaining < max_margin )
822 post = parser->index + remaining;
825 // Copy the fragment into a buffer
826 int len = post - pre + 1; // length of fragment
828 memcpy( buf, parser->buff + pre, len );
831 // Replace newlines and tabs with spaces
834 if( '\n' == *p || '\t' == *p )
839 // Avoid trying to display a nul character
840 if( '\0' == badchar )
844 osrfLogError( OSRF_LOG_MARK,
845 "*JSON Parser Error\n - char = %c\n "
846 "- index = %d\n - near => %s\n - %s",
847 badchar, parser->index, buf, err );