src/libopensrf/osrf_parse_json.c

   1 /*
   2 Copyright (C) 2009  Georgia Public Library Service
   3 Scott McKellar <scott@esilibrary.com>
   4
   5 This program is free software; you can redistribute it and/or
   6 modify it under the terms of the GNU General Public License
   7 as published by the Free Software Foundation; either version 2
   8 of the License, or (at your option) any later version.
   9
  10 This program is distributed in the hope that it will be useful,
  11 but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 GNU General Public License for more details.
  14 */
  15
  16 #include <stdlib.h>
  17 #include <string.h>
  18 #include <stdio.h>
  19 #include <ctype.h>
  20 #include <opensrf/osrf_json.h>
  21 #include <opensrf/osrf_json_utils.h>
  22
  23 typedef struct {
  24         growing_buffer* str_buf;  // for building strings
  25         size_t index;             // index into buffer
  26         const char* buff;         // client's buffer holding current chunk of input
  27 } Parser;
  28
  /* Receives the nul-terminated UTF-8 byte sequence built from one \uXXXX escape. */
  typedef struct {
      unsigned char buff[ 4 ];
  } Unibuff;
  33
  34 static jsonObject* parse( Parser* parser );
  35
  36 static jsonObject* get_json_thing( Parser* parser, char firstc );
  37 static const char* get_string( Parser* parser );
  38 static jsonObject* get_number( Parser* parser, char firstc );
  39 static jsonObject* get_array( Parser* parser );
  40 static jsonObject* get_hash( Parser* parser );
  41 static jsonObject* get_null( Parser* parser );
  42 static jsonObject* get_true( Parser* parser );
  43 static jsonObject* get_false( Parser* parser );
  44 static int get_utf8( Parser* parser, Unibuff* unibuff );
  45
  46 static char skip_white_space( Parser* parser );
  47 static inline void parser_ungetc( Parser* parser );
  48 static inline char parser_nextc( Parser* parser );
  49 static void report_error( Parser* parser, char badchar, char* err );
  50
  51 /* ------------------------------------- */
  52
  53 // Parse a JSON string; expand classes; construct a jsonObject.
  54 // Return NULL if the JSON string is invalid.
  55 jsonObject* jsonParse( const char* str ) {
  56         if(!str)
  57                 return NULL;
  58
  59         jsonObject* obj  = jsonParseRaw( str );
  60
  61         jsonObject* obj2 = NULL;
  62         if( obj )
  63                 obj2 = jsonObjectDecodeClass( obj );
  64
  65         jsonObjectFree( obj  );
  66
  67         return obj2;
  68 }
  69
  70 // Parse a JSON string with variable arguments; construct a jsonObject.
  71 // Return NULL if the resulting JSON string is invalid.
  72 jsonObject* jsonParseFmt( const char* str, ... ) {
  73         if( !str )
  74                 return NULL;
  75         VA_LIST_TO_STRING(str);
  76         return jsonParseRaw( VA_BUF );
  77 }
  78
  79 // Parse a JSON string; construct a jsonObject.
  80 // Return NULL if the JSON string is invalid.
  81 jsonObject* jsonParseRaw( const char* s ) {
  82
  83         if( !s || !*s )
  84                 return NULL;    // Nothing to parse
  85
  86         Parser parser;
  87
  88         parser.str_buf = NULL;
  89         parser.index = 0;
  90         parser.buff = s;
  91
  92         jsonObject* obj = parse( &parser );
  93
  94         buffer_free( parser.str_buf );
  95         return obj;
  96 }
  97
  98 // Parse a text string into a jsonObject.
  99 static jsonObject* parse( Parser* parser ) {
 100
 101         if( ! parser->buff ) {
 102                 osrfLogError( OSRF_LOG_MARK, "Internal error; no input buffer available" );
 103                 return NULL;         // Should never happen
 104         }
 105
 106         jsonObject* obj = get_json_thing( parser, skip_white_space( parser ) );
 107
 108         char c;
 109         if( obj && (c = skip_white_space( parser )) ) {
 110                 report_error( parser, c, "Extra material follows JSON string" );
 111                 jsonObjectFree( obj );
 112                 obj = NULL;
 113         }
 114
 115         return obj;
 116 }
 117
 118 // Get the next JSON node -- be it string, number, hash, or whatever.
 119 // Return a pointer to it if successful, or NULL if not.
 120 static jsonObject* get_json_thing( Parser* parser, char firstc ) {
 121
 122         jsonObject* obj = NULL;
 123
 124         // Branch on the first character
 125         if( '"' == firstc ) {
 126                 const char* str = get_string( parser );
 127                 if( str ) {
 128                         obj = jsonNewObject( NULL );
 129                         obj->type = JSON_STRING;
 130                         obj->value.s = strdup( str );
 131                 }
 132         } else if( '[' == firstc ) {
 133                 obj = get_array( parser );
 134         } else if( '{' == firstc ) {
 135                 obj = get_hash( parser );
 136         } else if( 'n' == firstc ) {
 137                 obj = get_null( parser );
 138         } else if( 't' == firstc ) {
 139                 obj = get_true( parser );
 140         } else if( 'f' == firstc ) {
 141                 obj = get_false( parser );
 142         }
 143         else if( isdigit( (unsigned char) firstc ) ||
 144                          '.' == firstc ||
 145                          '-' == firstc ||
 146                          '+' == firstc ||
 147                          'e' == firstc ||
 148                          'E' == firstc ) {
 149                 obj = get_number( parser, firstc );
 150         } else {
 151                 report_error( parser, firstc, "Unexpected character" );
 152         }
 153
 154         return obj;
 155 }
 156
 157 // Collect characters from the input stream into a character
 158 // string, terminated by '"'.  Return a char* if successful,
 159 // or NULL if not.
 160 static const char* get_string( Parser* parser ) {
 161
 162         if( parser->str_buf )
 163                 buffer_reset( parser->str_buf );
 164         else
 165                 parser->str_buf = buffer_init( 64 );
 166
 167         growing_buffer* gb = parser->str_buf;
 168
 169         // Collect the characters.
 170         // This is a naive implementation so far.
 171         // We need to worry about UTF-8.
 172         for( ;; ) {
 173                 char c = parser_nextc( parser );
 174                 if( '"' == c )
 175                         break;
 176                 else if( !c ) {
 177                         report_error( parser, parser->buff[ parser->index - 1  ],
 178                                                   "Quoted string not terminated" );
 179                         return NULL;
 180                 } else if( '\\' == c ) {
 181                         c = parser_nextc( parser );
 182                         switch( c ) {
 183                                 case '"'  : OSRF_BUFFER_ADD_CHAR( gb, '"'  );  break;
 184                                 case '\\' : OSRF_BUFFER_ADD_CHAR( gb, '\\' ); break;
 185                                 case '/'  : OSRF_BUFFER_ADD_CHAR( gb, '/'  );  break;
 186                                 case 'b'  : OSRF_BUFFER_ADD_CHAR( gb, '\b' ); break;
 187                                 case 'f'  : OSRF_BUFFER_ADD_CHAR( gb, '\f' ); break;
 188                                 case 'n'  : OSRF_BUFFER_ADD_CHAR( gb, '\n' ); break;
 189                                 case 'r'  : OSRF_BUFFER_ADD_CHAR( gb, '\r' ); break;
 190                                 case 't'  : OSRF_BUFFER_ADD_CHAR( gb, '\t' ); break;
 191                                 case 'u'  : {
 192                                         Unibuff unibuff;
 193                                         if( get_utf8( parser, &unibuff ) ) {
 194                                                 return NULL;       // bad UTF-8
 195                                         } else if( unibuff.buff[0] ) {
 196                                                 OSRF_BUFFER_ADD( gb, (char*) unibuff.buff );
 197                                         } else {
 198                                                 report_error( parser, 'u', "Unicode sequence encodes a nul byte" );
 199                                                 return NULL;
 200                                         }
 201                                         break;
 202                                 }
 203                                 default   : OSRF_BUFFER_ADD_CHAR( gb, c );    break;
 204                         }
 205                 }
 206                 else
 207                         OSRF_BUFFER_ADD_CHAR( gb, c );
 208         }
 209
 210         return OSRF_BUFFER_C_STR( gb );
 211 }
 212
 213 // We found what looks like the first character of a number.
 214 // Collect all the eligible characters, and verify that they
 215 // are numeric (possibly after some scrubbing).  Return a
 216 // pointer to a JSON_NUMBER if successful, or NULL if not.
 217 static jsonObject* get_number( Parser* parser, char firstc ) {
 218
 219         growing_buffer* gb = buffer_init( 32 );
 220         OSRF_BUFFER_ADD_CHAR( gb, firstc );
 221
 222         char c;
 223
 224         for( ;; ) {
 225                 c = parser_nextc( parser );
 226                 if( isdigit( (unsigned char) c ) ||
 227                         '.' == c ||
 228                         '-' == c ||
 229                         '+' == c ||
 230                         'e' == c ||
 231                         'E' == c ) {
 232                         OSRF_BUFFER_ADD_CHAR( gb, c );
 233                 } else {
 234                         if( ! isspace( (unsigned char) c ) )
 235                                 parser_ungetc( parser );
 236                         break;
 237                 }
 238         }
 239
 240         char* s = buffer_release( gb );
 241         if( ! jsonIsNumeric( s ) ) {
 242                 char* temp = jsonScrubNumber( s );
 243                 free( s );
 244                 s = temp;
 245                 if( !s ) {
 246                         report_error( parser, parser->buff[ parser->index - 1 ],
 247                                         "Invalid numeric format" );
 248                         return NULL;
 249                 }
 250         }
 251
 252         jsonObject* obj = jsonNewObject( NULL );
 253         obj->type = JSON_NUMBER;
 254         obj->value.s = s;
 255
 256         return obj;
 257 }
 258
 259 // We found a '['.  Create a JSON_ARRAY with all its subordinates.
 260 static jsonObject* get_array( Parser* parser ) {
 261
 262         jsonObject* array = jsonNewObjectType( JSON_ARRAY );
 263
 264         char c = skip_white_space( parser );
 265         if( ']' == c )
 266                 return array;          // Empty array
 267
 268         for( ;; ) {
 269                 jsonObject* obj = get_json_thing( parser, c );
 270                 if( !obj ) {
 271                         jsonObjectFree( array );
 272                         return NULL;         // Failed to get anything
 273                 }
 274
 275                 // Add the entry to the array
 276                 jsonObjectPush( array, obj );
 277
 278                 // Look for a comma or right bracket
 279                 c = skip_white_space( parser );
 280                 if( ']' == c )
 281                         break;
 282                 else if( c != ',' ) {
 283                         report_error( parser, c, "Expected comma or bracket in array; didn't find it\n" );
 284                         jsonObjectFree( array );
 285                         return NULL;
 286                 }
 287                 c = skip_white_space( parser );
 288         }
 289
 290         return array;
 291 }
 292
 293 // We found '{' Get a JSON_HASH, with all its subordinates.
 294 static jsonObject* get_hash( Parser* parser ) {
 295         jsonObject* hash = jsonNewObjectType( JSON_HASH );
 296
 297         char c = skip_white_space( parser );
 298         if( '}' == c )
 299                 return hash;           // Empty hash
 300
 301         for( ;; ) {
 302
 303                 // Get the key string
 304                 if( '"' != c ) {
 305                         report_error( parser, c,
 306                                         "Expected quotation mark to begin hash key; didn't find it\n" );
 307                         jsonObjectFree( hash );
 308                         return NULL;
 309                 }
 310
 311                 const char* key = get_string( parser );
 312                 if( ! key ) {
 313                         jsonObjectFree( hash );
 314                         return NULL;
 315                 }
 316                 char* key_copy = strdup( key );
 317
 318                 if( jsonObjectGetKey( hash, key_copy ) ) {
 319                         report_error( parser, '"', "Duplicate key in JSON object" );
 320                         jsonObjectFree( hash );
 321                         return NULL;
 322                 }
 323
 324                 // Get the colon
 325                 c = skip_white_space( parser );
 326                 if( c != ':' ) {
 327                         report_error( parser, c,
 328                                         "Expected colon after hash key; didn't find it\n" );
 329                         free( key_copy );
 330                         jsonObjectFree( hash );
 331                         return NULL;
 332                 }
 333
 334                 // Get the associated value
 335                 jsonObject* obj = get_json_thing( parser, skip_white_space( parser ) );
 336                 if( !obj ) {
 337                         free( key_copy );
 338                         jsonObjectFree( hash );
 339                         return NULL;
 340                 }
 341
 342                 // Add a new entry to the hash
 343                 jsonObjectSetKey( hash, key_copy, obj );
 344                 free( key_copy );
 345
 346                 // Look for comma or right brace
 347                 c = skip_white_space( parser );
 348                 if( '}' == c )
 349                         break;
 350                 else if( c != ',' ) {
 351                         report_error( parser, c,
 352                                         "Expected comma or brace in hash, didn't find it" );
 353                         jsonObjectFree( hash );
 354                         return NULL;
 355                 }
 356                 c = skip_white_space( parser );
 357         }
 358
 359         return hash;
 360 }
 361
 362 // We found an 'n'.  Verify that the next four characters are "ull",
 363 // and that there are no further characters in the token.
 364 static jsonObject* get_null( Parser* parser ) {
 365
 366         if( parser_nextc( parser ) != 'u' ||
 367                 parser_nextc( parser ) != 'l' ||
 368                 parser_nextc( parser ) != 'l' ) {
 369                 report_error( parser, parser->buff[ parser->index - 1 ],
 370                                 "Expected \"ull\" to follow \"n\"; didn't find it" );
 371                 return NULL;
 372         }
 373
 374         // Sneak a peek at the next character
 375         // to make sure that it's kosher
 376         char c = parser_nextc( parser );
 377         if( ! isspace( (unsigned char) c ) )
 378                 parser_ungetc( parser );
 379
 380         if( isalnum( (unsigned char) c ) ) {
 381                 report_error( parser, c,
 382                                 "Found letter or number after \"null\"" );
 383                 return NULL;
 384         }
 385
 386         // Everythings okay.  Return a JSON_BOOL.
 387         return jsonNewObject( NULL );
 388 }
 389
 390 // We found a 't'.  Verify that the next four characters are "rue",
 391 // and that there are no further characters in the token.
 392 static jsonObject* get_true( Parser* parser ) {
 393
 394         if( parser_nextc( parser ) != 'r' ||
 395                 parser_nextc( parser ) != 'u' ||
 396                 parser_nextc( parser ) != 'e' ) {
 397                 report_error( parser, parser->buff[ parser->index - 1 ],
 398                                           "Expected \"rue\" to follow \"t\"; didn't find it" );
 399                 return NULL;
 400         }
 401
 402         // Sneak a peek at the next character
 403         // to make sure that it's kosher
 404         char c = parser_nextc( parser );
 405         if( ! isspace( (unsigned char) c ) )
 406                 parser_ungetc( parser );
 407
 408         if( isalnum( (unsigned char) c ) ) {
 409                 report_error( parser, c,
 410                                 "Found letter or number after \"true\"" );
 411                 return NULL;
 412         }
 413
 414         // Everythings okay.  Return a JSON_NULL.
 415         return jsonNewBoolObject( 1 );
 416 }
 417
 418 // We found an 'f'.  Verify that the next four characters are "alse",
 419 // and that there are no further characters in the token.
 420 static jsonObject* get_false( Parser* parser ) {
 421
 422         if( parser_nextc( parser ) != 'a' ||
 423                 parser_nextc( parser ) != 'l' ||
 424                 parser_nextc( parser ) != 's' ||
 425                 parser_nextc( parser ) != 'e' ) {
 426                 report_error( parser, parser->buff[ parser->index - 1 ],
 427                                 "Expected \"alse\" to follow \"f\"; didn't find it" );
 428                 return NULL;
 429         }
 430
 431         // Sneak a peek at the next character
 432         // to make sure that it's kosher
 433         char c = parser_nextc( parser );
 434         if( ! isspace( (unsigned char) c ) )
 435                 parser_ungetc( parser );
 436
 437         if( isalnum( (unsigned char) c ) ) {
 438                 report_error( parser, c,
 439                                 "Found letter or number after \"false\"" );
 440                 return NULL;
 441         }
 442
 443         // Everythings okay.  Return a JSON_BOOL.
 444         return jsonNewBoolObject( 0 );
 445 }
 446
 447 // We found \u.  Grab the next 4 characters, confirm that they are hex,
 448 // and convert them to Unicode.
 449 static int get_utf8( Parser* parser, Unibuff* unibuff ) {
 450         char ubuff[ 5 ];
 451         int i = 0;
 452
 453         // Accumulate four characters into a buffer.  Make sure that
 454         // there are four of them, and that they're all hex.
 455         for( i = 0; i < 4; ++i ) {
 456                 int c = parser_nextc( parser );
 457                 if( !c ) {
 458                         report_error( parser, 'u', "Incomplete Unicode sequence" );
 459                         unibuff->buff[ 0 ] = '\0';
 460                         return 1;
 461                 } else if( ! isxdigit( (unsigned char) c ) ) {
 462                         report_error( parser, c, "Non-hex byte found in Unicode sequence" );
 463                         unibuff->buff[ 0 ] = '\0';
 464                         return 1;
 465                 }
 466                 else
 467                         ubuff[ i ] = c;
 468         }
 469
 470         /* The following code is adapted with permission from
 471          * json-c http://oss.metaparadigm.com/json-c/
 472          */
 473         #define hexdigit(x) ( ((x) <= '9') ? (x) - '0' : ((x) & 7) + 9)
 474
 475         // Convert the hex sequence into a single integer
 476         unsigned int ucs_char =
 477                         (hexdigit(ubuff[ 0 ]) << 12) +
 478                         (hexdigit(ubuff[ 1 ]) <<  8) +
 479                         (hexdigit(ubuff[ 2 ]) <<  4) +
 480                          hexdigit(ubuff[ 3 ]);
 481
 482         unsigned char* utf_out = unibuff->buff;
 483
 484         if (ucs_char < 0x80) {
 485                 utf_out[0] = ucs_char;
 486                 utf_out[1] = '\0';
 487
 488         } else if (ucs_char < 0x800) {
 489                 utf_out[0] = 0xc0 | (ucs_char >> 6);
 490                 utf_out[1] = 0x80 | (ucs_char & 0x3f);
 491                 utf_out[2] = '\0';
 492
 493         } else {
 494                 utf_out[0] = 0xe0 | (ucs_char >> 12);
 495                 utf_out[1] = 0x80 | ((ucs_char >> 6) & 0x3f);
 496                 utf_out[2] = 0x80 | (ucs_char & 0x3f);
 497                 utf_out[3] = '\0';
 498         }
 499
 500         return 0;
 501 }
 502
 503 // Return the next non-whitespace character in the input stream.
 504 static char skip_white_space( Parser* parser ) {
 505         char c;
 506         do {
 507                 c = parser_nextc( parser );
 508         } while( isspace( (unsigned char) c ) );
 509
 510         return c;
 511 }
 512
 513 // Put a character back into the input stream.
 514 // It is the responsibility of the caller not to back up
 515 // past the beginning of the input string.
 516 static inline void parser_ungetc( Parser* parser ) {
 517         --parser->index;
 518 }
 519
 520 // Get the next character.  It is the responsibility of
 521 //the caller not to read past the end of the input string.
 522 static inline char parser_nextc( Parser* parser ) {
 523         return parser->buff[ parser->index++ ];
 524 }
 525
 526 // Report a syntax error to standard error.
 527 static void report_error( Parser* parser, char badchar, char* err ) {
 528
 529         // Determine the beginning and ending points of a JSON
 530         // fragment to display, from the vicinity of the error
 531
 532         const int max_margin = 15;  // How many characters to show
 533                                     // on either side of the error
 534         int pre = parser->index - max_margin;
 535         if( pre < 0 )
 536                 pre = 0;
 537
 538         int post = parser->index + 15;
 539         if( '\0' == parser->buff[ parser->index ] ) {
 540                 post = parser->index - 1;
 541         } else {
 542                 int remaining = strlen(parser->buff + parser->index);
 543                 if( remaining < max_margin )
 544                         post = parser->index + remaining;
 545         }
 546
 547         // Copy the fragment into a buffer
 548
 549         int len = post - pre + 1;  // length of fragment
 550         char buf[len + 1];
 551         memcpy( buf, parser->buff + pre, len );
 552         buf[ len ] = '\0';
 553
 554         // Replace newlines and tabs with spaces
 555         char* p = buf;
 556         while( *p ) {
 557                 if( '\n' == *p || '\t' == *p )
 558                         *p = ' ';
 559                 ++p;
 560         }
 561
 562         // Avoid trying to display a nul character
 563         if( '\0' == badchar )
 564                 badchar = ' ';
 565
 566         // Issue the message
 567         osrfLogError( OSRF_LOG_MARK,
 568                 "*JSON Parser Error\n - char  = %c\n "
 569                 "- index = %d\n - near  => %s\n - %s",
 570                 badchar, parser->index, buf, err );
 571 }