From 4a02ad9f39356a92105fdd1491a95e98634e5ea3 Mon Sep 17 00:00:00 2001
From: erickson <erickson@9efc2488-bf62-4759-914b-345cdb29e865>
Date: Tue, 18 Nov 2008 22:23:29 +0000
Subject: [PATCH] 2 Patches from Scott McKellar, with slight modification:

This patch adds a new function buffer_append_uescape(), a streamlined
replacement for uescape().  The optimizations are as follows:

1. As described in an earlier post, the new function appends the uescaped
text to an existing growing_buffer, rather than into an internal one that
it must create and destroy.  It thereby saves two mallocs and two frees.
In addition, the calling code doesn't have to do a buffer_add().

2. Because it doesn't create an internal growing_buffer, it doesn't need
to do a strlen() to determine how big a buffer to allocate.

3. Since the new version doesn't have a boolean parameter, it doesn't have
to test the boolean.

4. For characters < 128 (i.e. ASCII characters), I rearranged the order of
the tests so that non-control characters are recognized immediately.  In
uescape() we first go through a switch/case looking for several specific
control characters like '\b' and '\n'.  In practice most characters are
not control characters, so this rearrangement saves a few CPU cycles.

5. For control characters, uescape() uses buffer_fadd() to format the
character into hex.  Now, buffer_fadd is slow because it uses vsnprintf()
twice, once to determine a buffer size and once to do the formatting.  In
turn vsnprintf() is slow because it has to parse the format string.  In
this case we don't need vsnprintf() because we already know exactly how
big the buffer needs to be, and the formatting is simple.  I eliminated
buffer_fadd() and formatted the hex manually with a little bit-twiddling.

Some of these optimizations could be applied to uescape(), but I haven't
bothered, partly because I wanted a clean comparison for benchmarking
purposes and partly because I expect uescape() to disappear from use
(though I am leaving it available).

=====

This patch is a rewrite of the jsonObjectToJSON and jsonObjectToJSONRaw functions.  It is dependent on my previous patch to utils.c and utils.h,
adding the new buffer_append_uescape function.

One purpose is to replace a call to the uescape function with a call to
the faster buffer_append_uescape function.  The other purpose is to
introduce a faster way to translate a jsonObject into a string.

(Also in one spot I broke up a very long string literal into several
smaller pieces so that it wouldn't wrap around in the editor.)

In the existing jsonObjectToJSON function, we receive a pointer to a
jsonObject and return a string of JSON.  However we don't translate the
original jsonObject directly.  Instead, we create a modified clone of the
original, inserting an additional JSON_HASH node wherever we find a
classname.  Then we translate the clone, and finally destroy it.

It always struck me as an egregious waste to create and destroy a whole
parallel object just so that we could turn it into a string.  So I looked
for a way to eliminate the cloning.

The result is a modification of add_json_to_buffer(), a local function
that recursively traverses and translates the jsonObject.  When it sees a
classname (and it has been asked to expand classnames), the new version
inserts additional gibberish into the output text and then continues the
traversal, without modifying or copying the original jsonObject at all.

In my benchmark, this new approach was faster than the original by a
factor of about 5.  When I combined this change with the use of the new
buffer_append_uencode function, it was faster by a factor of about 7.2.
The benchmark used a moderately complex jsonObject about 5 or 6 levels
deep, with both hashes and arrays, with classnames at several levels.
The performance gain will no doubt depend on the contents of the
jsonObject,but I haven't tried to isolate the variables.

The new version is a bit trickier and harder to read than the old.  In my
opinion the speedup is worth the obscurity, because a lot of places in
Evergreen will benefit.


git-svn-id: svn://svn.open-ils.org/OpenSRF/trunk@1498 9efc2488-bf62-4759-914b-345cdb29e865
---
 src/libopensrf/osrf_json_object.c | 105 +++++++++++++++++++---------
 src/libopensrf/utils.c            | 109 ++++++++++++++++++++++++++++++
 2 files changed, 183 insertions(+), 31 deletions(-)

diff --git a/src/libopensrf/osrf_json_object.c b/src/libopensrf/osrf_json_object.c
index 335d70e..26c3250 100644
--- a/src/libopensrf/osrf_json_object.c
+++ b/src/libopensrf/osrf_json_object.c
@@ -61,7 +61,8 @@ typedef union unusedObjUnion unusedObj;
 
 static unusedObj* freeObjList = NULL;
 
-static void add_json_to_buffer( const jsonObject* obj, growing_buffer * buf );
+static void add_json_to_buffer( const jsonObject* obj,
+	growing_buffer * buf, int do_classname, int second_pass );
 
 /**
  * Return all unused jsonObjects to the heap
@@ -194,7 +195,10 @@ void jsonObjectFree( jsonObject* o ) {
     unusedObjCapture++;
     currentListLen++;
     if (unusedObjCapture > 1 && !(unusedObjCapture % 1000))
-        osrfLogDebug( OSRF_LOG_MARK, "Objects malloc()'d: %d, Reusable objects captured: %d, Objects reused: %d, Current List Length: %d", mallocObjCreate, unusedObjCapture, unusedObjRelease, currentListLen );
+        osrfLogDebug( OSRF_LOG_MARK, "Objects malloc()'d: %d, "
+			"Reusable objects captured: %d, Objects reused: %d, "
+			"Current List Length: %d",
+			mallocObjCreate, unusedObjCapture, unusedObjRelease, currentListLen );
 }
 
 static void _jsonFreeHashItem(char* key, void* item){
@@ -256,21 +260,49 @@ const jsonObject* jsonObjectGetKeyConst( const jsonObject* obj, const char* key
 	return osrfHashGet( obj->value.h, key);
 }
 
-char* jsonObjectToJSON( const jsonObject* obj ) {
-	jsonObject* obj2 = jsonObjectEncodeClass( obj );
-	char* json = jsonObjectToJSONRaw(obj2);
-	jsonObjectFree(obj2);
-	return json;
-}
+/**
+ * Recursively traverse a jsonObject, formatting it into a JSON string.
+ *
+ * The last two parameters are booleans.
+ *
+ * If do_classname is true, examine each node for a classname, and if you
+ * find one, pretend that the node is under an extra layer of JSON_HASH, with
+ * JSON_CLASS_KEY and JSON_DATA_KEY as keys.
+ *
+ * second_pass should always be false except for some recursive calls.  It
+ * is used when expanding classnames, to distinguish between the first and
+ * second passes through a given node.
+ *
+ * @return Nothing
+ */
+static void add_json_to_buffer( const jsonObject* obj,
+	growing_buffer * buf, int do_classname, int second_pass ) {
 
-char* jsonObjectToJSONRaw( const jsonObject* obj ) {
-	if(!obj) return NULL;
-	growing_buffer* buf = buffer_init(32);
-	add_json_to_buffer( obj, buf );
-	return buffer_release( buf );
-}
+    if(NULL == obj) {
+        OSRF_BUFFER_ADD(buf, "null");
+        return;
+    }
 
-static void add_json_to_buffer( const jsonObject* obj, growing_buffer * buf ) {
+	if( obj->classname && do_classname )
+	{
+		if( second_pass )
+			second_pass = 0;
+		else
+		{
+			// Pretend we see an extra layer of JSON_HASH
+			
+			OSRF_BUFFER_ADD( buf, "{\"" );
+			OSRF_BUFFER_ADD( buf, JSON_CLASS_KEY );
+			OSRF_BUFFER_ADD( buf, "\":\"" );
+			OSRF_BUFFER_ADD( buf, obj->classname );
+			OSRF_BUFFER_ADD( buf, "\",\"" );
+			OSRF_BUFFER_ADD( buf, JSON_DATA_KEY );
+			OSRF_BUFFER_ADD( buf, "\":" );
+			add_json_to_buffer( obj, buf, 1, 1 );
+			buffer_add_char( buf, '}' );
+			return;
+		}
+	}
 
 	switch(obj->type) {
 
@@ -279,11 +311,11 @@ static void add_json_to_buffer( const jsonObject* obj, growing_buffer * buf ) {
 			else OSRF_BUFFER_ADD(buf, "false"); 
 			break;
 
-		case JSON_NUMBER: {
-			if(obj->value.s) OSRF_BUFFER_ADD( buf, obj->value.s );
-			else OSRF_BUFFER_ADD_CHAR( buf, '0' );
-			break;
-		}
+        case JSON_NUMBER: {
+            if(obj->value.s) OSRF_BUFFER_ADD( buf, obj->value.s );
+            else OSRF_BUFFER_ADD_CHAR( buf, '0' );
+            break;
+        }
 
 		case JSON_NULL:
 			OSRF_BUFFER_ADD(buf, "null");
@@ -291,12 +323,7 @@ static void add_json_to_buffer( const jsonObject* obj, growing_buffer * buf ) {
 
 		case JSON_STRING:
 			OSRF_BUFFER_ADD_CHAR(buf, '"');
-			char* data = obj->value.s;
-			int len = strlen(data);
-			
-			char* output = uescape(data, len, 1);
-			OSRF_BUFFER_ADD(buf, output);
-			free(output);
+			buffer_append_uescape(buf, obj->value.s);
 			OSRF_BUFFER_ADD_CHAR(buf, '"');
 			break;
 			
@@ -306,7 +333,8 @@ static void add_json_to_buffer( const jsonObject* obj, growing_buffer * buf ) {
 				int i;
 				for( i = 0; i != obj->value.l->size; i++ ) {
 					if(i > 0) OSRF_BUFFER_ADD(buf, ",");
-					add_json_to_buffer( OSRF_LIST_GET_INDEX(obj->value.l, i), buf );
+					add_json_to_buffer(
+						OSRF_LIST_GET_INDEX(obj->value.l, i), buf, do_classname, second_pass );
 				}
 			}
 			OSRF_BUFFER_ADD_CHAR(buf, ']');
@@ -314,16 +342,18 @@ static void add_json_to_buffer( const jsonObject* obj, growing_buffer * buf ) {
 		}
 
 		case JSON_HASH: {
-
+	
 			OSRF_BUFFER_ADD_CHAR(buf, '{');
 			osrfHashIterator* itr = osrfNewHashIterator(obj->value.h);
 			jsonObject* item;
 			int i = 0;
 
 			while( (item = osrfHashIteratorNext(itr)) ) {
-				if(i++ > 0) OSRF_BUFFER_ADD(buf, ",");
-				buffer_fadd(buf, "\"%s\":", osrfHashIteratorKey(itr));
-				add_json_to_buffer( item, buf );
+				if(i++ > 0) OSRF_BUFFER_ADD_CHAR(buf, ',');
+				OSRF_BUFFER_ADD_CHAR(buf, '"');
+				OSRF_BUFFER_ADD(buf, osrfHashIteratorKey(itr));
+				OSRF_BUFFER_ADD(buf, "\":");
+				add_json_to_buffer( item, buf, do_classname, second_pass );
 			}
 
 			osrfHashIteratorFree(itr);
@@ -333,6 +363,19 @@ static void add_json_to_buffer( const jsonObject* obj, growing_buffer * buf ) {
 	}
 }
 
+char* jsonObjectToJSONRaw( const jsonObject* obj ) {
+	if(!obj) return NULL;
+	growing_buffer* buf = buffer_init(32);
+	add_json_to_buffer( obj, buf, 0, 0 );
+	return buffer_release( buf );
+}
+
+char* jsonObjectToJSON( const jsonObject* obj ) {
+	if(!obj) return NULL;
+	growing_buffer* buf = buffer_init(32);
+	add_json_to_buffer( obj, buf, 1, 0 );
+	return buffer_release( buf );
+}
 
 jsonIterator* jsonNewIterator(const jsonObject* obj) {
 	if(!obj) return NULL;
diff --git a/src/libopensrf/utils.c b/src/libopensrf/utils.c
index 778afab..491f168 100644
--- a/src/libopensrf/utils.c
+++ b/src/libopensrf/utils.c
@@ -16,6 +16,9 @@ GNU General Public License for more details.
 #include <opensrf/log.h>
 #include <errno.h>
 
+static const char hex_chars[]   = "0123456789abcdef";
+static unsigned char hex_code[7] = "\\u00";
+
 inline void* safe_malloc( int size ) {
 	void* ptr = (void*) malloc( size );
 	if( ptr == NULL ) {
@@ -424,6 +427,112 @@ char* uescape( const char* string, int size, int full_escape ) {
 	return buffer_release(buf);
 }
 
+int buffer_append_uescape( growing_buffer* buf, const char* string ) {
+
+	if(NULL == string)
+		return 0;       // Nothing to add?  Nothing to do
+
+	if( NULL == buf )
+		return -1;      // Nothing to add to
+
+	int idx = 0;
+	unsigned long int c;
+
+	while (string[idx]) {
+
+		c = 0x0;
+
+		if ((unsigned char)string[idx] >= 0x80) { // not ASCII
+
+			if ((unsigned char)string[idx] >= 0xC0 && (unsigned char)string[idx] <= 0xF4) { // starts a UTF8 string
+
+				int clen = 1;
+				if (((unsigned char)string[idx] & 0xF0) == 0xF0) {
+					clen = 3;
+					c = (unsigned char)string[idx] ^ 0xF0;
+
+				} else if (((unsigned char)string[idx] & 0xE0) == 0xE0) {
+					clen = 2;
+					c = (unsigned char)string[idx] ^ 0xE0;
+
+				} else if (((unsigned char)string[idx] & 0xC0) == 0xC0) {
+					clen = 1;
+					c = (unsigned char)string[idx] ^ 0xC0;
+				}
+
+				for (;clen;clen--) {
+
+					idx++; // look at the next byte
+					c = (c << 6) | ((unsigned char)string[idx] & 0x3F); // add this byte worth
+				}
+
+				buffer_fadd(buf, "\\u%04x", c);
+
+			} else {
+				return idx + 1;
+			}
+
+		} else if(string[idx] >= ' ' ) { // printable ASCII character
+
+			c = string[idx];
+			switch(c) {
+				case '"':
+				case '\\':
+					OSRF_BUFFER_ADD_CHAR(buf, '\\');
+					OSRF_BUFFER_ADD_CHAR(buf, c);
+					break;
+
+                default:
+			        OSRF_BUFFER_ADD_CHAR(buf, c);
+            }
+
+		} else {
+			c = string[idx];
+
+			/* escape the usual suspects */
+			switch(c) {
+				case '\b':
+					OSRF_BUFFER_ADD_CHAR(buf, '\\');
+					OSRF_BUFFER_ADD_CHAR(buf, 'b');
+					break;
+	
+				case '\f':
+					OSRF_BUFFER_ADD_CHAR(buf, '\\');
+					OSRF_BUFFER_ADD_CHAR(buf, 'f');
+					break;
+	
+				case '\t':
+					OSRF_BUFFER_ADD_CHAR(buf, '\\');
+					OSRF_BUFFER_ADD_CHAR(buf, 't');
+					break;
+	
+				case '\n':
+					OSRF_BUFFER_ADD_CHAR(buf, '\\');
+					OSRF_BUFFER_ADD_CHAR(buf, 'n');
+					break;
+	
+				case '\r':
+					OSRF_BUFFER_ADD_CHAR(buf, '\\');
+					OSRF_BUFFER_ADD_CHAR(buf, 'r');
+					break;
+
+				default:
+				{
+					// Represent as \u followed by four hex characters
+					hex_code[ 4 ] = hex_chars[ c >> 4 ];    // high nybble
+					hex_code[ 5 ] = hex_chars[ c & 0x0F ];  // low nybble
+					hex_code[ 6 ] = '\0';
+					OSRF_BUFFER_ADD(buf, (char *) hex_code);
+					break;
+				}
+			}
+		}
+
+		idx++;
+	}
+
+	return 0;
+}
 
 // A function to turn a process into a daemon 
 int daemonize( void ) {
-- 
2.43.2