321d3e67a1d78ed9c5ccf05c8a4353da2c30ef41
[OpenSRF.git] / src / libopensrf / osrf_utf8.c
1 /*----------------------------------------------------
2  Desc    : functions and macros for processing UTF-8
3  Author  : Scott McKellar
4  Notes   : Translate UTF-8 text to a JSON string
5
6  Copyright 2008 Scott McKellar
7  All Rights reserved
8
9  This program is free software; you can redistribute it and/or modify
10  it under the terms of the GNU General Public License as published by
11  the Free Software Foundation; either version 2 of the License, or
12  (at your option) any later version.
13
14  This program is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  GNU General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with this program; if not, write to the
21  Free Software Foundation, Inc.,
22  59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
23
24 Date       Change
25  ---------- -------------------------------------------------
26  2008/11/20 Initial creation
27  2008/11/27 Emit surrogate pairs for code points > 0xFFFF
28  ----------------------------------------------------------*/
29 #include <opensrf/utils.h>
30 #include <opensrf/osrf_utf8.h>
31
32 static void append_surrogate_pair(growing_buffer * buf, unsigned long code_point);
33 static void append_uxxxx(growing_buffer * buf, unsigned long i);
34
/*
 Classification table for all 256 possible byte values, indexed by the
 byte itself.  Each entry is a set of bit flags; a zero entry marks a
 byte that may never appear in UTF-8.

 The flag names are defined in a header not shown here.  Judging from the
 values below and from the is__utf8__* accessors, the bits appear to be
 (TODO confirm against the UTF8_* definitions):

   0x01  control character         0x10  header of 3-byte character
   0x02  printable ASCII           0x20  header of 4-byte character
   0x04  continuation byte         0x40  sync point (ASCII or header byte)
   0x08  header of 2-byte char     0x80  valid somewhere in UTF-8
*/
unsigned char osrf_utf8_mask_[] =
{
	/* 0x00 - 0x1F : control characters */
	0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1,
	0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1,
	0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1,
	0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1,

	/* 0x20 - 0x7E : printable ASCII */
	0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2,
	0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2,
	0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2,
	0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2,
	0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2,
	0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2,
	0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2,
	0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2,
	0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2,
	0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2,
	0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2,
	/* 0x78 - 0x7E printable, then 0x7F (DEL) : control character */
	0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC2, 0xC1,

	/* 0x80 - 0xBF : UTF-8 continuation bytes */
	0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,
	0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,
	0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,
	0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,
	0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,
	0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,
	0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,
	0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,

	/* 0xC0 - 0xC1 : invalid;  0xC2 - 0xDF : header of 2-byte character */
	0x00, 0x00, 0xC8, 0xC8, 0xC8, 0xC8, 0xC8, 0xC8,
	0xC8, 0xC8, 0xC8, 0xC8, 0xC8, 0xC8, 0xC8, 0xC8,
	0xC8, 0xC8, 0xC8, 0xC8, 0xC8, 0xC8, 0xC8, 0xC8,
	0xC8, 0xC8, 0xC8, 0xC8, 0xC8, 0xC8, 0xC8, 0xC8,

	/* 0xE0 - 0xEF : header of 3-byte character */
	0xD0, 0xD0, 0xD0, 0xD0, 0xD0, 0xD0, 0xD0, 0xD0,
	0xD0, 0xD0, 0xD0, 0xD0, 0xD0, 0xD0, 0xD0, 0xD0,

	/* 0xF0 - 0xF4 : header of 4-byte character;  0xF5 - 0xFF : invalid */
	0xE0, 0xE0, 0xE0, 0xE0, 0xE0, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
294
295 // Functions equivalent to the corresponding macros, for cases
296 // where you need a function pointer
297
298 int is__utf8__control( int c ) {
299         return osrf_utf8_mask_[ c & 0xFF ] & UTF8_CONTROL;
300 }
301
302 int is__utf8__print( int c ) {
303         return osrf_utf8_mask_[ c & 0xFF ] & UTF8_PRINT;
304 }
305
306 int is__utf8__continue( int c ) {
307         return osrf_utf8_mask_[ c & 0xFF ] & UTF8_CONTINUE;
308 }
309
310 int is__utf8__2_byte( int c ) {
311         return osrf_utf8_mask_[ c & 0xFF ] & UTF8_2_BYTE;
312 }
313
314 int is__utf8__3_byte( int c ) {
315         return osrf_utf8_mask_[ c & 0xFF ] & UTF8_3_BYTE;
316 }
317
318 int is__utf8__4_byte( int c ) {
319         return osrf_utf8_mask_[ c & 0xFF ] & UTF8_4_BYTE;
320 }
321
322 int is__utf8__sync( int c ) {
323         return osrf_utf8_mask_[ c & 0xFF ] & UTF8_SYNC;
324 }
325
326 int is__utf8( int c ) {
327         return osrf_utf8_mask_[ c & 0xFF ] & UTF8_VALID;
328 }
329
/*
 States of the byte-by-byte decoder in buffer_append_utf8().
 The S_x_OF_y states mean "the next byte should be byte x of a
 y-byte character".
*/
typedef enum {
	S_BEGIN = 0,  /* Between characters; no particular byte expected */
	S_2_OF_2,     /* Awaiting the 2nd byte of a 2-byte character */
	S_2_OF_3,     /* Awaiting the 2nd byte of a 3-byte character */
	S_3_OF_3,     /* Awaiting the 3rd byte of a 3-byte character */
	S_2_OF_4,     /* Awaiting the 2nd byte of a 4-byte character */
	S_3_OF_4,     /* Awaiting the 3rd byte of a 4-byte character */
	S_4_OF_4,     /* Awaiting the 4th byte of a 4-byte character */
	S_ERROR,      /* Skipping input until a byte we can resync on */
	S_END         /* Terminal nul reached; decoding is finished */
} utf8_state;
341
342 /**
343  Translate a UTF-8 input string into properly escaped text suitable
344  for a JSON string -- including escaped hex values and surrogate
345  pairs  where needed.  Append the result to a growing_buffer.
346 */
347 int buffer_append_utf8( growing_buffer* buf, const char* string ) {
348         utf8_state state = S_BEGIN;
349         unsigned long utf8_char;
350         const unsigned char* s = (unsigned char *) string;
351         int i = 0;
352         int rc = 0;
353
354         do
355         {
356                 switch( state )
357                 {
358                         case S_BEGIN :
359
360                                 while( s[i] && (s[i] < 0x80) ) {    // Handle ASCII
361                                         if( is_utf8_print( s[i] ) ) {   // Printable
362                                                 switch( s[i] )
363                                                 {
364                                                         case '"' :
365                                                         case '\\' :
366                                                                 OSRF_BUFFER_ADD_CHAR( buf, '\\' );
367                                                         default :
368                                                                 OSRF_BUFFER_ADD_CHAR( buf, s[i] );
369                                                                 break;
370                                                 }
371                                         } else if( s[i] ) {   // Control character
372
373                                                 switch( s[i] )    // Escape some
374                                                 {
375                                                         case '\n' :
376                                                                 OSRF_BUFFER_ADD_CHAR( buf, '\\' );
377                                                                 OSRF_BUFFER_ADD_CHAR( buf, 'n' );
378                                                                 break;
379                                                         case '\t' :
380                                                                 OSRF_BUFFER_ADD_CHAR( buf, '\\' );
381                                                                 OSRF_BUFFER_ADD_CHAR( buf, 't' );
382                                                                 break;
383                                                         case '\r' :
384                                                                 OSRF_BUFFER_ADD_CHAR( buf, '\\' );
385                                                                 OSRF_BUFFER_ADD_CHAR( buf, 'r' );
386                                                                 break;
387                                                         case '\f' :
388                                                                 OSRF_BUFFER_ADD_CHAR( buf, '\\' );
389                                                                 OSRF_BUFFER_ADD_CHAR( buf, 'f' );
390                                                                 break;
391                                                         case '\b' :
392                                                                 OSRF_BUFFER_ADD_CHAR( buf, '\\' );
393                                                                 OSRF_BUFFER_ADD_CHAR( buf, 'b' );
394                                                                 break;
395                                                         default : {   // Format the rest in hex
396                                                                 append_uxxxx(buf, s[i]);
397                                                                 break;
398                                                         }
399                                                 }
400                                         }
401                                         ++i;
402                                 }
403
404                                 // If the next byte is the first of a multibyte sequence, we zero out
405                                 // the length bits and store the rest.
406                                 
407                                 if( '\0' == s[i] )
408                                         state = S_END;
409                                 else if( 128 > s[i] )
410                                         state = S_BEGIN;
411                                 else if( is_utf8_2_byte( s[i] ) ) {
412                                         utf8_char = s[i] ^ 0xC0;
413                                         state = S_2_OF_2;   // Expect 1 continuation byte
414                                 } else if( is_utf8_3_byte( s[i] ) ) {
415                                         utf8_char = s[i] ^ 0xE0;
416                                         state = S_2_OF_3;   // Expect 2 continuation bytes
417                                 } else if( is_utf8_4_byte( s[i] ) ) {
418                                         utf8_char = s[i] ^ 0xF0;
419                                         state = S_2_OF_4;   // Expect 3 continuation bytes
420                                 } else {
421                                         if( 0 == rc )
422                                                 rc = i;
423                                         state = S_ERROR;
424                                 }
425                                 
426                                 ++i;
427                                 break;
428                         case S_2_OF_2 :  //Expect second byte of 1-byte character
429                                 if( is_utf8_continue( s[i] ) ) {  // Append lower 6 bits
430                                         utf8_char = (utf8_char << 6) | (s[i] & 0x3F);
431                                         append_uxxxx(buf, utf8_char);
432                                         state = S_BEGIN;
433                                         ++i;
434                                 } else if( '\0' == s[i] ) {  // Unexpected end of string
435                                         if( 0 == rc )
436                                                 rc = i;
437                                         state = S_END;
438                                 } else {   // Non-continuation character
439                                         if( 0 == rc )
440                                                 rc = i;
441                                         state = S_BEGIN;
442                                 }
443                                 break;
444                         case S_2_OF_3 :
445                                 if( is_utf8_continue( s[i] ) ) {  // Append lower 6 bits
446                                         utf8_char = (utf8_char << 6) | (s[i] & 0x3F);
447                                         state = S_3_OF_3;
448                                         ++i;
449                                 } else if( '\0' == s[i] ) {  // Unexpected end of string
450                                         if( 0 == rc )
451                                                 rc = i;
452                                         state = S_END;
453                                 } else {   // Non-continuation character
454                                         if( 0 == rc )
455                                                 rc = i;
456                                         state = S_BEGIN;
457                                 }
458                                 break;
459                         case S_3_OF_3 :
460                                 if( is_utf8_continue( s[i] ) ) {  // Append lower 6 bits
461                                         utf8_char = (utf8_char << 6) | (s[i] & 0x3F);
462                                         if(utf8_char > 0xFFFF )
463                                                 append_surrogate_pair(buf, utf8_char);
464                                         else
465                                                 append_uxxxx(buf, utf8_char);
466                                         state = S_BEGIN;
467                                         ++i;
468                                 } else if( '\0' == s[i] ) {  // Unexpected end of string
469                                         if( 0 == rc )
470                                                 rc = i;
471                                         state = S_END;
472                                 } else {   // Non-continuation character
473                                         if( 0 == rc )
474                                                 rc = i;
475                                         state = S_BEGIN;
476                                 }
477                                 break;
478                         case S_2_OF_4 :
479                                 if( is_utf8_continue( s[i] ) ) {  // Append lower 6 bits
480                                         utf8_char = (utf8_char << 6) | (s[i] & 0x3F);
481                                         state = S_3_OF_4;
482                                         ++i;
483                                 } else if( '\0' == s[i] ) {  // Unexpected end of string
484                                         if( 0 == rc )
485                                                 rc = i;
486                                         state = S_END;
487                                 } else {   // Non-continuation character
488                                         if( 0 == rc )
489                                                 rc = i;
490                                         state = S_BEGIN;
491                                 }
492                                 break;
493                         case S_3_OF_4 :
494                                 if( is_utf8_continue( s[i] ) ) {  // Append lower 6 bits
495                                         utf8_char = (utf8_char << 6) | (s[i] & 0x3F);
496                                         state = S_4_OF_4;
497                                         ++i;
498                                 } else if( '\0' == s[i] ) {  // Unexpected end of string
499                                         if( 0 == rc )
500                                                 rc = i;
501                                         state = S_END;
502                                 } else {   // Non-continuation character
503                                         if( 0 == rc )
504                                                 rc = i;
505                                         state = S_BEGIN;
506                                 }
507                                 break;
508                         case S_4_OF_4 :
509                                 if( is_utf8_continue( s[i] ) ) {  // Append lower 6 bits
510                                         utf8_char = (utf8_char << 6) | (s[i] & 0x3F);
511                                         if(utf8_char > 0xFFFF )
512                                                 append_surrogate_pair(buf, utf8_char);
513                                         else
514                                                 append_uxxxx(buf, utf8_char);
515                                         state = S_BEGIN;
516                                         ++i;
517                                 } else if( '\0' == s[i] ) {  // Unexpected end of string
518                                         if( 0 == rc )
519                                                 rc = i;
520                                         state = S_END;
521                                 } else {   // Non-continuation character
522                                         if( 0 == rc )
523                                                 rc = i;
524                                         state = S_BEGIN;
525                                 }
526                                 break;
527                         case S_ERROR :
528                                 if( '\0' == s[i] )
529                                         state = S_END;
530                                 else if( is_utf8_sync( s[i] ) )
531                                         state = S_BEGIN;  // Resume translation
532                                 else
533                                         ++i;
534
535                                 break;
536                         default :
537                                 state = S_END;
538                                 break;
539                 }
540         } while ( state != S_END );
541         
542         return rc;
543 }
544
545 /**
546  Break a code point up into two pieces, and format each piec
547  in hex. as a surrogate pair.  Append the results to a growing_buffer.
548
549  This code is loosely based on a code snippet at:
550  http://www.unicode.org/faq/utf_bom.html
551  It isn't obvious how, why, or whether it works.
552 */
553 static void append_surrogate_pair(growing_buffer * buf, unsigned long code_point) {
554         unsigned int hi;   // high surrogate
555         unsigned int low;  // low surrogate
556
557         hi = 0xD7C0 + (code_point >> 10);
558         append_uxxxx(buf, hi);
559
560         low = 0xDC00 + (code_point & 0x3FF);
561         append_uxxxx(buf, low);
562 }
563
564 /**
565  Format the lower 16 bits of an unsigned long in hex,
566  in the format "\uxxxx" where each x is a hex digit.
567  Append the result to a growing_buffer.
568 */
569 static void append_uxxxx( growing_buffer * buf, unsigned long i ) {
570         static const char hex_chars[] = "0123456789abcdef";
571         char hex_buf[7] = "\\u";
572
573         hex_buf[2] = hex_chars[ (i >> 12) & 0x000F ];
574         hex_buf[3] = hex_chars[ (i >>  8) & 0x000F ];
575         hex_buf[4] = hex_chars[ (i >>  4) & 0x000F ];
576         hex_buf[5] = hex_chars[ i         & 0x000F ];
577         hex_buf[6] = '\0';
578
579         OSRF_BUFFER_ADD(buf, hex_buf);
580 }