1 /*----------------------------------------------------
2 Desc : functions and macros for processing UTF-8
3 Author : Scott McKellar
4 Notes : Translate UTF-8 text to a JSON string
6 Copyright 2008 Scott McKellar
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the
21 Free Software Foundation, Inc.,
22 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
25 ---------- -------------------------------------------------
26 2008/11/20 Initial creation
27 2008/11/27 Emit surrogate pairs for code points > 0xFFFF
28 ----------------------------------------------------------*/
29 #include <opensrf/utils.h>
30 #include <opensrf/osrf_utf8.h>
32 static void append_surrogate_pair(growing_buffer * buf, unsigned long code_point);
33 static void append_uxxxx(growing_buffer * buf, unsigned long i);
/*
 Classification table for every possible byte value; the index is the
 byte itself (0x00-0xFF).  Each entry is a set of flag bits that the
 is__utf8__*() functions in this file test against the UTF8_* masks.

 Values used:
   193  ASCII control character       (0x00-0x1F, 0x7F)
   194  printable ASCII               (0x20-0x7E)
   132  UTF-8 continuation byte       (0x80-0xBF)
   200  header of a 2-byte character  (0xC2-0xDF)
   208  header of a 3-byte character  (0xE0-0xEF)
   224  header of a 4-byte character  (0xF0-0xF4)
     0  never valid in UTF-8          (0xC0, 0xC1, 0xF5-0xFF)
*/
unsigned char osrf_utf8_mask_[] =
{
	/* 0x00-0x07 */ 193, 193, 193, 193, 193, 193, 193, 193,
	/* 0x08-0x0F */ 193, 193, 193, 193, 193, 193, 193, 193,
	/* 0x10-0x17 */ 193, 193, 193, 193, 193, 193, 193, 193,
	/* 0x18-0x1F */ 193, 193, 193, 193, 193, 193, 193, 193,
	/* 0x20-0x27 */ 194, 194, 194, 194, 194, 194, 194, 194,
	/* 0x28-0x2F */ 194, 194, 194, 194, 194, 194, 194, 194,
	/* 0x30-0x37 */ 194, 194, 194, 194, 194, 194, 194, 194,
	/* 0x38-0x3F */ 194, 194, 194, 194, 194, 194, 194, 194,
	/* 0x40-0x47 */ 194, 194, 194, 194, 194, 194, 194, 194,
	/* 0x48-0x4F */ 194, 194, 194, 194, 194, 194, 194, 194,
	/* 0x50-0x57 */ 194, 194, 194, 194, 194, 194, 194, 194,
	/* 0x58-0x5F */ 194, 194, 194, 194, 194, 194, 194, 194,
	/* 0x60-0x67 */ 194, 194, 194, 194, 194, 194, 194, 194,
	/* 0x68-0x6F */ 194, 194, 194, 194, 194, 194, 194, 194,
	/* 0x70-0x77 */ 194, 194, 194, 194, 194, 194, 194, 194,
	/* 0x78-0x7F */ 194, 194, 194, 194, 194, 194, 194, 193,   /* 0x7F (DEL) is a control character */
	/* 0x80-0x87 */ 132, 132, 132, 132, 132, 132, 132, 132,
	/* 0x88-0x8F */ 132, 132, 132, 132, 132, 132, 132, 132,
	/* 0x90-0x97 */ 132, 132, 132, 132, 132, 132, 132, 132,
	/* 0x98-0x9F */ 132, 132, 132, 132, 132, 132, 132, 132,
	/* 0xA0-0xA7 */ 132, 132, 132, 132, 132, 132, 132, 132,
	/* 0xA8-0xAF */ 132, 132, 132, 132, 132, 132, 132, 132,
	/* 0xB0-0xB7 */ 132, 132, 132, 132, 132, 132, 132, 132,
	/* 0xB8-0xBF */ 132, 132, 132, 132, 132, 132, 132, 132,
	/* 0xC0-0xC7 */   0,   0, 200, 200, 200, 200, 200, 200,   /* 0xC0 and 0xC1 are invalid */
	/* 0xC8-0xCF */ 200, 200, 200, 200, 200, 200, 200, 200,
	/* 0xD0-0xD7 */ 200, 200, 200, 200, 200, 200, 200, 200,
	/* 0xD8-0xDF */ 200, 200, 200, 200, 200, 200, 200, 200,
	/* 0xE0-0xE7 */ 208, 208, 208, 208, 208, 208, 208, 208,
	/* 0xE8-0xEF */ 208, 208, 208, 208, 208, 208, 208, 208,
	/* 0xF0-0xF7 */ 224, 224, 224, 224, 224,   0,   0,   0,   /* 0xF5 and above are invalid */
	/* 0xF8-0xFF */   0,   0,   0,   0,   0,   0,   0,   0
};
295 // Functions equivalent to the corresponding macros, for cases
296 // where you need a function pointer
298 int is__utf8__control( int c ) {
299 return osrf_utf8_mask_[ c & 0xFF ] & UTF8_CONTROL;
302 int is__utf8__print( int c ) {
303 return osrf_utf8_mask_[ c & 0xFF ] & UTF8_PRINT;
306 int is__utf8__continue( int c ) {
307 return osrf_utf8_mask_[ c & 0xFF ] & UTF8_CONTINUE;
310 int is__utf8__2_byte( int c ) {
311 return osrf_utf8_mask_[ c & 0xFF ] & UTF8_2_BYTE;
314 int is__utf8__3_byte( int c ) {
315 return osrf_utf8_mask_[ c & 0xFF ] & UTF8_3_BYTE;
318 int is__utf8__4_byte( int c ) {
319 return osrf_utf8_mask_[ c & 0xFF ] & UTF8_4_BYTE;
322 int is__utf8__sync( int c ) {
323 return osrf_utf8_mask_[ c & 0xFF ] & UTF8_SYNC;
326 int is__utf8( int c ) {
327 return osrf_utf8_mask_[ c & 0xFF ] & UTF8_VALID;
/*
 States of the parser in buffer_append_utf8().  Each state records
 which byte of a UTF-8 sequence the parser expects to see next.
*/
typedef enum {
	S_BEGIN,    // Expecting nothing in particular
	S_2_OF_2,   // Expecting second of 2-byte character
	S_2_OF_3,   // Expecting second of 3-byte character
	S_3_OF_3,   // Expecting third of 3-byte character
	S_2_OF_4,   // Expecting second of 4-byte character
	S_3_OF_4,   // Expecting third of 4-byte character
	S_4_OF_4,   // Expecting fourth of 4-byte character
	S_ERROR,    // Looking for a valid byte to resync with
	S_END       // Found a terminal nul
} utf8_state;
/*
 Translate a UTF-8 input string into properly escaped text suitable
 for a JSON string -- including escaped hex values and surrogate
 pairs where needed.  Append the result to a growing_buffer.
*/
/*
 buffer_append_utf8: walk a nul-terminated UTF-8 string with a small state
 machine (see utf8_state) and append its JSON-escaped form to buf.

 buf     growing_buffer that receives the escaped text.
 string  Input text; it is read as unsigned bytes up to the terminating nul.

 Bytes below 0x80 are copied or escaped directly.  A multibyte header byte
 seeds utf8_char with its payload bits, each continuation byte shifts in six
 more bits, and the finished code point is emitted as \uxxxx, or as a
 surrogate pair when it is above 0xFFFF.

 NOTE(review): this listing has lost lines -- the declaration of i, the loop
 and switch framing, several case labels, the state transitions taken after
 an error and the return statement are not visible here.  The meaning of the
 int return value therefore cannot be confirmed from this text; check the
 original source before relying on it.
 NOTE(review): no check for code points above 0x10FFFF, for overlong forms or
 for encoded surrogates is visible in the lines shown -- confirm.
*/
347 int buffer_append_utf8( growing_buffer* buf, const char* string ) {
348 utf8_state state = S_BEGIN;
349 unsigned long utf8_char = 0; // Code point being assembled from a multibyte sequence
350 const unsigned char* s = (unsigned char *) string; // Examine bytes as unsigned so that values of 0x80 and above compare as expected
// Presumably the S_BEGIN case (label not visible): consume a run of ASCII bytes
360 while( s[i] && (s[i] < 0x80) ) { // Handle ASCII
361 if( is_utf8_print( s[i] ) ) { // Printable
// NOTE(review): the case labels selecting which printable characters get the
// backslash prefix are not visible -- presumably '"' and '\\'; confirm.
366 OSRF_BUFFER_ADD_CHAR( buf, '\\' );
368 OSRF_BUFFER_ADD_CHAR( buf, s[i] );
371 } else if( s[i] ) { // Control character
373 switch( s[i] ) // Escape some
// Two-character escapes \n, \t, \r, \f and \b (case labels not visible)
376 OSRF_BUFFER_ADD_CHAR( buf, '\\' );
377 OSRF_BUFFER_ADD_CHAR( buf, 'n' );
380 OSRF_BUFFER_ADD_CHAR( buf, '\\' );
381 OSRF_BUFFER_ADD_CHAR( buf, 't' );
384 OSRF_BUFFER_ADD_CHAR( buf, '\\' );
385 OSRF_BUFFER_ADD_CHAR( buf, 'r' );
388 OSRF_BUFFER_ADD_CHAR( buf, '\\' );
389 OSRF_BUFFER_ADD_CHAR( buf, 'f' );
392 OSRF_BUFFER_ADD_CHAR( buf, '\\' );
393 OSRF_BUFFER_ADD_CHAR( buf, 'b' );
395 default : { // Format the rest in hex
396 append_uxxxx(buf, s[i]);
404 // If the next byte is the first of a multibyte sequence, we zero out
405 // the length bits and store the rest.
409 else if( 128 > s[i] )
411 else if( is_utf8_2_byte( s[i] ) ) {
412 utf8_char = s[i] ^ 0xC0; // Clear the 110 prefix, keeping the payload bits
413 state = S_2_OF_2; // Expect 1 continuation byte
414 } else if( is_utf8_3_byte( s[i] ) ) {
415 utf8_char = s[i] ^ 0xE0; // Clear the 1110 prefix, keeping the payload bits
416 state = S_2_OF_3; // Expect 2 continuation bytes
417 } else if( is_utf8_4_byte( s[i] ) ) {
418 utf8_char = s[i] ^ 0xF0; // Clear the 11110 prefix, keeping the payload bits
419 state = S_2_OF_4; // Expect 3 continuation bytes
428 case S_2_OF_2 : // Expect second byte of 2-byte character
429 if( is_utf8_continue( s[i] ) ) { // Append lower 6 bits
430 utf8_char = (utf8_char << 6) | (s[i] & 0x3F);
431 append_uxxxx(buf, utf8_char); // Sequence complete: emit the code point
434 } else if( '\0' == s[i] ) { // Unexpected end of string
438 } else { // Non-continuation character
// Case label not visible; by position, presumably S_2_OF_3
445 if( is_utf8_continue( s[i] ) ) { // Append lower 6 bits
446 utf8_char = (utf8_char << 6) | (s[i] & 0x3F);
449 } else if( '\0' == s[i] ) { // Unexpected end of string
453 } else { // Non-continuation character
// Case label not visible; by position, presumably S_3_OF_3
460 if( is_utf8_continue( s[i] ) ) { // Append lower 6 bits
461 utf8_char = (utf8_char << 6) | (s[i] & 0x3F);
462 if(utf8_char > 0xFFFF ) // Beyond the range a single \uxxxx can express
463 append_surrogate_pair(buf, utf8_char);
465 append_uxxxx(buf, utf8_char);
468 } else if( '\0' == s[i] ) { // Unexpected end of string
472 } else { // Non-continuation character
// Case label not visible; by position, presumably S_2_OF_4
479 if( is_utf8_continue( s[i] ) ) { // Append lower 6 bits
480 utf8_char = (utf8_char << 6) | (s[i] & 0x3F);
483 } else if( '\0' == s[i] ) { // Unexpected end of string
487 } else { // Non-continuation character
// Case label not visible; by position, presumably S_3_OF_4
494 if( is_utf8_continue( s[i] ) ) { // Append lower 6 bits
495 utf8_char = (utf8_char << 6) | (s[i] & 0x3F);
498 } else if( '\0' == s[i] ) { // Unexpected end of string
502 } else { // Non-continuation character
// Case label not visible; by position, presumably S_4_OF_4
509 if( is_utf8_continue( s[i] ) ) { // Append lower 6 bits
510 utf8_char = (utf8_char << 6) | (s[i] & 0x3F);
511 if(utf8_char > 0xFFFF ) // Beyond the range a single \uxxxx can express
512 append_surrogate_pair(buf, utf8_char);
514 append_uxxxx(buf, utf8_char);
517 } else if( '\0' == s[i] ) { // Unexpected end of string
521 } else { // Non-continuation character
// Presumably the S_ERROR case (label not visible): wait for a byte that can start a character
530 else if( is_utf8_sync( s[i] ) )
531 state = S_BEGIN; // Resume translation
540 } while ( state != S_END ); // Keep going until the terminating nul has been seen
546 Break a code point up into two pieces, and format each piec
547 in hex. as a surrogate pair. Append the results to a growing_buffer.
549 This code is loosely based on a code snippet at:
550 http://www.unicode.org/faq/utf_bom.html
551 It isn't obvious how, why, or whether it works.
553 static void append_surrogate_pair(growing_buffer * buf, unsigned long code_point) {
554 unsigned int hi; // high surrogate
555 unsigned int low; // low surrogate
557 hi = 0xD7C0 + (code_point >> 10);
558 append_uxxxx(buf, hi);
560 low = 0xDC00 + (code_point & 0x3FF);
561 append_uxxxx(buf, low);
565 Format the lower 16 bits of an unsigned long in hex,
566 in the format "\uxxxx" where each x is a hex digit.
567 Append the result to a growing_buffer.
569 static void append_uxxxx( growing_buffer * buf, unsigned long i ) {
570 static const char hex_chars[] = "0123456789abcdef";
571 char hex_buf[7] = "\\u";
573 hex_buf[2] = hex_chars[ (i >> 12) & 0x000F ];
574 hex_buf[3] = hex_chars[ (i >> 8) & 0x000F ];
575 hex_buf[4] = hex_chars[ (i >> 4) & 0x000F ];
576 hex_buf[5] = hex_chars[ i & 0x000F ];
579 OSRF_BUFFER_ADD(buf, hex_buf);