@@ -22,6 +22,11 @@ public class CBORGenerator extends GeneratorBase
22
22
{
23
23
private final static int [] NO_INTS = new int [0 ];
24
24
25
+ /**
26
+ * The replacement character (U+FFFD) written in place of invalid Unicode surrogate sequences.
27
+ */
28
+ final static int REPLACEMENT_CHAR = 0xfffd ;
29
+
25
30
/**
26
31
* Let's ensure that we have big enough output buffer because of safety
27
32
* margins we need for UTF-8 encoding.
@@ -61,7 +66,14 @@ public enum Feature implements FormatFeature {
61
66
* Default value is <code>false</code> meaning that type tag will not be
62
67
* written at the beginning of a new document.
63
68
*/
64
- WRITE_TYPE_HEADER (false )
69
+ WRITE_TYPE_HEADER (false ),
70
+
71
+ /**
72
+ * Feature that determines if an invalid surrogate encoding found in the
73
+ * incoming String should fail with an exception or silently be output
74
+ * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD)
75
+ */
76
+ LENIENT_UTF_ENCODING (false ),
65
77
66
78
;
67
79
@@ -138,6 +150,8 @@ public int getMask() {
138
150
139
151
protected boolean _cfgMinimalInts ;
140
152
153
+ protected boolean _cfgLenientUnicodeEncoding ;
154
+
141
155
/*
142
156
/**********************************************************************
143
157
/* Output state
@@ -231,6 +245,7 @@ public CBORGenerator(ObjectWriteContext writeCtxt, IOContext ctxt,
231
245
: null ;
232
246
_tokenWriteContext = CBORWriteContext .createRootContext (dups );
233
247
_cfgMinimalInts = Feature .WRITE_MINIMAL_INTS .enabledIn (formatFeatures );
248
+ _cfgLenientUnicodeEncoding = Feature .LENIENT_UTF_ENCODING .enabledIn (formatFeatures );
234
249
_out = out ;
235
250
_bufferRecyclable = true ;
236
251
_outputBuffer = ctxt .allocWriteEncodingBuffer (BYTE_BUFFER_FOR_OUTPUT );
@@ -357,6 +372,9 @@ public CBORGenerator enable(Feature f) {
357
372
if (f == Feature .WRITE_MINIMAL_INTS ) {
358
373
_cfgMinimalInts = true ;
359
374
}
375
+ if (f == Feature .LENIENT_UTF_ENCODING ) {
376
+ _cfgLenientUnicodeEncoding = true ;
377
+ }
360
378
return this ;
361
379
}
362
380
@@ -365,6 +383,9 @@ public CBORGenerator disable(Feature f) {
365
383
if (f == Feature .WRITE_MINIMAL_INTS ) {
366
384
_cfgMinimalInts = false ;
367
385
}
386
+ if (f == Feature .LENIENT_UTF_ENCODING ) {
387
+ _cfgLenientUnicodeEncoding = false ;
388
+ }
368
389
return this ;
369
390
}
370
391
@@ -1356,81 +1377,33 @@ private final int _encode(int outputPtr, char[] str, int i, int end) {
1356
1377
do {
1357
1378
int c = str [i ];
1358
1379
if (c > 0x7F ) {
1359
- return _shortUTF8Encode2 ( str , i , end , outputPtr , outputStart );
1380
+ return _encode2 ( i , outputPtr , str , end , outputStart );
1360
1381
}
1361
1382
outBuf [outputPtr ++] = (byte ) c ;
1362
1383
} while (++i < end );
1363
1384
return outputPtr - outputStart ;
1364
1385
}
1365
1386
1366
- /**
1367
- * Helper method called when the whole character sequence is known to fit in
1368
- * the output buffer, but not all characters are single-byte (ASCII)
1369
- * characters.
1370
- */
1371
- private final int _shortUTF8Encode2 (char [] str , int i , int end ,
1372
- int outputPtr , int outputStart ) {
1373
- final byte [] outBuf = _outputBuffer ;
1374
- while (i < end ) {
1375
- int c = str [i ++];
1376
- if (c <= 0x7F ) {
1377
- outBuf [outputPtr ++] = (byte ) c ;
1378
- continue ;
1379
- }
1380
- // Nope, multi-byte:
1381
- if (c < 0x800 ) { // 2-byte
1382
- outBuf [outputPtr ++] = (byte ) (0xc0 | (c >> 6 ));
1383
- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1384
- continue ;
1385
- }
1386
- // 3 or 4 bytes (surrogate)
1387
- // Surrogates?
1388
- if (c < SURR1_FIRST || c > SURR2_LAST ) { // nope, regular 3-byte character
1389
- outBuf [outputPtr ++] = (byte ) (0xe0 | (c >> 12 ));
1390
- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1391
- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1392
- continue ;
1393
- }
1394
- // Yup, a surrogate pair
1395
- if (c > SURR1_LAST ) { // must be from first range; second won't do
1396
- _throwIllegalSurrogate (c );
1397
- }
1398
- // ... meaning it must have a pair
1399
- if (i >= end ) {
1400
- _throwIllegalSurrogate (c );
1401
- }
1402
- c = _convertSurrogate (c , str [i ++]);
1403
- if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1404
- _throwIllegalSurrogate (c );
1405
- }
1406
- outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1407
- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1408
- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1409
- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1410
- }
1411
- return (outputPtr - outputStart );
1412
- }
1413
-
1414
1387
private final int _encode (int outputPtr , String str , int len ) {
1415
1388
final byte [] outBuf = _outputBuffer ;
1416
1389
final int outputStart = outputPtr ;
1417
1390
1418
1391
for (int i = 0 ; i < len ; ++i ) {
1419
1392
int c = str .charAt (i );
1420
1393
if (c > 0x7F ) {
1421
- return _encode2 (i , outputPtr , str , len , outputStart );
1394
+ return _encode2 (i , outputPtr , str . toCharArray () , len , outputStart );
1422
1395
}
1423
1396
outBuf [outputPtr ++] = (byte ) c ;
1424
1397
}
1425
1398
return (outputPtr - outputStart );
1426
1399
}
1427
1400
1428
- private final int _encode2 (int i , int outputPtr , String str , int len ,
1401
+ private final int _encode2 (int i , int outputPtr , char [] str , int len ,
1429
1402
int outputStart ) {
1430
1403
final byte [] outBuf = _outputBuffer ;
1431
1404
// no; non-ASCII stuff, slower loop
1432
1405
while (i < len ) {
1433
- int c = str . charAt ( i ++) ;
1406
+ int c = str [ i ++] ;
1434
1407
if (c <= 0x7F ) {
1435
1408
outBuf [outputPtr ++] = (byte ) c ;
1436
1409
continue ;
@@ -1452,20 +1425,43 @@ private final int _encode2(int i, int outputPtr, String str, int len,
1452
1425
}
1453
1426
// Yup, a surrogate pair
1454
1427
if (c > SURR1_LAST ) { // must be from first range; second won't do
1455
- _throwIllegalSurrogate (c );
1428
+ if (_cfgLenientUnicodeEncoding ) {
1429
+ c = REPLACEMENT_CHAR ;
1430
+ } else {
1431
+ _throwIllegalSurrogate (c );
1432
+ }
1456
1433
}
1457
1434
// ... meaning it must have a pair
1458
- if (i >= len ) {
1459
- _throwIllegalSurrogate (c );
1435
+ else if (i >= len ) {
1436
+ if (_cfgLenientUnicodeEncoding ) {
1437
+ c = REPLACEMENT_CHAR ;
1438
+ } else {
1439
+ _throwIllegalSurrogate (c );
1440
+ }
1460
1441
}
1461
- c = _convertSurrogate (c , str .charAt (i ++));
1462
- if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1463
- _throwIllegalSurrogate (c );
1442
+ // ... verify that the next character is in range
1443
+ else if (str [i ] < SURR2_FIRST || str [i ] > SURR2_LAST ) {
1444
+ if (_cfgLenientUnicodeEncoding ) {
1445
+ c = REPLACEMENT_CHAR ;
1446
+ } else {
1447
+ _throwIllegalSurrogatePair (c , str [i ]);
1448
+ }
1449
+ }
1450
+ // ... we have a valid surrogate pair
1451
+ else {
1452
+ c = _convertSurrogate (c , str [i ++]);
1453
+ }
1454
+ // if we replaced by the replacement char we actually have a 3 bytes char
1455
+ if (c == REPLACEMENT_CHAR ) {
1456
+ outBuf [outputPtr ++] = (byte ) (0xe0 | (c >> 12 ));
1457
+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1458
+ outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1459
+ } else {
1460
+ outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1461
+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1462
+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1463
+ outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1464
1464
}
1465
- outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1466
- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1467
- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1468
- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1469
1465
}
1470
1466
return (outputPtr - outputStart );
1471
1467
}
@@ -1474,16 +1470,20 @@ private final int _encode2(int i, int outputPtr, String str, int len,
1474
1470
* Method called to calculate UTF codepoint, from a surrogate pair.
1475
1471
*/
1476
1472
private int _convertSurrogate (int firstPart , int secondPart ) {
1477
- // Ok, then, is the second part valid?
1478
- if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST ) {
1479
- throw new IllegalArgumentException (
1473
+ int c = 0x10000 + ((firstPart - SURR1_FIRST ) << 10 )
1474
+ + (secondPart - SURR2_FIRST );
1475
+ if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1476
+ _throwIllegalSurrogate (c );
1477
+ }
1478
+ return c ;
1479
+ }
1480
+
1481
+ private void _throwIllegalSurrogatePair (int firstPart , int secondPart ) {
1482
+ throw new IllegalArgumentException (
1480
1483
"Broken surrogate pair: first char 0x"
1481
1484
+ Integer .toHexString (firstPart ) + ", second 0x"
1482
1485
+ Integer .toHexString (secondPart )
1483
1486
+ "; illegal combination" );
1484
- }
1485
- return 0x10000 + ((firstPart - SURR1_FIRST ) << 10 )
1486
- + (secondPart - SURR2_FIRST );
1487
1487
}
1488
1488
1489
1489
private void _throwIllegalSurrogate (int code ) {
0 commit comments