Skip to content

Commit fc5aaa5

Browse files
committed
Add a CBORGenerator feature for lenient unicode encoding
If enabled, the generator outputs the Unicode Replacement Character (U+FFFD) in place of each invalid Unicode sequence (an unpaired or mismatched surrogate char in the Java String) instead of failing with an IllegalArgumentException.
1 parent e5f9755 commit fc5aaa5

File tree

3 files changed

+191
-69
lines changed

3 files changed

+191
-69
lines changed

cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java

Lines changed: 69 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ public class CBORGenerator extends GeneratorBase
2222
{
2323
private final static int[] NO_INTS = new int[0];
2424

25+
/**
26+
* The Unicode replacement character (U+FFFD) written in place of invalid surrogate sequences.
27+
*/
28+
final static int REPLACEMENT_CHAR = 0xfffd;
29+
2530
/**
2631
* Let's ensure that we have big enough output buffer because of safety
2732
* margins we need for UTF-8 encoding.
@@ -61,7 +66,14 @@ public enum Feature implements FormatFeature {
6166
* Default value is <code>false</code> meaning that type tag will not be
6267
* written at the beginning of a new document.
6368
*/
64-
WRITE_TYPE_HEADER(false)
69+
WRITE_TYPE_HEADER(false),
70+
71+
/**
72+
* Feature that determines if an invalid surrogate encoding found in the
73+
* incoming String should fail with an exception or silently be output
74+
* as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD). Default value is <code>false</code> (fail with an exception).
75+
*/
76+
LENIENT_UTF_ENCODING(false),
6577

6678
;
6779

@@ -138,6 +150,8 @@ public int getMask() {
138150

139151
protected boolean _cfgMinimalInts;
140152

153+
protected boolean _cfgLenientUnicodeEncoding;
154+
141155
/*
142156
/**********************************************************************
143157
/* Output state
@@ -231,6 +245,7 @@ public CBORGenerator(ObjectWriteContext writeCtxt, IOContext ctxt,
231245
: null;
232246
_tokenWriteContext = CBORWriteContext.createRootContext(dups);
233247
_cfgMinimalInts = Feature.WRITE_MINIMAL_INTS.enabledIn(formatFeatures);
248+
_cfgLenientUnicodeEncoding = Feature.LENIENT_UTF_ENCODING.enabledIn(formatFeatures);
234249
_out = out;
235250
_bufferRecyclable = true;
236251
_outputBuffer = ctxt.allocWriteEncodingBuffer(BYTE_BUFFER_FOR_OUTPUT);
@@ -357,6 +372,9 @@ public CBORGenerator enable(Feature f) {
357372
if (f == Feature.WRITE_MINIMAL_INTS) {
358373
_cfgMinimalInts = true;
359374
}
375+
if (f == Feature.LENIENT_UTF_ENCODING) {
376+
_cfgLenientUnicodeEncoding = true;
377+
}
360378
return this;
361379
}
362380

@@ -365,6 +383,9 @@ public CBORGenerator disable(Feature f) {
365383
if (f == Feature.WRITE_MINIMAL_INTS) {
366384
_cfgMinimalInts = false;
367385
}
386+
if (f == Feature.LENIENT_UTF_ENCODING) {
387+
_cfgLenientUnicodeEncoding = false;
388+
}
368389
return this;
369390
}
370391

@@ -1356,81 +1377,33 @@ private final int _encode(int outputPtr, char[] str, int i, int end) {
13561377
do {
13571378
int c = str[i];
13581379
if (c > 0x7F) {
1359-
return _shortUTF8Encode2(str, i, end, outputPtr, outputStart);
1380+
return _encode2(i, outputPtr, str, end, outputStart);
13601381
}
13611382
outBuf[outputPtr++] = (byte) c;
13621383
} while (++i < end);
13631384
return outputPtr - outputStart;
13641385
}
13651386

1366-
/**
1367-
* Helper method called when the whole character sequence is known to fit in
1368-
* the output buffer, but not all characters are single-byte (ASCII)
1369-
* characters.
1370-
*/
1371-
private final int _shortUTF8Encode2(char[] str, int i, int end,
1372-
int outputPtr, int outputStart) {
1373-
final byte[] outBuf = _outputBuffer;
1374-
while (i < end) {
1375-
int c = str[i++];
1376-
if (c <= 0x7F) {
1377-
outBuf[outputPtr++] = (byte) c;
1378-
continue;
1379-
}
1380-
// Nope, multi-byte:
1381-
if (c < 0x800) { // 2-byte
1382-
outBuf[outputPtr++] = (byte) (0xc0 | (c >> 6));
1383-
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
1384-
continue;
1385-
}
1386-
// 3 or 4 bytes (surrogate)
1387-
// Surrogates?
1388-
if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte character
1389-
outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
1390-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1391-
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
1392-
continue;
1393-
}
1394-
// Yup, a surrogate pair
1395-
if (c > SURR1_LAST) { // must be from first range; second won't do
1396-
_throwIllegalSurrogate(c);
1397-
}
1398-
// ... meaning it must have a pair
1399-
if (i >= end) {
1400-
_throwIllegalSurrogate(c);
1401-
}
1402-
c = _convertSurrogate(c, str[i++]);
1403-
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
1404-
_throwIllegalSurrogate(c);
1405-
}
1406-
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
1407-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
1408-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1409-
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
1410-
}
1411-
return (outputPtr - outputStart);
1412-
}
1413-
14141387
private final int _encode(int outputPtr, String str, int len) {
14151388
final byte[] outBuf = _outputBuffer;
14161389
final int outputStart = outputPtr;
14171390

14181391
for (int i = 0; i < len; ++i) {
14191392
int c = str.charAt(i);
14201393
if (c > 0x7F) {
1421-
return _encode2(i, outputPtr, str, len, outputStart);
1394+
return _encode2(i, outputPtr, str.toCharArray(), len, outputStart);
14221395
}
14231396
outBuf[outputPtr++] = (byte) c;
14241397
}
14251398
return (outputPtr - outputStart);
14261399
}
14271400

1428-
private final int _encode2(int i, int outputPtr, String str, int len,
1401+
private final int _encode2(int i, int outputPtr, char[] str, int len,
14291402
int outputStart) {
14301403
final byte[] outBuf = _outputBuffer;
14311404
// no; non-ASCII stuff, slower loop
14321405
while (i < len) {
1433-
int c = str.charAt(i++);
1406+
int c = str[i++];
14341407
if (c <= 0x7F) {
14351408
outBuf[outputPtr++] = (byte) c;
14361409
continue;
@@ -1452,20 +1425,43 @@ private final int _encode2(int i, int outputPtr, String str, int len,
14521425
}
14531426
// Yup, a surrogate pair
14541427
if (c > SURR1_LAST) { // must be from first range; second won't do
1455-
_throwIllegalSurrogate(c);
1428+
if (_cfgLenientUnicodeEncoding) {
1429+
c = REPLACEMENT_CHAR;
1430+
} else {
1431+
_throwIllegalSurrogate(c);
1432+
}
14561433
}
14571434
// ... meaning it must have a pair
1458-
if (i >= len) {
1459-
_throwIllegalSurrogate(c);
1435+
else if (i >= len) {
1436+
if (_cfgLenientUnicodeEncoding) {
1437+
c = REPLACEMENT_CHAR;
1438+
} else {
1439+
_throwIllegalSurrogate(c);
1440+
}
14601441
}
1461-
c = _convertSurrogate(c, str.charAt(i++));
1462-
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
1463-
_throwIllegalSurrogate(c);
1442+
// ... verify that the next character is in range
1443+
else if (str[i] < SURR2_FIRST || str[i] > SURR2_LAST) {
1444+
if (_cfgLenientUnicodeEncoding) {
1445+
c = REPLACEMENT_CHAR;
1446+
} else {
1447+
_throwIllegalSurrogatePair(c, str[i]);
1448+
}
1449+
}
1450+
// ... we have a valid surrogate pair
1451+
else {
1452+
c = _convertSurrogate(c, str[i++]);
1453+
}
1454+
// if we substituted the replacement char, we actually have a 3-byte char
1455+
if (c == REPLACEMENT_CHAR) {
1456+
outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
1457+
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1458+
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
1459+
} else {
1460+
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
1461+
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
1462+
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1463+
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
14641464
}
1465-
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
1466-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
1467-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1468-
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
14691465
}
14701466
return (outputPtr - outputStart);
14711467
}
@@ -1474,16 +1470,20 @@ private final int _encode2(int i, int outputPtr, String str, int len,
14741470
* Method called to calculate UTF codepoint, from a surrogate pair.
14751471
*/
14761472
private int _convertSurrogate(int firstPart, int secondPart) {
1477-
// Ok, then, is the second part valid?
1478-
if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
1479-
throw new IllegalArgumentException(
1473+
int c = 0x10000 + ((firstPart - SURR1_FIRST) << 10)
1474+
+ (secondPart - SURR2_FIRST);
1475+
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
1476+
_throwIllegalSurrogate(c);
1477+
}
1478+
return c;
1479+
}
1480+
1481+
private void _throwIllegalSurrogatePair(int firstPart, int secondPart) {
1482+
throw new IllegalArgumentException(
14801483
"Broken surrogate pair: first char 0x"
14811484
+ Integer.toHexString(firstPart) + ", second 0x"
14821485
+ Integer.toHexString(secondPart)
14831486
+ "; illegal combination");
1484-
}
1485-
return 0x10000 + ((firstPart - SURR1_FIRST) << 10)
1486-
+ (secondPart - SURR2_FIRST);
14871487
}
14881488

14891489
private void _throwIllegalSurrogate(int code) {

cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,14 @@ protected CBORGenerator cborGenerator(OutputStream result)
9494
return (CBORGenerator) CBORMapper.shared().createGenerator(result);
9595
}
9696

97+
protected CBORGenerator lenientUnicodeCborGenerator(OutputStream result)
98+
throws IOException
99+
{
100+
CBORGenerator gen = cborGenerator(result);
101+
gen.enable(CBORGenerator.Feature.LENIENT_UTF_ENCODING);
102+
return gen;
103+
}
104+
97105
/*
98106
/**********************************************************
99107
/* Doc conversion
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
2+
import java.io.*;
3+
import java.math.BigDecimal;
4+
import java.math.BigInteger;
5+
import java.util.*;
6+
7+
import org.junit.Assert;
8+
9+
import com.fasterxml.jackson.core.JsonGenerationException;
10+
11+
import com.fasterxml.jackson.databind.ObjectMapper;
12+
13+
import com.fasterxml.jackson.dataformat.cbor.CBORConstants;
14+
import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
15+
import com.fasterxml.jackson.dataformat.cbor.CBORParser;
16+
import com.fasterxml.jackson.dataformat.cbor.CBORTestBase;
17+
18+
public class UnicodeGenerationTest extends CBORTestBase
19+
{
20+
/**
21+
* Test that encoding a String containing invalid surrogates fails with an exception
22+
*/
23+
public void testFailForInvalidSurrogate() throws Exception
24+
{
25+
ByteArrayOutputStream out = new ByteArrayOutputStream();
26+
CBORGenerator gen = cborGenerator(out);
27+
28+
assertEquals(0, gen.getOutputBuffered());
29+
30+
// Unmatched first surrogate character
31+
try {
32+
gen.writeString("x\ud83d");
33+
} catch (IllegalArgumentException e) {
34+
}
35+
assertEquals(0, gen.getOutputBuffered());
36+
37+
// Unmatched second surrogate character
38+
try {
39+
gen.writeString("x\ude01");
40+
} catch (IllegalArgumentException e) {
41+
}
42+
assertEquals(0, gen.getOutputBuffered());
43+
44+
// Unmatched second surrogate character (2)
45+
try {
46+
gen.writeString("x\ude01x");
47+
} catch (IllegalArgumentException e) {
48+
}
49+
assertEquals(0, gen.getOutputBuffered());
50+
51+
// Broken surrogate pair
52+
try {
53+
gen.writeString("x\ud83dx");
54+
} catch (IllegalArgumentException e) {
55+
}
56+
assertEquals(0, gen.getOutputBuffered());
57+
}
58+
59+
/**
60+
* Test that when the lenient unicode feature is enabled, the replacement character is used to fix invalid sequences
61+
*/
62+
public void testRecoverInvalidSurrogate() throws Exception
63+
{
64+
ByteArrayOutputStream out;
65+
CBORGenerator gen;
66+
byte[] b;
67+
68+
out = new ByteArrayOutputStream();
69+
gen = lenientUnicodeCborGenerator(out);
70+
assertEquals(0, gen.getOutputBuffered());
71+
72+
// Unmatched first surrogate character
73+
gen.writeString("x\ud83d");
74+
gen.close();
75+
b = "x\ufffd".getBytes("utf-8");
76+
_verifyBytes(out.toByteArray(),
77+
(byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
78+
79+
out = new ByteArrayOutputStream();
80+
gen = lenientUnicodeCborGenerator(out);
81+
assertEquals(0, gen.getOutputBuffered());
82+
83+
// Unmatched second surrogate character
84+
gen.writeString("x\ude01");
85+
gen.close();
86+
b = "x\ufffd".getBytes("utf-8");
87+
_verifyBytes(out.toByteArray(),
88+
(byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
89+
90+
out = new ByteArrayOutputStream();
91+
gen = lenientUnicodeCborGenerator(out);
92+
assertEquals(0, gen.getOutputBuffered());
93+
94+
// Unmatched second surrogate character (2)
95+
gen.writeString("x\ude01x");
96+
gen.close();
97+
b = "x\ufffdx".getBytes("utf-8");
98+
_verifyBytes(out.toByteArray(),
99+
(byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
100+
101+
out = new ByteArrayOutputStream();
102+
gen = lenientUnicodeCborGenerator(out);
103+
assertEquals(0, gen.getOutputBuffered());
104+
105+
// Broken surrogate pair
106+
gen.writeString("x\ud83dx");
107+
gen.close();
108+
b = "x\ufffdx".getBytes("utf-8");
109+
_verifyBytes(out.toByteArray(),
110+
(byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
111+
112+
}
113+
114+
}

0 commit comments

Comments
 (0)