Skip to content

Commit 314bd30

Browse files
committed
Manually merged #222
1 parent 1f3cbdc commit 314bd30

File tree

5 files changed

+238
-73
lines changed

5 files changed

+238
-73
lines changed

cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java

Lines changed: 95 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@ public class CBORGenerator extends GeneratorBase
2828
*/
2929
final static int BYTE_BUFFER_FOR_OUTPUT = 16000;
3030

31+
/**
32+
* The replacement character to use to fix invalid Unicode sequences
33+
* (mismatched surrogate pair).
34+
*
35+
* @since 2.12
36+
*/
37+
final static int REPLACEMENT_CHAR = 0xfffd;
38+
3139
/**
3240
* Longest char chunk we will output is chosen so that it is guaranteed to
3341
* fit in an empty buffer even if everything encoded in 3-byte sequences;
@@ -58,13 +66,25 @@ public enum Feature implements FormatFeature {
5866
* 55799, encoded as 3-byte sequence of <code>0xD9, 0xD9, 0xF7</code>)
5967
* should be written at the beginning of document or not.
6068
* <p>
61-
* Default value is <code>false</code> meaning that type tag will not be
69+
* Default value is {@code false} meaning that type tag will not be
6270
* written at the beginning of a new document.
6371
*
6472
* @since 2.5
6573
*/
66-
WRITE_TYPE_HEADER(false)
74+
WRITE_TYPE_HEADER(false),
6775

76+
/**
77+
* Feature that determines how an invalid surrogate encoding found in the
78+
* incoming String is handled: if enabled, the invalid sequence is silently
79+
* output as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD); if disabled,
80+
* an exception will be thrown to indicate invalid content.
81+
*<p>
82+
* Default value is {@code false} (for backwards compatibility) meaning that
83+
* an invalid surrogate will result in an exception ({@link IllegalArgumentException}).
84+
*
85+
* @since 2.12
86+
*/
87+
LENIENT_UTF_ENCODING(false),
6888
;
6989

7090
protected final boolean _defaultState;
@@ -201,7 +221,7 @@ public int getMask() {
201221

202222
/**
203223
* Number of elements remaining in the current complex structure (if any),
204-
* when writing defined-length Arrays, Objects; marker {@link #INDEFINITE_LENGTH}
224+
* when writing defined-length Arrays, Objects; marker {@code INDEFINITE_LENGTH}
205225
* otherwise.
206226
*/
207227
protected int _currentRemainingElements = INDEFINITE_LENGTH;
@@ -1452,29 +1472,25 @@ private final int _shortUTF8Encode2(char[] str, int i, int end,
14521472
continue;
14531473
}
14541474
// 3 or 4 bytes (surrogate)
1455-
// Surrogates?
1456-
if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte character
1475+
if (c < SURR1_FIRST || c > SURR2_LAST) { // regular 3-byte character
14571476
outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
14581477
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
14591478
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
14601479
continue;
14611480
}
1462-
// Yup, a surrogate pair
1463-
if (c > SURR1_LAST) { // must be from first range; second won't do
1464-
_throwIllegalSurrogate(c);
1465-
}
1466-
// ... meaning it must have a pair
1467-
if (i >= end) {
1468-
_throwIllegalSurrogate(c);
1469-
}
1470-
c = _convertSurrogate(c, str[i++]);
1471-
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
1472-
_throwIllegalSurrogate(c);
1481+
// Yup, looks like a surrogate pair... but is it?
1482+
if ((c <= SURR1_LAST) && (i < end)) { // must be from first range and have another char
1483+
final int d = str[i];
1484+
if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
1485+
++i;
1486+
outputPtr = _decodeAndWriteSurrogate(c, d, outBuf, outputPtr);
1487+
continue;
1488+
}
1489+
outputPtr = _invalidSurrogateEnd(c, d, outBuf, outputPtr);
1490+
continue;
14731491
}
1474-
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
1475-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
1476-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1477-
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
1492+
// Nah, something wrong
1493+
outputPtr = _invalidSurrogateStart(c, outBuf, outputPtr);
14781494
}
14791495
return (outputPtr - outputStart);
14801496
}
@@ -1510,70 +1526,76 @@ private final int _encode2(int i, int outputPtr, String str, int len,
15101526
continue;
15111527
}
15121528
// 3 or 4 bytes (surrogate)
1513-
// Surrogates?
1514-
if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte
1515-
// character
1529+
if (c < SURR1_FIRST || c > SURR2_LAST) { // regular 3-byte character
15161530
outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
15171531
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
15181532
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
15191533
continue;
15201534
}
1521-
// Yup, a surrogate pair
1522-
if (c > SURR1_LAST) { // must be from first range; second won't do
1523-
_throwIllegalSurrogate(c);
1524-
}
1525-
// ... meaning it must have a pair
1526-
if (i >= len) {
1527-
_throwIllegalSurrogate(c);
1528-
}
1529-
c = _convertSurrogate(c, str.charAt(i++));
1530-
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
1531-
_throwIllegalSurrogate(c);
1535+
// Yup, looks like a surrogate pair... but is it?
1536+
if ((c <= SURR1_LAST) && (i < len)) { // must be from first range and have another char
1537+
final int d = str.charAt(i);
1538+
if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
1539+
++i;
1540+
outputPtr = _decodeAndWriteSurrogate(c, d, outBuf, outputPtr);
1541+
continue;
1542+
}
1543+
outputPtr = _invalidSurrogateEnd(c, d, outBuf, outputPtr);
1544+
continue;
15321545
}
1533-
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
1534-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
1535-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1536-
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
1546+
// Nah, something wrong
1547+
outputPtr = _invalidSurrogateStart(c, outBuf, outputPtr);
15371548
}
15381549
return (outputPtr - outputStart);
15391550
}
15401551

1541-
/**
1542-
* Method called to calculate UTF codepoint, from a surrogate pair.
1543-
*/
1544-
private int _convertSurrogate(int firstPart, int secondPart) {
1545-
// Ok, then, is the second part valid?
1546-
if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
1547-
throw new IllegalArgumentException(
1548-
"Broken surrogate pair: first char 0x"
1549-
+ Integer.toHexString(firstPart) + ", second 0x"
1550-
+ Integer.toHexString(secondPart)
1551-
+ "; illegal combination");
1552-
}
1553-
return 0x10000 + ((firstPart - SURR1_FIRST) << 10)
1554-
+ (secondPart - SURR2_FIRST);
1555-
}
1556-
1557-
private void _throwIllegalSurrogate(int code) {
1558-
if (code > 0x10FFFF) { // over max?
1559-
throw new IllegalArgumentException("Illegal character point (0x"
1560-
+ Integer.toHexString(code)
1561-
+ ") to output; max is 0x10FFFF as per RFC 4627");
1562-
}
1563-
if (code >= SURR1_FIRST) {
1564-
if (code <= SURR1_LAST) { // Unmatched first part (closing without
1565-
// second part?)
1566-
throw new IllegalArgumentException(
1567-
"Unmatched first part of surrogate pair (0x"
1568-
+ Integer.toHexString(code) + ")");
1569-
}
1570-
throw new IllegalArgumentException(
1571-
"Unmatched second part of surrogate pair (0x"
1572-
+ Integer.toHexString(code) + ")");
1552+
private int _invalidSurrogateStart(int code, byte[] outBuf, int outputPtr) {
1553+
if (isEnabled(Feature.LENIENT_UTF_ENCODING)) {
1554+
return _appendReplacementChar(outBuf, outputPtr);
1555+
}
1556+
// Will be called in two distinct cases: either first character is
1557+
// invalid (code range of second part), or first character is valid
1558+
// but there is no second part to encode
1559+
if (code <= SURR1_LAST) {
1560+
// Unmatched first part (closing without second part?)
1561+
throw new IllegalArgumentException(String.format(
1562+
"Unmatched surrogate pair, starts with valid high surrogate (0x%04X) but ends without low surrogate",
1563+
code));
1564+
}
1565+
throw new IllegalArgumentException(String.format(
1566+
"Invalid surrogate pair, starts with invalid high surrogate (0x%04X), not in valid range [0xD800, 0xDBFF]",
1567+
code));
1568+
}
1569+
1570+
private int _invalidSurrogateEnd(int surr1, int surr2,
1571+
byte[] outBuf, int outputPtr)
1572+
{
1573+
if (isEnabled(Feature.LENIENT_UTF_ENCODING)) {
1574+
return _appendReplacementChar(outBuf, outputPtr);
15731575
}
1574-
// should we ever get this?
1575-
throw new IllegalArgumentException("Illegal character point (0x"
1576-
+ Integer.toHexString(code) + ") to output");
1576+
throw new IllegalArgumentException(String.format(
1577+
"Invalid surrogate pair, starts with valid high surrogate (0x%04X)"
1578+
+" but ends with invalid low surrogate (0x%04X), not in valid range [0xDC00, 0xDFFF]",
1579+
surr1, surr2));
1580+
}
1581+
1582+
private int _appendReplacementChar(byte[] outBuf, int outputPtr) {
1583+
outBuf[outputPtr++] = (byte) (0xe0 | (REPLACEMENT_CHAR >> 12));
1584+
outBuf[outputPtr++] = (byte) (0x80 | ((REPLACEMENT_CHAR >> 6) & 0x3f));
1585+
outBuf[outputPtr++] = (byte) (0x80 | (REPLACEMENT_CHAR & 0x3f));
1586+
return outputPtr;
1587+
}
1588+
1589+
private int _decodeAndWriteSurrogate(int surr1, int surr2,
1590+
byte[] outBuf, int outputPtr)
1591+
{
1592+
final int c = 0x10000 + ((surr1 - SURR1_FIRST) << 10)
1593+
+ (surr2 - SURR2_FIRST);
1594+
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
1595+
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
1596+
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1597+
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
1598+
return outputPtr;
15771599
}
15781600

15791601
/*

cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,16 @@ protected CBORGenerator cborGenerator(CBORFactory f,
8585
return f.createGenerator(result, null);
8686
}
8787

88+
// @since 2.12
89+
protected CBORGenerator lenientUnicodeCborGenerator(ByteArrayOutputStream result)
90+
throws IOException
91+
{
92+
return cborFactoryBuilder()
93+
.enable(CBORGenerator.Feature.LENIENT_UTF_ENCODING)
94+
.build()
95+
.createGenerator(result);
96+
}
97+
8898
/*
8999
/**********************************************************
90100
/* Additional assertion methods
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
package com.fasterxml.jackson.dataformat.cbor.gen;
2+
3+
import java.io.ByteArrayOutputStream;
4+
5+
import com.fasterxml.jackson.dataformat.cbor.*;
6+
7+
public class LenientUnicodeGenerationTest extends CBORTestBase
8+
{
9+
/**
10+
* Test that encoding a String containing invalid surrogates fails with an exception
11+
*/
12+
public void testFailForInvalidSurrogate() throws Exception
13+
{
14+
ByteArrayOutputStream out = new ByteArrayOutputStream();
15+
CBORGenerator gen = cborGenerator(out);
16+
17+
assertEquals(0, gen.getOutputBuffered());
18+
19+
// Unmatched first (high) surrogate character: input ends without a second surrogate
20+
try {
21+
gen.writeString("x\ud83d");
22+
} catch (IllegalArgumentException e) {
23+
verifyException(e, "Unmatched surrogate pair");
24+
verifyException(e, "0xD83D");
25+
verifyException(e, "without low surrogate");
26+
}
27+
assertEquals(0, gen.getOutputBuffered());
28+
29+
// Invalid first surrogate character (low surrogate where a high surrogate is expected)
30+
try {
31+
gen.writeString("x\ude01");
32+
} catch (IllegalArgumentException e) {
33+
verifyException(e, "Invalid surrogate pair");
34+
verifyException(e, "0xDE01");
35+
verifyException(e, "invalid high surrogate");
36+
}
37+
assertEquals(0, gen.getOutputBuffered());
38+
39+
// Invalid second surrogate character (1)
40+
try {
41+
gen.writeString("x\ud801\ud802");
42+
} catch (IllegalArgumentException e) {
43+
verifyException(e, "Invalid surrogate pair");
44+
verifyException(e, "0xD801");
45+
verifyException(e, "0xD802");
46+
verifyException(e, "valid high surrogate");
47+
verifyException(e, "invalid low surrogate");
48+
}
49+
assertEquals(0, gen.getOutputBuffered());
50+
51+
// Invalid second surrogate character (2)
52+
try {
53+
gen.writeString("x\ud83dx");
54+
} catch (IllegalArgumentException e) {
55+
verifyException(e, "Invalid surrogate pair");
56+
verifyException(e, "0xD83D");
57+
verifyException(e, "0x0078");
58+
verifyException(e, "valid high surrogate");
59+
verifyException(e, "invalid low surrogate");
60+
}
61+
assertEquals(0, gen.getOutputBuffered());
62+
}
63+
64+
/**
65+
* Test that when the lenient unicode feature is enabled, the replacement character is used to fix invalid sequences
66+
*/
67+
public void testRecoverInvalidSurrogate1() throws Exception
68+
{
69+
ByteArrayOutputStream out;
70+
CBORGenerator gen;
71+
byte[] b;
72+
73+
out = new ByteArrayOutputStream();
74+
gen = lenientUnicodeCborGenerator(out);
75+
assertEquals(0, gen.getOutputBuffered());
76+
77+
// Unmatched first surrogate character
78+
gen.writeString("x\ud83d");
79+
gen.close();
80+
b = "x\ufffd".getBytes("utf-8");
81+
_verifyBytes(out.toByteArray(),
82+
(byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
83+
84+
out = new ByteArrayOutputStream();
85+
gen = lenientUnicodeCborGenerator(out);
86+
assertEquals(0, gen.getOutputBuffered());
87+
88+
// Unmatched second surrogate character
89+
gen.writeString("x\ude01");
90+
gen.close();
91+
b = "x\ufffd".getBytes("utf-8");
92+
_verifyBytes(out.toByteArray(),
93+
(byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
94+
95+
out = new ByteArrayOutputStream();
96+
gen = lenientUnicodeCborGenerator(out);
97+
assertEquals(0, gen.getOutputBuffered());
98+
99+
// Unmatched second surrogate character (2)
100+
gen.writeString("x\ude01x");
101+
gen.close();
102+
b = "x\ufffdx".getBytes("utf-8");
103+
_verifyBytes(out.toByteArray(),
104+
(byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
105+
}
106+
107+
public void testRecoverInvalidSurrogate2() throws Exception
108+
{
109+
ByteArrayOutputStream out;
110+
CBORGenerator gen;
111+
byte[] b;
112+
113+
out = new ByteArrayOutputStream();
114+
gen = lenientUnicodeCborGenerator(out);
115+
assertEquals(0, gen.getOutputBuffered());
116+
117+
// Broken surrogate pair
118+
gen.writeString("X\ud83dY");
119+
gen.close();
120+
b = "X\ufffdY".getBytes("utf-8");
121+
_verifyBytes(out.toByteArray(),
122+
(byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
123+
}
124+
}

release-notes/CREDITS-2.x

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,3 +143,9 @@ Michael Liedtke (mcliedtke@github)
143143
* Contributed fix for #212: (ion) Optimize `IonParser.getNumberType()` using
144144
`IonReader.getIntegerSize()`
145145
(2.12.0)
146+
147+
Guillaume Bort (guillaumebort@github)
148+
149+
* Contributed implementation of #222: (cbor) Add `CBORGenerator.Feature.LENIENT_UTF_ENCODING`
150+
for lenient handling of Unicode surrogate pairs on writing
151+
(2.12.0)

release-notes/VERSION-2.x

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ Project: jackson-datatypes-binaryModules:
1414
(contributed by Paul F)
1515
#212: (ion) Optimize `IonParser.getNumberType()` using `IonReader.getIntegerSize()`
1616
(contributed by Michael L)
17+
#222: (cbor) Add `CBORGenerator.Feature.LENIENT_UTF_ENCODING` for lenient handling of
18+
Unicode surrogate pairs on writing
19+
(contributed by Guillaume B)
1720
- Add Gradle Module Metadata (https://blog.gradle.org/alignment-with-gradle-module-metadata)
1821

1922
2.11.3 (02-Oct-2020)

0 commit comments

Comments
 (0)