@@ -28,6 +28,14 @@ public class CBORGenerator extends GeneratorBase
28
28
*/
29
29
final static int BYTE_BUFFER_FOR_OUTPUT = 16000 ;
30
30
31
+ /**
32
+ * The replacement character to use to fix invalid Unicode sequences
33
+ * (mismatched surrogate pair).
34
+ *
35
+ * @since 2.12
36
+ */
37
+ final static int REPLACEMENT_CHAR = 0xfffd ;
38
+
31
39
/**
32
40
* Longest char chunk we will output is chosen so that it is guaranteed to
33
41
* fit in an empty buffer even if everything encoded in 3-byte sequences;
@@ -58,13 +66,25 @@ public enum Feature implements FormatFeature {
58
66
* 55799, encoded as 3-byte sequence of <code>0xD9, 0xD9, 0xF7</code>)
59
67
* should be written at the beginning of document or not.
60
68
* <p>
61
- * Default value is < code> false</code> meaning that type tag will not be
69
+ * Default value is {@code false} meaning that type tag will not be
62
70
* written at the beginning of a new document.
63
71
*
64
72
* @since 2.5
65
73
*/
66
- WRITE_TYPE_HEADER (false )
74
+ WRITE_TYPE_HEADER (false ),
67
75
76
+ /**
77
+ * Feature that determines how an invalid surrogate encoding found in the
78
+ * incoming String is handled: if enabled, it is silently output as the
79
+ * Unicode 'REPLACEMENT CHARACTER' (U+FFFD); if disabled,
80
+ * an exception will be thrown to indicate invalid content.
81
+ *<p>
82
+ * Default value is {@code false} (for backwards compatibility) meaning that
83
+ * an invalid surrogate will result in an exception ({@link IllegalArgumentException}).
84
+ *
85
+ * @since 2.12
86
+ */
87
+ LENIENT_UTF_ENCODING (false ),
68
88
;
69
89
70
90
protected final boolean _defaultState ;
@@ -201,7 +221,7 @@ public int getMask() {
201
221
202
222
/**
203
223
* Number of elements remaining in the current complex structure (if any),
204
- * when writing defined-length Arrays, Objects; marker {@link # INDEFINITE_LENGTH}
224
+ * when writing defined-length Arrays, Objects; marker {@code INDEFINITE_LENGTH}
205
225
* otherwise.
206
226
*/
207
227
protected int _currentRemainingElements = INDEFINITE_LENGTH ;
@@ -1452,29 +1472,25 @@ private final int _shortUTF8Encode2(char[] str, int i, int end,
1452
1472
continue ;
1453
1473
}
1454
1474
// 3 or 4 bytes (surrogate)
1455
- // Surrogates?
1456
- if (c < SURR1_FIRST || c > SURR2_LAST ) { // nope, regular 3-byte character
1475
+ if (c < SURR1_FIRST || c > SURR2_LAST ) { // regular 3-byte character
1457
1476
outBuf [outputPtr ++] = (byte ) (0xe0 | (c >> 12 ));
1458
1477
outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1459
1478
outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1460
1479
continue ;
1461
1480
}
1462
- // Yup, a surrogate pair
1463
- if (c > SURR1_LAST ) { // must be from first range; second won't do
1464
- _throwIllegalSurrogate (c );
1465
- }
1466
- // ... meaning it must have a pair
1467
- if (i >= end ) {
1468
- _throwIllegalSurrogate (c );
1469
- }
1470
- c = _convertSurrogate (c , str [i ++]);
1471
- if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1472
- _throwIllegalSurrogate (c );
1481
+ // Yup, looks like a surrogate pair... but is it?
1482
+ if ((c <= SURR1_LAST ) && (i < end )) { // must be from first range and have another char
1483
+ final int d = str [i ];
1484
+ if ((d <= SURR2_LAST ) && (d >= SURR2_FIRST )) {
1485
+ ++i ;
1486
+ outputPtr = _decodeAndWriteSurrogate (c , d , outBuf , outputPtr );
1487
+ continue ;
1488
+ }
1489
+ outputPtr = _invalidSurrogateEnd (c , d , outBuf , outputPtr );
1490
+ continue ;
1473
1491
}
1474
- outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1475
- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1476
- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1477
- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1492
+ // Nah, something wrong
1493
+ outputPtr = _invalidSurrogateStart (c , outBuf , outputPtr );
1478
1494
}
1479
1495
return (outputPtr - outputStart );
1480
1496
}
@@ -1510,70 +1526,76 @@ private final int _encode2(int i, int outputPtr, String str, int len,
1510
1526
continue ;
1511
1527
}
1512
1528
// 3 or 4 bytes (surrogate)
1513
- // Surrogates?
1514
- if (c < SURR1_FIRST || c > SURR2_LAST ) { // nope, regular 3-byte
1515
- // character
1529
+ if (c < SURR1_FIRST || c > SURR2_LAST ) { // regular 3-byte character
1516
1530
outBuf [outputPtr ++] = (byte ) (0xe0 | (c >> 12 ));
1517
1531
outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1518
1532
outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1519
1533
continue ;
1520
1534
}
1521
- // Yup, a surrogate pair
1522
- if (c > SURR1_LAST ) { // must be from first range; second won't do
1523
- _throwIllegalSurrogate (c );
1524
- }
1525
- // ... meaning it must have a pair
1526
- if (i >= len ) {
1527
- _throwIllegalSurrogate (c );
1528
- }
1529
- c = _convertSurrogate (c , str .charAt (i ++));
1530
- if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1531
- _throwIllegalSurrogate (c );
1535
+ // Yup, looks like a surrogate pair... but is it?
1536
+ if ((c <= SURR1_LAST ) && (i < len )) { // must be from first range and have another char
1537
+ final int d = str .charAt (i );
1538
+ if ((d <= SURR2_LAST ) && (d >= SURR2_FIRST )) {
1539
+ ++i ;
1540
+ outputPtr = _decodeAndWriteSurrogate (c , d , outBuf , outputPtr );
1541
+ continue ;
1542
+ }
1543
+ outputPtr = _invalidSurrogateEnd (c , d , outBuf , outputPtr );
1544
+ continue ;
1532
1545
}
1533
- outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1534
- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1535
- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1536
- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1546
+ // Nah, something wrong
1547
+ outputPtr = _invalidSurrogateStart (c , outBuf , outputPtr );
1537
1548
}
1538
1549
return (outputPtr - outputStart );
1539
1550
}
1540
1551
1541
- /**
1542
- * Method called to calculate UTF codepoint, from a surrogate pair.
1543
- */
1544
- private int _convertSurrogate (int firstPart , int secondPart ) {
1545
- // Ok, then, is the second part valid?
1546
- if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST ) {
1547
- throw new IllegalArgumentException (
1548
- "Broken surrogate pair: first char 0x"
1549
- + Integer .toHexString (firstPart ) + ", second 0x"
1550
- + Integer .toHexString (secondPart )
1551
- + "; illegal combination" );
1552
- }
1553
- return 0x10000 + ((firstPart - SURR1_FIRST ) << 10 )
1554
- + (secondPart - SURR2_FIRST );
1555
- }
1556
-
1557
- private void _throwIllegalSurrogate (int code ) {
1558
- if (code > 0x10FFFF ) { // over max?
1559
- throw new IllegalArgumentException ("Illegal character point (0x"
1560
- + Integer .toHexString (code )
1561
- + ") to output; max is 0x10FFFF as per RFC 4627" );
1562
- }
1563
- if (code >= SURR1_FIRST ) {
1564
- if (code <= SURR1_LAST ) { // Unmatched first part (closing without
1565
- // second part?)
1566
- throw new IllegalArgumentException (
1567
- "Unmatched first part of surrogate pair (0x"
1568
- + Integer .toHexString (code ) + ")" );
1569
- }
1570
- throw new IllegalArgumentException (
1571
- "Unmatched second part of surrogate pair (0x"
1572
- + Integer .toHexString (code ) + ")" );
1552
+ private int _invalidSurrogateStart (int code , byte [] outBuf , int outputPtr ) {
1553
+ if (isEnabled (Feature .LENIENT_UTF_ENCODING )) {
1554
+ return _appendReplacementChar (outBuf , outputPtr );
1555
+ }
1556
+ // Will be called in two distinct cases: either first character is
1557
+ // invalid (code range of second part), or first character is valid
1558
+ // but there is no second part to encode
1559
+ if (code <= SURR1_LAST ) {
1560
+ // Unmatched first part (closing without second part?)
1561
+ throw new IllegalArgumentException (String .format (
1562
+ "Unmatched surrogate pair, starts with valid high surrogate (0x%04X) but ends without low surrogate" ,
1563
+ code ));
1564
+ }
1565
+ throw new IllegalArgumentException (String .format (
1566
+ "Invalid surrogate pair, starts with invalid high surrogate (0x%04X), not in valid range [0xD800, 0xDBFF]" ,
1567
+ code ));
1568
+ }
1569
+
1570
+ private int _invalidSurrogateEnd (int surr1 , int surr2 ,
1571
+ byte [] outBuf , int outputPtr )
1572
+ {
1573
+ if (isEnabled (Feature .LENIENT_UTF_ENCODING )) {
1574
+ return _appendReplacementChar (outBuf , outputPtr );
1573
1575
}
1574
- // should we ever get this?
1575
- throw new IllegalArgumentException ("Illegal character point (0x"
1576
- + Integer .toHexString (code ) + ") to output" );
1576
+ throw new IllegalArgumentException (String .format (
1577
+ "Invalid surrogate pair, starts with valid high surrogate (0x%04X)"
1578
+ +" but ends with invalid low surrogate (0x%04X), not in valid range [0xDC00, 0xDFFF]" ,
1579
+ surr1 , surr2 ));
1580
+ }
1581
+
1582
+ private int _appendReplacementChar (byte [] outBuf , int outputPtr ) {
1583
+ outBuf [outputPtr ++] = (byte ) (0xe0 | (REPLACEMENT_CHAR >> 12 ));
1584
+ outBuf [outputPtr ++] = (byte ) (0x80 | ((REPLACEMENT_CHAR >> 6 ) & 0x3f ));
1585
+ outBuf [outputPtr ++] = (byte ) (0x80 | (REPLACEMENT_CHAR & 0x3f ));
1586
+ return outputPtr ;
1587
+ }
1588
+
1589
+ private int _decodeAndWriteSurrogate (int surr1 , int surr2 ,
1590
+ byte [] outBuf , int outputPtr )
1591
+ {
1592
+ final int c = 0x10000 + ((surr1 - SURR1_FIRST ) << 10 )
1593
+ + (surr2 - SURR2_FIRST );
1594
+ outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1595
+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1596
+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1597
+ outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1598
+ return outputPtr ;
1577
1599
}
1578
1600
1579
1601
/*
0 commit comments