@@ -74,6 +74,8 @@ public enum Feature implements FormatFeature {
74
74
* Feature that determines if an invalid surrogate encoding found in the
75
75
* incoming String should fail with an exception or silently be output
76
76
* as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD)
77
+ *
78
+ * @since 2.12
77
79
*/
78
80
LENIENT_UTF_ENCODING (false ),
79
81
@@ -152,6 +154,11 @@ public int getMask() {
152
154
153
155
protected boolean _cfgMinimalInts ;
154
156
157
+
158
+ /**
159
+ * If true, we will output the REPLACEMENT_CHAR for invalid Unicode sequences.
160
+ * If false, we will throw an IllegalArgumentException for invalid Unicode sequences.
161
+ */
155
162
protected boolean _cfgLenientUnicodeEncoding ;
156
163
157
164
/*
@@ -1493,27 +1500,15 @@ private final int _encode2(int i, int outputPtr, char[] str, int len,
1493
1500
}
1494
1501
// Yup, a surrogate pair
1495
1502
if (c > SURR1_LAST ) { // must be from first range; second won't do
1496
- if (_cfgLenientUnicodeEncoding ) {
1497
- c = REPLACEMENT_CHAR ;
1498
- } else {
1499
- _throwIllegalSurrogate (c );
1500
- }
1503
+ c = _illegalSurrogateFound (c );
1501
1504
}
1502
1505
// ... meaning it must have a pair
1503
1506
else if (i >= len ) {
1504
- if (_cfgLenientUnicodeEncoding ) {
1505
- c = REPLACEMENT_CHAR ;
1506
- } else {
1507
- _throwIllegalSurrogate (c );
1508
- }
1507
+ c = _illegalSurrogateFound (c );
1509
1508
}
1510
1509
// ... verify that the next character is in range
1511
1510
else if (str [i ] < SURR2_FIRST || str [i ] > SURR2_LAST ) {
1512
- if (_cfgLenientUnicodeEncoding ) {
1513
- c = REPLACEMENT_CHAR ;
1514
- } else {
1515
- _throwIllegalSurrogatePair (c , str [i ]);
1516
- }
1511
+ c = _illegalSurrogatePairFound (c , str [i ]);
1517
1512
}
1518
1513
// ... we have a valid surrogate pair
1519
1514
else {
@@ -1541,43 +1536,47 @@ private int _convertSurrogate(int firstPart, int secondPart) {
1541
1536
int c = 0x10000 + ((firstPart - SURR1_FIRST ) << 10 )
1542
1537
+ (secondPart - SURR2_FIRST );
1543
1538
if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1544
- if (_cfgLenientUnicodeEncoding ) {
1545
- c = REPLACEMENT_CHAR ;
1546
- } else {
1547
- _throwIllegalSurrogate (c );
1548
- }
1539
+ c = _illegalSurrogatePairFound (firstPart , secondPart );
1549
1540
}
1550
1541
return c ;
1551
1542
}
1552
1543
1553
- private void _throwIllegalSurrogatePair (int firstPart , int secondPart ) {
1554
- throw new IllegalArgumentException (
1555
- "Broken surrogate pair: first char 0x"
1556
- + Integer .toHexString (firstPart ) + ", second 0x"
1557
- + Integer .toHexString (secondPart )
1558
- + "; illegal combination" );
1544
+ private int _illegalSurrogatePairFound (int firstPart , int secondPart ) {
1545
+ if (_cfgLenientUnicodeEncoding ) {
1546
+ return REPLACEMENT_CHAR ;
1547
+ } else {
1548
+ throw new IllegalArgumentException (
1549
+ "Broken surrogate pair: first char 0x"
1550
+ + Integer .toHexString (firstPart ) + ", second 0x"
1551
+ + Integer .toHexString (secondPart )
1552
+ + "; illegal combination" );
1553
+ }
1559
1554
}
1560
1555
1561
- private void _throwIllegalSurrogate (int code ) {
1562
- if (code > 0x10FFFF ) { // over max?
1563
- throw new IllegalArgumentException ("Illegal character point (0x"
1564
- + Integer .toHexString (code )
1565
- + ") to output; max is 0x10FFFF as per RFC 4627" );
1566
- }
1567
- if (code >= SURR1_FIRST ) {
1568
- if (code <= SURR1_LAST ) { // Unmatched first part (closing without
1569
- // second part?)
1556
+ private int _illegalSurrogateFound (int code ) {
1557
+ if (_cfgLenientUnicodeEncoding ) {
1558
+ return REPLACEMENT_CHAR ;
1559
+ } else {
1560
+ if (code > 0x10FFFF ) { // over max?
1561
+ throw new IllegalArgumentException ("Illegal character point (0x"
1562
+ + Integer .toHexString (code )
1563
+ + ") to output; max is 0x10FFFF as per RFC 4627" );
1564
+ }
1565
+ if (code >= SURR1_FIRST ) {
1566
+ if (code <= SURR1_LAST ) { // Unmatched first part (closing without
1567
+ // second part?)
1568
+ throw new IllegalArgumentException (
1569
+ "Unmatched first part of surrogate pair (0x"
1570
+ + Integer .toHexString (code ) + ")" );
1571
+ }
1570
1572
throw new IllegalArgumentException (
1571
- "Unmatched first part of surrogate pair (0x"
1573
+ "Unmatched second part of surrogate pair (0x"
1572
1574
+ Integer .toHexString (code ) + ")" );
1573
1575
}
1574
- throw new IllegalArgumentException (
1575
- "Unmatched second part of surrogate pair (0x"
1576
- + Integer .toHexString (code ) + ")" );
1576
+ // should we ever get this?
1577
+ throw new IllegalArgumentException ( "Illegal character point (0x"
1578
+ + Integer .toHexString (code ) + ") to output " );
1577
1579
}
1578
- // should we ever get this?
1579
- throw new IllegalArgumentException ("Illegal character point (0x"
1580
- + Integer .toHexString (code ) + ") to output" );
1581
1580
}
1582
1581
1583
1582
/*
0 commit comments