@@ -72,6 +72,8 @@ public enum Feature implements FormatFeature {
72
72
* Feature that determines if an invalid surrogate encoding found in the
73
73
* incoming String should fail with an exception or silently be output
74
74
* as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD)
75
+ *
76
+ * @since 2.12
75
77
*/
76
78
LENIENT_UTF_ENCODING (false ),
77
79
@@ -150,6 +152,11 @@ public int getMask() {
150
152
151
153
protected boolean _cfgMinimalInts ;
152
154
155
+
156
+ /**
157
+ * If true we will output the REPLACEMENT_CHAR for invalid unicode sequences.
158
+ * If false we will throw an IllegalArgumentException for invalid unicode sequences.
159
+ */
153
160
protected boolean _cfgLenientUnicodeEncoding ;
154
161
155
162
/*
@@ -1425,27 +1432,15 @@ private final int _encode2(int i, int outputPtr, char[] str, int len,
1425
1432
}
1426
1433
// Yup, a surrogate pair
1427
1434
if (c > SURR1_LAST ) { // must be from first range; second won't do
1428
- if (_cfgLenientUnicodeEncoding ) {
1429
- c = REPLACEMENT_CHAR ;
1430
- } else {
1431
- _throwIllegalSurrogate (c );
1432
- }
1435
+ c = _illegalSurrogateFound (c );
1433
1436
}
1434
1437
// ... meaning it must have a pair
1435
1438
else if (i >= len ) {
1436
- if (_cfgLenientUnicodeEncoding ) {
1437
- c = REPLACEMENT_CHAR ;
1438
- } else {
1439
- _throwIllegalSurrogate (c );
1440
- }
1439
+ c = _illegalSurrogateFound (c );
1441
1440
}
1442
1441
// ... verify that the next character is in range
1443
1442
else if (str [i ] < SURR2_FIRST || str [i ] > SURR2_LAST ) {
1444
- if (_cfgLenientUnicodeEncoding ) {
1445
- c = REPLACEMENT_CHAR ;
1446
- } else {
1447
- _throwIllegalSurrogatePair (c , str [i ]);
1448
- }
1443
+ c = _illegalSurrogatePairFound (c , str [i ]);
1449
1444
}
1450
1445
// ... we have a valid surrogate pair
1451
1446
else {
@@ -1473,43 +1468,47 @@ private int _convertSurrogate(int firstPart, int secondPart) {
1473
1468
int c = 0x10000 + ((firstPart - SURR1_FIRST ) << 10 )
1474
1469
+ (secondPart - SURR2_FIRST );
1475
1470
if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1476
- if (_cfgLenientUnicodeEncoding ) {
1477
- c = REPLACEMENT_CHAR ;
1478
- } else {
1479
- _throwIllegalSurrogate (c );
1480
- }
1471
+ c = _illegalSurrogatePairFound (firstPart , secondPart );
1481
1472
}
1482
1473
return c ;
1483
1474
}
1484
1475
1485
- private void _throwIllegalSurrogatePair (int firstPart , int secondPart ) {
1486
- throw new IllegalArgumentException (
1487
- "Broken surrogate pair: first char 0x"
1488
- + Integer .toHexString (firstPart ) + ", second 0x"
1489
- + Integer .toHexString (secondPart )
1490
- + "; illegal combination" );
1476
+ private int _illegalSurrogatePairFound (int firstPart , int secondPart ) {
1477
+ if (_cfgLenientUnicodeEncoding ) {
1478
+ return REPLACEMENT_CHAR ;
1479
+ } else {
1480
+ throw new IllegalArgumentException (
1481
+ "Broken surrogate pair: first char 0x"
1482
+ + Integer .toHexString (firstPart ) + ", second 0x"
1483
+ + Integer .toHexString (secondPart )
1484
+ + "; illegal combination" );
1485
+ }
1491
1486
}
1492
1487
1493
- private void _throwIllegalSurrogate (int code ) {
1494
- if (code > 0x10FFFF ) { // over max?
1495
- throw new IllegalArgumentException ("Illegal character point (0x"
1496
- + Integer .toHexString (code )
1497
- + ") to output; max is 0x10FFFF as per RFC 4627" );
1498
- }
1499
- if (code >= SURR1_FIRST ) {
1500
- if (code <= SURR1_LAST ) { // Unmatched first part (closing without
1501
- // second part?)
1488
+ private int _illegalSurrogateFound (int code ) {
1489
+ if (_cfgLenientUnicodeEncoding ) {
1490
+ return REPLACEMENT_CHAR ;
1491
+ } else {
1492
+ if (code > 0x10FFFF ) { // over max?
1493
+ throw new IllegalArgumentException ("Illegal character point (0x"
1494
+ + Integer .toHexString (code )
1495
+ + ") to output; max is 0x10FFFF as per RFC 4627" );
1496
+ }
1497
+ if (code >= SURR1_FIRST ) {
1498
+ if (code <= SURR1_LAST ) { // Unmatched first part (closing without
1499
+ // second part?)
1500
+ throw new IllegalArgumentException (
1501
+ "Unmatched first part of surrogate pair (0x"
1502
+ + Integer .toHexString (code ) + ")" );
1503
+ }
1502
1504
throw new IllegalArgumentException (
1503
- "Unmatched first part of surrogate pair (0x"
1505
+ "Unmatched second part of surrogate pair (0x"
1504
1506
+ Integer .toHexString (code ) + ")" );
1505
1507
}
1506
- throw new IllegalArgumentException (
1507
- "Unmatched second part of surrogate pair (0x"
1508
- + Integer .toHexString (code ) + ")" );
1508
+ // should we ever get this?
1509
+ throw new IllegalArgumentException ( "Illegal character point (0x"
1510
+ + Integer .toHexString (code ) + ") to output " );
1509
1511
}
1510
- // should we ever get this?
1511
- throw new IllegalArgumentException ("Illegal character point (0x"
1512
- + Integer .toHexString (code ) + ") to output" );
1513
1512
}
1514
1513
1515
1514
/*
0 commit comments