-
-
Notifications
You must be signed in to change notification settings - Fork 143
Add CBORGenerator.Feature.LENIENT_UTF_ENCODING
for lenient handling of Unicode surrogate pairs on writing
#222
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,11 @@ public class CBORGenerator extends GeneratorBase | |
{ | ||
private final static int[] NO_INTS = new int[0]; | ||
|
||
/** | ||
* The replacement character to use to fix invalid unicode sequences. | ||
*/ | ||
final static int REPLACEMENT_CHAR = 0xfffd; | ||
|
||
/** | ||
* Let's ensure that we have big enough output buffer because of safety | ||
* margins we need for UTF-8 encoding. | ||
|
@@ -63,7 +68,16 @@ public enum Feature implements FormatFeature { | |
* | ||
* @since 2.5 | ||
*/ | ||
WRITE_TYPE_HEADER(false) | ||
WRITE_TYPE_HEADER(false), | ||
|
||
/** | ||
* Feature that determines if an invalid surrogate encoding found in the | ||
* incoming String should fail with an exception or silently be output | ||
* as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD) | ||
* | ||
* @since 2.12 | ||
*/ | ||
LENIENT_UTF_ENCODING(false), | ||
|
||
; | ||
|
||
|
@@ -140,6 +154,13 @@ public int getMask() { | |
|
||
protected boolean _cfgMinimalInts; | ||
|
||
|
||
/** | ||
* If true we will output the REPLACEMENT_CHAR for invalid unicode sequences. | ||
* If false we will throw an IllegalArgumentException for invalid unicode sequences. | ||
*/ | ||
protected boolean _cfgLenientUnicodeEncoding; | ||
guillaumebort marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
/* | ||
/********************************************************** | ||
/* Output state | ||
|
@@ -234,6 +255,7 @@ public CBORGenerator(IOContext ctxt, int stdFeatures, int formatFeatures, | |
_cborContext = CBORWriteContext.createRootContext(dups); | ||
_formatFeatures = formatFeatures; | ||
_cfgMinimalInts = Feature.WRITE_MINIMAL_INTS.enabledIn(formatFeatures); | ||
_cfgLenientUnicodeEncoding = Feature.LENIENT_UTF_ENCODING.enabledIn(formatFeatures); | ||
_ioContext = ctxt; | ||
_out = out; | ||
_bufferRecyclable = true; | ||
|
@@ -406,6 +428,9 @@ public CBORGenerator enable(Feature f) { | |
if (f == Feature.WRITE_MINIMAL_INTS) { | ||
_cfgMinimalInts = true; | ||
} | ||
if (f == Feature.LENIENT_UTF_ENCODING) { | ||
_cfgLenientUnicodeEncoding = true; | ||
} | ||
return this; | ||
} | ||
|
||
|
@@ -414,6 +439,9 @@ public CBORGenerator disable(Feature f) { | |
if (f == Feature.WRITE_MINIMAL_INTS) { | ||
_cfgMinimalInts = false; | ||
} | ||
if (f == Feature.LENIENT_UTF_ENCODING) { | ||
_cfgLenientUnicodeEncoding = false; | ||
} | ||
return this; | ||
} | ||
|
||
|
@@ -1424,81 +1452,33 @@ private final int _encode(int outputPtr, char[] str, int i, int end) { | |
do { | ||
int c = str[i]; | ||
if (c > 0x7F) { | ||
return _shortUTF8Encode2(str, i, end, outputPtr, outputStart); | ||
return _encode2(i, outputPtr, str, end, outputStart); | ||
} | ||
outBuf[outputPtr++] = (byte) c; | ||
} while (++i < end); | ||
return outputPtr - outputStart; | ||
} | ||
|
||
/** | ||
* Helper method called when the whole character sequence is known to fit in | ||
* the output buffer, but not all characters are single-byte (ASCII) | ||
* characters. | ||
*/ | ||
private final int _shortUTF8Encode2(char[] str, int i, int end, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Curious as to why this was removed? Or did it just get moved and the diff is confused? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It has been removed because this code was duplicated: |
||
int outputPtr, int outputStart) { | ||
final byte[] outBuf = _outputBuffer; | ||
while (i < end) { | ||
int c = str[i++]; | ||
if (c <= 0x7F) { | ||
outBuf[outputPtr++] = (byte) c; | ||
continue; | ||
} | ||
// Nope, multi-byte: | ||
if (c < 0x800) { // 2-byte | ||
outBuf[outputPtr++] = (byte) (0xc0 | (c >> 6)); | ||
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f)); | ||
continue; | ||
} | ||
// 3 or 4 bytes (surrogate) | ||
// Surrogates? | ||
if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte character | ||
outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12)); | ||
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f)); | ||
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f)); | ||
continue; | ||
} | ||
// Yup, a surrogate pair | ||
if (c > SURR1_LAST) { // must be from first range; second won't do | ||
_throwIllegalSurrogate(c); | ||
} | ||
// ... meaning it must have a pair | ||
if (i >= end) { | ||
_throwIllegalSurrogate(c); | ||
} | ||
c = _convertSurrogate(c, str[i++]); | ||
if (c > 0x10FFFF) { // illegal in JSON as well as in XML | ||
_throwIllegalSurrogate(c); | ||
} | ||
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18)); | ||
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f)); | ||
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f)); | ||
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f)); | ||
} | ||
return (outputPtr - outputStart); | ||
} | ||
|
||
private final int _encode(int outputPtr, String str, int len) { | ||
final byte[] outBuf = _outputBuffer; | ||
final int outputStart = outputPtr; | ||
|
||
for (int i = 0; i < len; ++i) { | ||
int c = str.charAt(i); | ||
if (c > 0x7F) { | ||
return _encode2(i, outputPtr, str, len, outputStart); | ||
return _encode2(i, outputPtr, str.toCharArray(), len, outputStart); | ||
cowtowncoder marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
outBuf[outputPtr++] = (byte) c; | ||
} | ||
return (outputPtr - outputStart); | ||
} | ||
|
||
private final int _encode2(int i, int outputPtr, String str, int len, | ||
private final int _encode2(int i, int outputPtr, char[] str, int len, | ||
int outputStart) { | ||
final byte[] outBuf = _outputBuffer; | ||
// no; non-ASCII stuff, slower loop | ||
while (i < len) { | ||
int c = str.charAt(i++); | ||
int c = str[i++]; | ||
if (c <= 0x7F) { | ||
outBuf[outputPtr++] = (byte) c; | ||
continue; | ||
|
@@ -1520,20 +1500,31 @@ private final int _encode2(int i, int outputPtr, String str, int len, | |
} | ||
// Yup, a surrogate pair | ||
if (c > SURR1_LAST) { // must be from first range; second won't do | ||
_throwIllegalSurrogate(c); | ||
c = _illegalSurrogateFound(c); | ||
} | ||
// ... meaning it must have a pair | ||
if (i >= len) { | ||
_throwIllegalSurrogate(c); | ||
else if (i >= len) { | ||
c = _illegalSurrogateFound(c); | ||
} | ||
c = _convertSurrogate(c, str.charAt(i++)); | ||
if (c > 0x10FFFF) { // illegal in JSON as well as in XML | ||
_throwIllegalSurrogate(c); | ||
// ... verify that the next character is in range | ||
else if (str[i] < SURR2_FIRST || str[i] > SURR2_LAST) { | ||
c = _illegalSurrogatePairFound(c, str[i]); | ||
} | ||
// ... we have a valid surrogate pair | ||
else { | ||
c = _convertSurrogate(c, str[i++]); | ||
} | ||
// if we substituted the replacement char, we actually have a 3-byte char | ||
if (c == REPLACEMENT_CHAR) { | ||
outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12)); | ||
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f)); | ||
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f)); | ||
} else { | ||
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18)); | ||
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f)); | ||
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f)); | ||
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f)); | ||
} | ||
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18)); | ||
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f)); | ||
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f)); | ||
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f)); | ||
} | ||
return (outputPtr - outputStart); | ||
} | ||
|
@@ -1542,38 +1533,50 @@ private final int _encode2(int i, int outputPtr, String str, int len, | |
* Method called to calculate UTF codepoint, from a surrogate pair. | ||
*/ | ||
private int _convertSurrogate(int firstPart, int secondPart) { | ||
// Ok, then, is the second part valid? | ||
if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) { | ||
throw new IllegalArgumentException( | ||
"Broken surrogate pair: first char 0x" | ||
+ Integer.toHexString(firstPart) + ", second 0x" | ||
+ Integer.toHexString(secondPart) | ||
+ "; illegal combination"); | ||
} | ||
return 0x10000 + ((firstPart - SURR1_FIRST) << 10) | ||
int c = 0x10000 + ((firstPart - SURR1_FIRST) << 10) | ||
+ (secondPart - SURR2_FIRST); | ||
if (c > 0x10FFFF) { // illegal in JSON as well as in XML | ||
c = _illegalSurrogatePairFound(firstPart, secondPart); | ||
} | ||
return c; | ||
} | ||
|
||
private void _throwIllegalSurrogate(int code) { | ||
if (code > 0x10FFFF) { // over max? | ||
throw new IllegalArgumentException("Illegal character point (0x" | ||
+ Integer.toHexString(code) | ||
+ ") to output; max is 0x10FFFF as per RFC 4627"); | ||
private int _illegalSurrogatePairFound(int firstPart, int secondPart) { | ||
if (_cfgLenientUnicodeEncoding) { | ||
return REPLACEMENT_CHAR; | ||
} else { | ||
throw new IllegalArgumentException( | ||
"Broken surrogate pair: first char 0x" | ||
+ Integer.toHexString(firstPart) + ", second 0x" | ||
+ Integer.toHexString(secondPart) | ||
+ "; illegal combination"); | ||
} | ||
if (code >= SURR1_FIRST) { | ||
if (code <= SURR1_LAST) { // Unmatched first part (closing without | ||
// second part?) | ||
} | ||
|
||
private int _illegalSurrogateFound(int code) { | ||
if (_cfgLenientUnicodeEncoding) { | ||
return REPLACEMENT_CHAR; | ||
} else { | ||
if (code > 0x10FFFF) { // over max? | ||
throw new IllegalArgumentException("Illegal character point (0x" | ||
+ Integer.toHexString(code) | ||
+ ") to output; max is 0x10FFFF as per RFC 4627"); | ||
} | ||
if (code >= SURR1_FIRST) { | ||
if (code <= SURR1_LAST) { // Unmatched first part (closing without | ||
// second part?) | ||
throw new IllegalArgumentException( | ||
"Unmatched first part of surrogate pair (0x" | ||
+ Integer.toHexString(code) + ")"); | ||
} | ||
throw new IllegalArgumentException( | ||
"Unmatched first part of surrogate pair (0x" | ||
"Unmatched second part of surrogate pair (0x" | ||
+ Integer.toHexString(code) + ")"); | ||
} | ||
throw new IllegalArgumentException( | ||
"Unmatched second part of surrogate pair (0x" | ||
+ Integer.toHexString(code) + ")"); | ||
// should we ever get this? | ||
throw new IllegalArgumentException("Illegal character point (0x" | ||
+ Integer.toHexString(code) + ") to output"); | ||
} | ||
// should we ever get this? | ||
throw new IllegalArgumentException("Illegal character point (0x" | ||
+ Integer.toHexString(code) + ") to output"); | ||
} | ||
|
||
/* | ||
|
Uh oh!
There was an error while loading. Please reload this page.