Skip to content

Add CBORGenerator.Feature.LENIENT_UTF_ENCODING for lenient handling of Unicode surrogate pairs on writing #222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ public class CBORGenerator extends GeneratorBase
{
private final static int[] NO_INTS = new int[0];

/**
* The Unicode replacement character (U+FFFD) used to fix invalid Unicode sequences.
*/
final static int REPLACEMENT_CHAR = 0xfffd;

/**
* Let's ensure that we have big enough output buffer because of safety
* margins we need for UTF-8 encoding.
Expand Down Expand Up @@ -63,7 +68,16 @@ public enum Feature implements FormatFeature {
*
* @since 2.5
*/
WRITE_TYPE_HEADER(false)
WRITE_TYPE_HEADER(false),

/**
* Feature that determines whether an invalid surrogate encoding found in the
* incoming String should fail with an exception or be silently output
* as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD).
*
* @since 2.12
*/
LENIENT_UTF_ENCODING(false),

;

Expand Down Expand Up @@ -140,6 +154,13 @@ public int getMask() {

protected boolean _cfgMinimalInts;


/**
* If true, we will output the REPLACEMENT_CHAR for invalid Unicode sequences.
* If false, we will throw an IllegalArgumentException for invalid Unicode sequences.
*/
protected boolean _cfgLenientUnicodeEncoding;

/*
/**********************************************************
/* Output state
Expand Down Expand Up @@ -234,6 +255,7 @@ public CBORGenerator(IOContext ctxt, int stdFeatures, int formatFeatures,
_cborContext = CBORWriteContext.createRootContext(dups);
_formatFeatures = formatFeatures;
_cfgMinimalInts = Feature.WRITE_MINIMAL_INTS.enabledIn(formatFeatures);
_cfgLenientUnicodeEncoding = Feature.LENIENT_UTF_ENCODING.enabledIn(formatFeatures);
_ioContext = ctxt;
_out = out;
_bufferRecyclable = true;
Expand Down Expand Up @@ -406,6 +428,9 @@ public CBORGenerator enable(Feature f) {
if (f == Feature.WRITE_MINIMAL_INTS) {
_cfgMinimalInts = true;
}
if (f == Feature.LENIENT_UTF_ENCODING) {
_cfgLenientUnicodeEncoding = true;
}
return this;
}

Expand All @@ -414,6 +439,9 @@ public CBORGenerator disable(Feature f) {
if (f == Feature.WRITE_MINIMAL_INTS) {
_cfgMinimalInts = false;
}
if (f == Feature.LENIENT_UTF_ENCODING) {
_cfgLenientUnicodeEncoding = false;
}
return this;
}

Expand Down Expand Up @@ -1424,81 +1452,33 @@ private final int _encode(int outputPtr, char[] str, int i, int end) {
do {
int c = str[i];
if (c > 0x7F) {
return _shortUTF8Encode2(str, i, end, outputPtr, outputStart);
return _encode2(i, outputPtr, str, end, outputStart);
}
outBuf[outputPtr++] = (byte) c;
} while (++i < end);
return outputPtr - outputStart;
}

/**
* Helper method called when the whole character sequence is known to fit in
* the output buffer, but not all characters are single-byte (ASCII)
* characters.
*/
private final int _shortUTF8Encode2(char[] str, int i, int end,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious as to why this was removed? Or did it just get moved and diff is confused.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was removed because the code was duplicated: _shortUTF8Encode2 and _encode2 were basically the same, the only difference being that one took a String as an argument and the other took a char[]. Since this code contains some really non-trivial logic, I thought it would be better not to duplicate it.

int outputPtr, int outputStart) {
final byte[] outBuf = _outputBuffer;
while (i < end) {
int c = str[i++];
if (c <= 0x7F) {
outBuf[outputPtr++] = (byte) c;
continue;
}
// Nope, multi-byte:
if (c < 0x800) { // 2-byte
outBuf[outputPtr++] = (byte) (0xc0 | (c >> 6));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// 3 or 4 bytes (surrogate)
// Surrogates?
if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte character
outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// Yup, a surrogate pair
if (c > SURR1_LAST) { // must be from first range; second won't do
_throwIllegalSurrogate(c);
}
// ... meaning it must have a pair
if (i >= end) {
_throwIllegalSurrogate(c);
}
c = _convertSurrogate(c, str[i++]);
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
_throwIllegalSurrogate(c);
}
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
}
return (outputPtr - outputStart);
}

private final int _encode(int outputPtr, String str, int len) {
final byte[] outBuf = _outputBuffer;
final int outputStart = outputPtr;

for (int i = 0; i < len; ++i) {
int c = str.charAt(i);
if (c > 0x7F) {
return _encode2(i, outputPtr, str, len, outputStart);
return _encode2(i, outputPtr, str.toCharArray(), len, outputStart);
}
outBuf[outputPtr++] = (byte) c;
}
return (outputPtr - outputStart);
}

private final int _encode2(int i, int outputPtr, String str, int len,
private final int _encode2(int i, int outputPtr, char[] str, int len,
int outputStart) {
final byte[] outBuf = _outputBuffer;
// no; non-ASCII stuff, slower loop
while (i < len) {
int c = str.charAt(i++);
int c = str[i++];
if (c <= 0x7F) {
outBuf[outputPtr++] = (byte) c;
continue;
Expand All @@ -1520,20 +1500,31 @@ private final int _encode2(int i, int outputPtr, String str, int len,
}
// Yup, a surrogate pair
if (c > SURR1_LAST) { // must be from first range; second won't do
_throwIllegalSurrogate(c);
c = _illegalSurrogateFound(c);
}
// ... meaning it must have a pair
if (i >= len) {
_throwIllegalSurrogate(c);
else if (i >= len) {
c = _illegalSurrogateFound(c);
}
c = _convertSurrogate(c, str.charAt(i++));
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
_throwIllegalSurrogate(c);
// ... verify that the next character is in range
else if (str[i] < SURR2_FIRST || str[i] > SURR2_LAST) {
c = _illegalSurrogatePairFound(c, str[i]);
}
// ... we have a valid surrogate pair
else {
c = _convertSurrogate(c, str[i++]);
}
// if we substituted the replacement char, we actually have a 3-byte char
if (c == REPLACEMENT_CHAR) {
outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
} else {
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
}
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
}
return (outputPtr - outputStart);
}
Expand All @@ -1542,38 +1533,50 @@ private final int _encode2(int i, int outputPtr, String str, int len,
* Method called to calculate UTF codepoint, from a surrogate pair.
*/
private int _convertSurrogate(int firstPart, int secondPart) {
// Ok, then, is the second part valid?
if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
throw new IllegalArgumentException(
"Broken surrogate pair: first char 0x"
+ Integer.toHexString(firstPart) + ", second 0x"
+ Integer.toHexString(secondPart)
+ "; illegal combination");
}
return 0x10000 + ((firstPart - SURR1_FIRST) << 10)
int c = 0x10000 + ((firstPart - SURR1_FIRST) << 10)
+ (secondPart - SURR2_FIRST);
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
c = _illegalSurrogatePairFound(firstPart, secondPart);
}
return c;
}

private void _throwIllegalSurrogate(int code) {
if (code > 0x10FFFF) { // over max?
throw new IllegalArgumentException("Illegal character point (0x"
+ Integer.toHexString(code)
+ ") to output; max is 0x10FFFF as per RFC 4627");
private int _illegalSurrogatePairFound(int firstPart, int secondPart) {
if (_cfgLenientUnicodeEncoding) {
return REPLACEMENT_CHAR;
} else {
throw new IllegalArgumentException(
"Broken surrogate pair: first char 0x"
+ Integer.toHexString(firstPart) + ", second 0x"
+ Integer.toHexString(secondPart)
+ "; illegal combination");
}
if (code >= SURR1_FIRST) {
if (code <= SURR1_LAST) { // Unmatched first part (closing without
// second part?)
}

private int _illegalSurrogateFound(int code) {
if (_cfgLenientUnicodeEncoding) {
return REPLACEMENT_CHAR;
} else {
if (code > 0x10FFFF) { // over max?
throw new IllegalArgumentException("Illegal character point (0x"
+ Integer.toHexString(code)
+ ") to output; max is 0x10FFFF as per RFC 4627");
}
if (code >= SURR1_FIRST) {
if (code <= SURR1_LAST) { // Unmatched first part (closing without
// second part?)
throw new IllegalArgumentException(
"Unmatched first part of surrogate pair (0x"
+ Integer.toHexString(code) + ")");
}
throw new IllegalArgumentException(
"Unmatched first part of surrogate pair (0x"
"Unmatched second part of surrogate pair (0x"
+ Integer.toHexString(code) + ")");
}
throw new IllegalArgumentException(
"Unmatched second part of surrogate pair (0x"
+ Integer.toHexString(code) + ")");
// should we ever get this?
throw new IllegalArgumentException("Illegal character point (0x"
+ Integer.toHexString(code) + ") to output");
}
// should we ever get this?
throw new IllegalArgumentException("Illegal character point (0x"
+ Integer.toHexString(code) + ") to output");
}

/*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,14 @@ protected CBORGenerator cborGenerator(CBORFactory f,
return f.createGenerator(result, null);
}

/**
 * Test helper: builds a generator writing to {@code result} via
 * {@code cborGenerator(result)} and switches on
 * {@link CBORGenerator.Feature#LENIENT_UTF_ENCODING} before handing it back.
 *
 * @param result stream the generator will write its output to
 * @return the generator with lenient UTF encoding enabled
 * @throws IOException if the underlying generator cannot be created
 */
protected CBORGenerator lenientUnicodeCborGenerator(ByteArrayOutputStream result)
    throws IOException
{
    // enable(Feature) returns the generator itself, so the call can be chained
    return cborGenerator(result).enable(CBORGenerator.Feature.LENIENT_UTF_ENCODING);
}

/*
/**********************************************************
/* Additional assertion methods
Expand Down
Loading