diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java index ebf8cb56b..7b75bd10a 100644 --- a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java +++ b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java @@ -22,6 +22,11 @@ public class CBORGenerator extends GeneratorBase { private final static int[] NO_INTS = new int[0]; + /** + * The replacement character to use to fix invalid unicode sequences. + */ + final static int REPLACEMENT_CHAR = 0xfffd; + /** * Let's ensure that we have big enough output buffer because of safety * margins we need for UTF-8 encoding. @@ -63,7 +68,16 @@ public enum Feature implements FormatFeature { * * @since 2.5 */ - WRITE_TYPE_HEADER(false) + WRITE_TYPE_HEADER(false), + + /** + * Feature that determines if an invalid surrogate encoding found in the + * incoming String should fail with an exception or silently be output + * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD) + * + * @since 2.12 + */ + LENIENT_UTF_ENCODING(false), ; @@ -140,6 +154,13 @@ public int getMask() { protected boolean _cfgMinimalInts; + + /** + * If true we will output the REPLACEMENT_CHAR for invalid unicode sequences. + * If false we will throw an IllegalArgumentException for invalid unicode sequences. 
+ */ + protected boolean _cfgLenientUnicodeEncoding; + /* /********************************************************** /* Output state @@ -234,6 +255,7 @@ public CBORGenerator(IOContext ctxt, int stdFeatures, int formatFeatures, _cborContext = CBORWriteContext.createRootContext(dups); _formatFeatures = formatFeatures; _cfgMinimalInts = Feature.WRITE_MINIMAL_INTS.enabledIn(formatFeatures); + _cfgLenientUnicodeEncoding = Feature.LENIENT_UTF_ENCODING.enabledIn(formatFeatures); _ioContext = ctxt; _out = out; _bufferRecyclable = true; @@ -406,6 +428,9 @@ public CBORGenerator enable(Feature f) { if (f == Feature.WRITE_MINIMAL_INTS) { _cfgMinimalInts = true; } + if (f == Feature.LENIENT_UTF_ENCODING) { + _cfgLenientUnicodeEncoding = true; + } return this; } @@ -414,6 +439,9 @@ public CBORGenerator disable(Feature f) { if (f == Feature.WRITE_MINIMAL_INTS) { _cfgMinimalInts = false; } + if (f == Feature.LENIENT_UTF_ENCODING) { + _cfgLenientUnicodeEncoding = false; + } return this; } @@ -1424,61 +1452,13 @@ private final int _encode(int outputPtr, char[] str, int i, int end) { do { int c = str[i]; if (c > 0x7F) { - return _shortUTF8Encode2(str, i, end, outputPtr, outputStart); + return _encode2(i, outputPtr, str, end, outputStart); } outBuf[outputPtr++] = (byte) c; } while (++i < end); return outputPtr - outputStart; } - /** - * Helper method called when the whole character sequence is known to fit in - * the output buffer, but not all characters are single-byte (ASCII) - * characters. - */ - private final int _shortUTF8Encode2(char[] str, int i, int end, - int outputPtr, int outputStart) { - final byte[] outBuf = _outputBuffer; - while (i < end) { - int c = str[i++]; - if (c <= 0x7F) { - outBuf[outputPtr++] = (byte) c; - continue; - } - // Nope, multi-byte: - if (c < 0x800) { // 2-byte - outBuf[outputPtr++] = (byte) (0xc0 | (c >> 6)); - outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f)); - continue; - } - // 3 or 4 bytes (surrogate) - // Surrogates? 
- if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte character - outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12)); - outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f)); - outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f)); - continue; - } - // Yup, a surrogate pair - if (c > SURR1_LAST) { // must be from first range; second won't do - _throwIllegalSurrogate(c); - } - // ... meaning it must have a pair - if (i >= end) { - _throwIllegalSurrogate(c); - } - c = _convertSurrogate(c, str[i++]); - if (c > 0x10FFFF) { // illegal in JSON as well as in XML - _throwIllegalSurrogate(c); - } - outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18)); - outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f)); - outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f)); - outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f)); - } - return (outputPtr - outputStart); - } - private final int _encode(int outputPtr, String str, int len) { final byte[] outBuf = _outputBuffer; final int outputStart = outputPtr; @@ -1486,19 +1466,19 @@ private final int _encode(int outputPtr, String str, int len) { for (int i = 0; i < len; ++i) { int c = str.charAt(i); if (c > 0x7F) { - return _encode2(i, outputPtr, str, len, outputStart); + return _encode2(i, outputPtr, str.toCharArray(), len, outputStart); } outBuf[outputPtr++] = (byte) c; } return (outputPtr - outputStart); } - private final int _encode2(int i, int outputPtr, String str, int len, + private final int _encode2(int i, int outputPtr, char[] str, int len, int outputStart) { final byte[] outBuf = _outputBuffer; // no; non-ASCII stuff, slower loop while (i < len) { - int c = str.charAt(i++); + int c = str[i++]; if (c <= 0x7F) { outBuf[outputPtr++] = (byte) c; continue; @@ -1520,20 +1500,31 @@ private final int _encode2(int i, int outputPtr, String str, int len, } // Yup, a surrogate pair if (c > SURR1_LAST) { // must be from first range; second won't do - _throwIllegalSurrogate(c); + c = _illegalSurrogateFound(c); } // ... 
meaning it must have a pair - if (i >= len) { - _throwIllegalSurrogate(c); + else if (i >= len) { + c = _illegalSurrogateFound(c); } - c = _convertSurrogate(c, str.charAt(i++)); - if (c > 0x10FFFF) { // illegal in JSON as well as in XML - _throwIllegalSurrogate(c); + // ... verify that the next character is in range + else if (str[i] < SURR2_FIRST || str[i] > SURR2_LAST) { + c = _illegalSurrogatePairFound(c, str[i]); + } + // ... we have a valid surrogate pair + else { + c = _convertSurrogate(c, str[i++]); + } + // if we replaced by the replacement char we actually have a 3 bytes char + if (c == REPLACEMENT_CHAR) { + outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12)); + outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f)); + outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f)); + } else { + outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18)); + outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f)); + outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f)); + outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f)); } - outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18)); - outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f)); - outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f)); - outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f)); } return (outputPtr - outputStart); } @@ -1542,38 +1533,50 @@ private final int _encode2(int i, int outputPtr, String str, int len, * Method called to calculate UTF codepoint, from a surrogate pair. */ private int _convertSurrogate(int firstPart, int secondPart) { - // Ok, then, is the second part valid? 
- if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) { - throw new IllegalArgumentException( - "Broken surrogate pair: first char 0x" - + Integer.toHexString(firstPart) + ", second 0x" - + Integer.toHexString(secondPart) - + "; illegal combination"); - } - return 0x10000 + ((firstPart - SURR1_FIRST) << 10) + int c = 0x10000 + ((firstPart - SURR1_FIRST) << 10) + (secondPart - SURR2_FIRST); + if (c > 0x10FFFF) { // illegal in JSON as well as in XML + c = _illegalSurrogatePairFound(firstPart, secondPart); + } + return c; } - private void _throwIllegalSurrogate(int code) { - if (code > 0x10FFFF) { // over max? - throw new IllegalArgumentException("Illegal character point (0x" - + Integer.toHexString(code) - + ") to output; max is 0x10FFFF as per RFC 4627"); + private int _illegalSurrogatePairFound(int firstPart, int secondPart) { + if (_cfgLenientUnicodeEncoding) { + return REPLACEMENT_CHAR; + } else { + throw new IllegalArgumentException( + "Broken surrogate pair: first char 0x" + + Integer.toHexString(firstPart) + ", second 0x" + + Integer.toHexString(secondPart) + + "; illegal combination"); } - if (code >= SURR1_FIRST) { - if (code <= SURR1_LAST) { // Unmatched first part (closing without - // second part?) + } + + private int _illegalSurrogateFound(int code) { + if (_cfgLenientUnicodeEncoding) { + return REPLACEMENT_CHAR; + } else { + if (code > 0x10FFFF) { // over max? + throw new IllegalArgumentException("Illegal character point (0x" + + Integer.toHexString(code) + + ") to output; max is 0x10FFFF as per RFC 4627"); + } + if (code >= SURR1_FIRST) { + if (code <= SURR1_LAST) { // Unmatched first part (closing without + // second part?) 
+ throw new IllegalArgumentException( + "Unmatched first part of surrogate pair (0x" + + Integer.toHexString(code) + ")"); + } throw new IllegalArgumentException( - "Unmatched first part of surrogate pair (0x" + "Unmatched second part of surrogate pair (0x" + Integer.toHexString(code) + ")"); } - throw new IllegalArgumentException( - "Unmatched second part of surrogate pair (0x" - + Integer.toHexString(code) + ")"); + // should we ever get this? + throw new IllegalArgumentException("Illegal character point (0x" + + Integer.toHexString(code) + ") to output"); } - // should we ever get this? - throw new IllegalArgumentException("Illegal character point (0x" - + Integer.toHexString(code) + ") to output"); } /* diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java index 43c609637..a4a4e553d 100644 --- a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java +++ b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java @@ -85,6 +85,14 @@ protected CBORGenerator cborGenerator(CBORFactory f, return f.createGenerator(result, null); } + protected CBORGenerator lenientUnicodeCborGenerator(ByteArrayOutputStream result) + throws IOException + { + CBORGenerator gen = cborGenerator(result); + gen.enable(CBORGenerator.Feature.LENIENT_UTF_ENCODING); + return gen; + } + /* /********************************************************** /* Additional assertion methods diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/gen/UnicodeGenerationTest.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/gen/UnicodeGenerationTest.java new file mode 100644 index 000000000..5fb96d0a2 --- /dev/null +++ b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/gen/UnicodeGenerationTest.java @@ -0,0 +1,114 @@ + +import java.io.*; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.*; + +import 
org.junit.Assert; + +import com.fasterxml.jackson.core.JsonGenerationException; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import com.fasterxml.jackson.dataformat.cbor.CBORConstants; +import com.fasterxml.jackson.dataformat.cbor.CBORGenerator; +import com.fasterxml.jackson.dataformat.cbor.CBORParser; +import com.fasterxml.jackson.dataformat.cbor.CBORTestBase; + +public class UnicodeGenerationTest extends CBORTestBase +{ + /** + * Test that encoding a String containing invalid surrogates fails with an exception + */ + public void testFailForInvalidSurrogate() throws Exception + { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + CBORGenerator gen = cborGenerator(out); + + assertEquals(0, gen.getOutputBuffered()); + + // Unmatched first surrogate character + try { + gen.writeString("x\ud83d"); + } catch (IllegalArgumentException e) { + } + assertEquals(0, gen.getOutputBuffered()); + + // Unmatched second surrogate character + try { + gen.writeString("x\ude01"); + } catch (IllegalArgumentException e) { + } + assertEquals(0, gen.getOutputBuffered()); + + // Unmatched second surrogate character (2) + try { + gen.writeString("x\ude01x"); + } catch (IllegalArgumentException e) { + } + assertEquals(0, gen.getOutputBuffered()); + + // Broken surrogate pair + try { + gen.writeString("x\ud83dx"); + } catch (IllegalArgumentException e) { + } + assertEquals(0, gen.getOutputBuffered()); + } + + /** + * Test that when the lenient unicode feature is enabled, the replacement character is used to fix invalid sequences + */ + public void testRecoverInvalidSurrogate() throws Exception + { + ByteArrayOutputStream out; + CBORGenerator gen; + byte[] b; + + out = new ByteArrayOutputStream(); + gen = lenientUnicodeCborGenerator(out); + assertEquals(0, gen.getOutputBuffered()); + + // Unmatched first surrogate character + gen.writeString("x\ud83d"); + gen.close(); + b = "x\ufffd".getBytes("utf-8"); + _verifyBytes(out.toByteArray(), + (byte) 
(CBORConstants.PREFIX_TYPE_TEXT + b.length), b); + + out = new ByteArrayOutputStream(); + gen = lenientUnicodeCborGenerator(out); + assertEquals(0, gen.getOutputBuffered()); + + // Unmatched second surrogate character + gen.writeString("x\ude01"); + gen.close(); + b = "x\ufffd".getBytes("utf-8"); + _verifyBytes(out.toByteArray(), + (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b); + + out = new ByteArrayOutputStream(); + gen = lenientUnicodeCborGenerator(out); + assertEquals(0, gen.getOutputBuffered()); + + // Unmatched second surrogate character (2) + gen.writeString("x\ude01x"); + gen.close(); + b = "x\ufffdx".getBytes("utf-8"); + _verifyBytes(out.toByteArray(), + (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b); + + out = new ByteArrayOutputStream(); + gen = lenientUnicodeCborGenerator(out); + assertEquals(0, gen.getOutputBuffered()); + + // Broken surrogate pair + gen.writeString("x\ud83dx"); + gen.close(); + b = "x\ufffdx".getBytes("utf-8"); + _verifyBytes(out.toByteArray(), + (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b); + + } + +} \ No newline at end of file