FasterXML · guillaumebort · Sep 29, 2020 · Sep 30, 2020 · cowtowncoder · Sep 29, 2020
diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java
@@ -22,6 +22,11 @@ public class CBORGenerator extends GeneratorBase
 {
     private final static int[] NO_INTS = new int[0];
 
+    /**
+     * The replacement character to use to fix invalid unicode sequences.
+     */
+    final static int REPLACEMENT_CHAR = 0xfffd;
+
     /**
      * Let's ensure that we have big enough output buffer because of safety
      * margins we need for UTF-8 encoding.
@@ -63,7 +68,16 @@ public enum Feature implements FormatFeature {
          *
          * @since 2.5
          */
-        WRITE_TYPE_HEADER(false)
+        WRITE_TYPE_HEADER(false),
+
+        /**
+         * Feature that determines if an invalid surrogate encoding found in the
+         * incoming String should fail with an exception or silently be outputed
+         * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD)
+         *
+         * @since 2.12
+         */
+        LENIENT_UTF_ENCODING(false),
 
         ;
 
@@ -140,6 +154,13 @@ public int getMask() {
 
     protected boolean _cfgMinimalInts;
 
+
+    /**
+     * If true we will output the REPLACEMENT_CHAR for invalid unicode sequences.
+     * If false we will throw an IllegalArgumentException for invalid unicode sequences.
+     */
+    protected boolean _cfgLenientUnicodeEncoding;
+
     /*
     /**********************************************************
     /* Output state
@@ -234,6 +255,7 @@ public CBORGenerator(IOContext ctxt, int stdFeatures, int formatFeatures,
         _cborContext = CBORWriteContext.createRootContext(dups);
         _formatFeatures = formatFeatures;
         _cfgMinimalInts = Feature.WRITE_MINIMAL_INTS.enabledIn(formatFeatures);
+        _cfgLenientUnicodeEncoding = Feature.LENIENT_UTF_ENCODING.enabledIn(formatFeatures);
         _ioContext = ctxt;
         _out = out;
         _bufferRecyclable = true;
@@ -406,6 +428,9 @@ public CBORGenerator enable(Feature f) {
         if (f == Feature.WRITE_MINIMAL_INTS) {
             _cfgMinimalInts = true;
         }
+        if (f == Feature.LENIENT_UTF_ENCODING) {
+            _cfgLenientUnicodeEncoding = true;
+        }
         return this;
     }
 
@@ -414,6 +439,9 @@ public CBORGenerator disable(Feature f) {
         if (f == Feature.WRITE_MINIMAL_INTS) {
             _cfgMinimalInts = false;
         }
+        if (f == Feature.LENIENT_UTF_ENCODING) {
+            _cfgLenientUnicodeEncoding = false;
+        }
         return this;
     }
 
@@ -1424,81 +1452,33 @@ private final int _encode(int outputPtr, char[] str, int i, int end) {
         do {
             int c = str[i];
             if (c > 0x7F) {
-                return _shortUTF8Encode2(str, i, end, outputPtr, outputStart);
+                return _encode2(i, outputPtr, str, end, outputStart);
             }
             outBuf[outputPtr++] = (byte) c;
         } while (++i < end);
         return outputPtr - outputStart;
     }
 
-    /**
-     * Helper method called when the whole character sequence is known to fit in
-     * the output buffer, but not all characters are single-byte (ASCII)
-     * characters.
-     */
-    private final int _shortUTF8Encode2(char[] str, int i, int end,
-            int outputPtr, int outputStart) {
-        final byte[] outBuf = _outputBuffer;
-        while (i < end) {
-            int c = str[i++];
-            if (c <= 0x7F) {
-                outBuf[outputPtr++] = (byte) c;
-                continue;
-            }
-            // Nope, multi-byte:
-            if (c < 0x800) { // 2-byte
-                outBuf[outputPtr++] = (byte) (0xc0 | (c >> 6));
-                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
-                continue;
-            }
-            // 3 or 4 bytes (surrogate)
-            // Surrogates?
-            if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte character
-                outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
-                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
-                continue;
-            }
-            // Yup, a surrogate pair
-            if (c > SURR1_LAST) { // must be from first range; second won't do
-                _throwIllegalSurrogate(c);
-            }
-            // ... meaning it must have a pair
-            if (i >= end) {
-                _throwIllegalSurrogate(c);
-            }
-            c = _convertSurrogate(c, str[i++]);
-            if (c > 0x10FFFF) { // illegal in JSON as well as in XML
-                _throwIllegalSurrogate(c);
-            }
-            outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
-        }
-        return (outputPtr - outputStart);
-    }
-
     private final int _encode(int outputPtr, String str, int len) {
         final byte[] outBuf = _outputBuffer;
         final int outputStart = outputPtr;
 
         for (int i = 0; i < len; ++i) {
             int c = str.charAt(i);
             if (c > 0x7F) {
-                return _encode2(i, outputPtr, str, len, outputStart);
+                return _encode2(i, outputPtr, str.toCharArray(), len, outputStart);
             }
             outBuf[outputPtr++] = (byte) c;
         }
         return (outputPtr - outputStart);
     }
 
-    private final int _encode2(int i, int outputPtr, String str, int len,
+    private final int _encode2(int i, int outputPtr, char[] str, int len,
             int outputStart) {
         final byte[] outBuf = _outputBuffer;
         // no; non-ASCII stuff, slower loop
         while (i < len) {
-            int c = str.charAt(i++);
+            int c = str[i++];
             if (c <= 0x7F) {
                 outBuf[outputPtr++] = (byte) c;
                 continue;
@@ -1520,20 +1500,31 @@ private final int _encode2(int i, int outputPtr, String str, int len,
             }
             // Yup, a surrogate pair
             if (c > SURR1_LAST) { // must be from first range; second won't do
-                _throwIllegalSurrogate(c);
+                c = _illegalSurrogateFound(c);
             }
             // ... meaning it must have a pair
-            if (i >= len) {
-                _throwIllegalSurrogate(c);
+            else if (i >= len) {
+                c = _illegalSurrogateFound(c);
             }
-            c = _convertSurrogate(c, str.charAt(i++));
-            if (c > 0x10FFFF) { // illegal in JSON as well as in XML
-                _throwIllegalSurrogate(c);
+            // ... verify that the next character is in range
+            else if (str[i] < SURR2_FIRST || str[i] > SURR2_LAST) {
+                c = _illegalSurrogatePairFound(c, str[i]);
+            }
+            // ... we have a valid surrogate pair
+            else {
+                c = _convertSurrogate(c, str[i++]);
+            }
+            // if we replaced by the replacement char we actually have a 3 bytes char
+            if (c == REPLACEMENT_CHAR) {
+                outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
+                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
+                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
+            } else {
+                outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
+                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
+                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
+                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
             }
-            outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
         }
         return (outputPtr - outputStart);
     }
@@ -1542,38 +1533,50 @@ private final int _encode2(int i, int outputPtr, String str, int len,
      * Method called to calculate UTF codepoint, from a surrogate pair.
      */
     private int _convertSurrogate(int firstPart, int secondPart) {
-        // Ok, then, is the second part valid?
-        if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
-            throw new IllegalArgumentException(
-                    "Broken surrogate pair: first char 0x"
-                            + Integer.toHexString(firstPart) + ", second 0x"
-                            + Integer.toHexString(secondPart)
-                            + "; illegal combination");
-        }
-        return 0x10000 + ((firstPart - SURR1_FIRST) << 10)
+        int c = 0x10000 + ((firstPart - SURR1_FIRST) << 10)
                 + (secondPart - SURR2_FIRST);
+        if (c > 0x10FFFF) { // illegal in JSON as well as in XML
+            c = _illegalSurrogatePairFound(firstPart, secondPart);
+        }
+        return c;
     }
 
-    private void _throwIllegalSurrogate(int code) {
-        if (code > 0x10FFFF) { // over max?
-            throw new IllegalArgumentException("Illegal character point (0x"
-                    + Integer.toHexString(code)
-                    + ") to output; max is 0x10FFFF as per RFC 4627");
+    private int _illegalSurrogatePairFound(int firstPart, int secondPart) {
+        if (_cfgLenientUnicodeEncoding) {
+            return REPLACEMENT_CHAR;
+        } else {
+            throw new IllegalArgumentException(
+                "Broken surrogate pair: first char 0x"
+                        + Integer.toHexString(firstPart) + ", second 0x"
+                        + Integer.toHexString(secondPart)
+                        + "; illegal combination");
         }
-        if (code >= SURR1_FIRST) {
-            if (code <= SURR1_LAST) { // Unmatched first part (closing without
-                                      // second part?)
+    }
+
+    private int _illegalSurrogateFound(int code) {
+        if (_cfgLenientUnicodeEncoding) {
+            return REPLACEMENT_CHAR;
+        } else {
+            if (code > 0x10FFFF) { // over max?
+                throw new IllegalArgumentException("Illegal character point (0x"
+                        + Integer.toHexString(code)
+                        + ") to output; max is 0x10FFFF as per RFC 4627");
+            }
+            if (code >= SURR1_FIRST) {
+                if (code <= SURR1_LAST) { // Unmatched first part (closing without
+                                        // second part?)
+                    throw new IllegalArgumentException(
+                            "Unmatched first part of surrogate pair (0x"
+                                    + Integer.toHexString(code) + ")");
+                }
                 throw new IllegalArgumentException(
-                        "Unmatched first part of surrogate pair (0x"
+                        "Unmatched second part of surrogate pair (0x"
                                 + Integer.toHexString(code) + ")");
             }
-            throw new IllegalArgumentException(
-                    "Unmatched second part of surrogate pair (0x"
-                            + Integer.toHexString(code) + ")");
+            // should we ever get this?
+            throw new IllegalArgumentException("Illegal character point (0x"
+                    + Integer.toHexString(code) + ") to output");
         }
-        // should we ever get this?
-        throw new IllegalArgumentException("Illegal character point (0x"
-                + Integer.toHexString(code) + ") to output");
     }
 
     /*

diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java
@@ -85,6 +85,14 @@ protected CBORGenerator cborGenerator(CBORFactory f,
         return f.createGenerator(result, null);
     }
 
+    protected CBORGenerator lenientUnicodeCborGenerator(ByteArrayOutputStream result)
+        throws IOException
+    {
+        CBORGenerator gen = cborGenerator(result);
+        gen.enable(CBORGenerator.Feature.LENIENT_UTF_ENCODING);
+        return gen;
+    }
+
     /*
     /**********************************************************
     /* Additional assertion methods