Skip to content

Fixing a bug when multi-byte characters were split #75

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions src/main/java/com/fasterxml/aalto/out/ByteXmlWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ public final void writeStartTagEnd()
flushBuffer();
}
_outputBuffer[_outputPtr++] = BYTE_GT;
}
}

@Override
public void writeStartTagEmptyEnd()
Expand Down Expand Up @@ -435,7 +435,7 @@ public final void writeEndTag(WName name)
ptr += name.appendBytes(bbuf, ptr);
bbuf[ptr++] = BYTE_GT;
_outputPtr = ptr;
}
}

/*
/**********************************************************************
Expand Down Expand Up @@ -572,6 +572,8 @@ protected final void writeAttrValue(char[] vbuf, int offset, int len)
{
if (_surrogate != 0) {
outputSurrogates(_surrogate, vbuf[offset]);
// reset the temporary surrogate storage
_surrogate = 0;
++offset;
--len;
}
Expand Down Expand Up @@ -785,7 +787,7 @@ public int writeCData(char[] cbuf, int offset, int len)
writeCDataEnd(); // will check surrogates
}
return ix;
}
}

protected int writeCDataContents(char[] cbuf, int offset, int len)
throws IOException, XMLStreamException
Expand Down Expand Up @@ -865,7 +867,7 @@ protected int writeCDataContents(char[] cbuf, int offset, int len)
}
}
return -1;
}
}

@Override
public final void writeCharacters(String text)
Expand Down Expand Up @@ -908,6 +910,8 @@ public final void writeCharacters(char[] cbuf, int offset, int len)
{
if (_surrogate != 0) {
outputSurrogates(_surrogate, cbuf[offset]);
// reset the temporary surrogate storage
_surrogate = 0;
++offset;
--len;
}
Expand Down Expand Up @@ -1088,7 +1092,7 @@ private final void writeSplitCharacters(char[] cbuf, int offset, int len)
}
_outputBuffer[_outputPtr++] = (byte)ch;
}
}
}

/*
/**********************************************************************
Expand Down Expand Up @@ -1439,7 +1443,7 @@ public void writeXmlDeclaration(String version, String encoding, String standalo
// !!! TBI: check validity
writeRaw(version, 0, version.length());
writeRaw(BYTE_APOS);

if (encoding != null && encoding.length() > 0) {
writeRaw(BYTES_XMLDECL_ENCODING);
// !!! TBI: check validity
Expand All @@ -1453,7 +1457,7 @@ public void writeXmlDeclaration(String version, String encoding, String standalo
writeRaw(BYTE_APOS);
}
writeRaw(BYTE_QMARK, BYTE_GT);
}
}

/*
/**********************************************************************
Expand Down Expand Up @@ -1594,7 +1598,7 @@ protected final void flushBuffer()
protected final void writeAsEntity(int c)
throws IOException
{
// Quickie check to avoid
// Quickie check to avoid

byte[] buf = _outputBuffer;
int ptr = _outputPtr;
Expand Down
57 changes: 57 additions & 0 deletions src/test/java/com/fasterxml/aalto/sax/TestSaxWriter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package com.fasterxml.aalto.sax;

import com.fasterxml.aalto.out.Utf8XmlWriter;
import com.fasterxml.aalto.out.WriterConfig;

import java.io.ByteArrayOutputStream;

/**
 * Regression tests for surrogate-pair handling when an attribute value is
 * written through {@link Utf8XmlWriter}.
 *
 * Both tests place a surrogate pair so that its high surrogate sits at char
 * index 511 of the attribute value (presumably the last slot of a 512-char
 * internal buffer -- the buffer size is not visible from this test; confirm
 * against the writer). Before the fix, {@code ByteXmlWriter#_surrogate} was
 * not reset to 0 after the pending high surrogate had been written together
 * with its low surrogate, so the stale value leaked into later writes.
 */
public class TestSaxWriter extends base.BaseTestCase {

    /** One supplementary code point (U+1D7CE), stored in a Java String as two surrogate chars. */
    private static final String SURROGATE_PAIR = "\uD835\uDFCE";

    /** Count of filler chars before the pair; puts the high surrogate at char index 511. */
    private static final int LEADING_FILLER = 511;

    /**
     * Surrogate pair followed by 512 more filler chars.
     *
     * Before the fix this failed with
     * {@code javax.xml.stream.XMLStreamException: Incomplete surrogate pair in content:
     * first char 0xd835, second 0x78}: the stale high surrogate (0xd835) was
     * combined with a later, unrelated {@code 'x'} (0x78).
     */
    public void testSurrogateMemory1() throws Exception {
        StringBuilder testText = new StringBuilder();
        appendFiller(testText, LEADING_FILLER);
        testText.append(SURROGATE_PAIR);
        appendFiller(testText, 512);

        writeAttributeAndVerify(testText.toString());
    }

    /**
     * Surrogate pair is the very end of the attribute value.
     *
     * Before the fix this failed with
     * {@code java.io.IOException: Unpaired surrogate character (0xd835)}: the
     * stale high surrogate made the writer expect a low surrogate that never came.
     */
    public void testSurrogateMemory2() throws Exception {
        StringBuilder testText = new StringBuilder();
        appendFiller(testText, LEADING_FILLER);
        testText.append(SURROGATE_PAIR);

        writeAttributeAndVerify(testText.toString());
    }

    /** Appends {@code count} copies of {@code 'x'} to {@code target}. */
    private static void appendFiller(StringBuilder target, int count) {
        for (int i = 0; i < count; i++) {
            target.append('x');
        }
    }

    /**
     * Writes {@code <testelement testattr="...">} plus its end tag with the given
     * attribute value, then checks that the value appears unchanged in the output.
     *
     * The output check assumes that {@code close(false)} flushes pending bytes to
     * the stream and that the value (only {@code 'x'} chars and one supplementary
     * code point) is written without escaping -- TODO confirm against the writer.
     * A plain {@link AssertionError} is thrown because the assertion helpers of
     * {@code base.BaseTestCase} are not visible from this file.
     */
    private static void writeAttributeAndVerify(String attrValue) throws Exception {
        WriterConfig writerConfig = new WriterConfig();
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        Utf8XmlWriter writer = new Utf8XmlWriter(writerConfig, byteArrayOutputStream);

        writer.writeStartTagStart(writer.constructName("testelement"));
        writer.writeAttribute(writer.constructName("testattr"), attrValue);
        writer.writeStartTagEnd();
        writer.writeEndTag(writer.constructName("testelement"));
        writer.close(false);

        // Decode as UTF-8 since the writer under test is the UTF-8 variant.
        String written = byteArrayOutputStream.toString("UTF-8");
        if (!written.contains(attrValue)) {
            throw new AssertionError(
                    "Attribute value (including the surrogate pair) was not written intact;"
                    + " output length was " + written.length() + " chars");
        }
    }
}