Skip to content

Commit

Permalink
Allow PrettyPrintWriter to replace invalid XML characters when not …
Browse files Browse the repository at this point in the history
…running in quirks mode. Closes #335.
  • Loading branch information
basil authored and joehni committed Sep 27, 2023
1 parent dbe845d commit c2deef3
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 18 deletions.
1 change: 1 addition & 0 deletions xstream-distribution/src/content/changes.html
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ <h1 id="upcoming-1.4.x">Upcoming 1.4.x maintenance release</h1>
<h2>Minor changes</h2>

<ul>
<li>GHPR:#335: Allow PrettyPrintWriter to replace invalid XML characters when not running in quirks mode (by Basil Crow).</li>
<li>GHPR:#331, GHI:#326: Fix handling of empty java.util.concurrent.atomic.AtomicReference (by Alex Blekhman of Atlassian).</li>
<li>GHPR:#334: Fix remaining buffer size calculation in QuickWriter (by Higuchi Yuta).</li>
<li>GHI:#342: Optimize internal handling of children in DomReader avoiding O(n²) access times for siblings (by Shiang-Yun Yang).</li>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2004, 2005, 2006 Joe Walnes.
* Copyright (C) 2006, 2007, 2008, 2009, 2011, 2013, 2014, 2015 XStream Committers.
* Copyright (C) 2006, 2007, 2008, 2009, 2011, 2013, 2014, 2015, 2023 XStream Committers.
* All rights reserved.
*
* The software in this package is published under the terms of the BSD
Expand Down Expand Up @@ -42,6 +42,8 @@
* href="http://www.w3.org/TR/2006/REC-xml11-20060816/#charsets">1.1</a>. If a character is not supported, a
* {@link StreamException} is thrown. Select a proper parser implementation that respects the version in the XML header
* (the Xpp3 parser will also read character entities of normally invalid characters).
* Alternatively, select the {@link #XML_1_0_REPLACEMENT} or {@link #XML_1_1_REPLACEMENT} mode, which writes the
* U+FFFD replacement character in place of each invalid character instead of throwing an exception.
* </p>
*
* @author Joe Walnes
Expand All @@ -52,6 +54,8 @@ public class PrettyPrintWriter extends AbstractXmlWriter {
/** Quirks mode: the character validity checks applied by the other modes are skipped when writing text. */
public static int XML_QUIRKS = -1;
/** XML 1.0 mode: a {@link StreamException} is thrown for a character that is not valid in XML 1.0. */
public static int XML_1_0 = 0;
/** XML 1.1 mode: a {@link StreamException} is thrown for a character that is not valid in XML 1.1. */
public static int XML_1_1 = 1;
/** Like {@link #XML_1_0}, but an invalid character is written as the U+FFFD character reference instead. */
public static int XML_1_0_REPLACEMENT = 2;
/** Like {@link #XML_1_1}, but an invalid character is written as the U+FFFD character reference instead. */
public static int XML_1_1_REPLACEMENT = 3;
// NOTE(review): these fields are not final; the constructor's range check compares against their current values.

private final QuickWriter writer;
private final FastStack<String> elementStack = new FastStack<>(16);
Expand All @@ -71,6 +75,7 @@ public class PrettyPrintWriter extends AbstractXmlWriter {
private static final char[] QUOT = "&quot;".toCharArray();
private static final char[] APOS = "&apos;".toCharArray();
private static final char[] CLOSE = "</".toCharArray();
private static final char[] REPLACEMENT = "&#xfffd;".toCharArray();

/**
* @since 1.4
Expand All @@ -80,8 +85,8 @@ public PrettyPrintWriter(final Writer writer, final int mode, final char[] lineI
this.writer = new QuickWriter(writer);
this.lineIndenter = lineIndenter;
this.mode = mode;
if (mode < XML_QUIRKS || mode > XML_1_1) {
throw new IllegalArgumentException("Not a valid XML mode");
if (mode < XML_QUIRKS || mode > XML_1_1_REPLACEMENT) {
throw new IllegalArgumentException("Not a valid XML mode: " + mode);
}
}

Expand Down Expand Up @@ -213,6 +218,8 @@ private void writeText(final String text, final boolean isAttribute) {
case '\0':
if (mode == XML_QUIRKS) {
writer.write(NULL);
} else if (mode == XML_1_0_REPLACEMENT || mode == XML_1_1_REPLACEMENT) {
writer.write(REPLACEMENT);
} else {
throw new StreamException("Invalid character 0x0 in XML stream");
}
Expand Down Expand Up @@ -244,32 +251,53 @@ private void writeText(final String text, final boolean isAttribute) {
//$FALL-THROUGH$
default:
if (Character.isDefined(c) && !Character.isISOControl(c)) {
boolean replaced = false;
if (mode != XML_QUIRKS) {
if (c > '\ud7ff' && c < '\ue000') {
throw new StreamException("Invalid character 0x"
+ Integer.toHexString(c)
+ " in XML stream");
if (mode == XML_1_0_REPLACEMENT || mode == XML_1_1_REPLACEMENT) {
writer.write(REPLACEMENT);
replaced = true;
} else {
throw new StreamException("Invalid character 0x"
+ Integer.toHexString(c)
+ " in XML stream");
}
}
}
writer.write(c);
if (!replaced) {
writer.write(c);
}
} else {
if (mode == XML_1_0) {
boolean replaced = false;
if (mode == XML_1_0 || mode == XML_1_0_REPLACEMENT) {
if (c < 9 || c == '\u000b' || c == '\u000c' || c == '\u000e' || c >= '\u000f' && c <= '\u001f') {
throw new StreamException("Invalid character 0x"
+ Integer.toHexString(c)
+ " in XML 1.0 stream");
if (mode == XML_1_0_REPLACEMENT) {
writer.write(REPLACEMENT);
replaced = true;
} else {
throw new StreamException("Invalid character 0x"
+ Integer.toHexString(c)
+ " in XML 1.0 stream");
}
}
}
if (mode != XML_QUIRKS) {
if (c == '\ufffe' || c == '\uffff') {
throw new StreamException("Invalid character 0x"
+ Integer.toHexString(c)
+ " in XML stream");
if (mode == XML_1_0_REPLACEMENT || mode == XML_1_1_REPLACEMENT) {
writer.write(REPLACEMENT);
replaced = true;
} else {
throw new StreamException("Invalid character 0x"
+ Integer.toHexString(c)
+ " in XML stream");
}
}
}
writer.write("&#x");
writer.write(Integer.toHexString(c));
writer.write(';');
if (!replaced) {
writer.write("&#x");
writer.write(Integer.toHexString(c));
writer.write(';');
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2004, 2005 Joe Walnes.
* Copyright (C) 2006, 2007, 2008, 2013, 2018 XStream Committers.
* Copyright (C) 2006, 2007, 2008, 2013, 2018, 2023 XStream Committers.
* All rights reserved.
*
* The software in this package is published under the terms of the BSD
Expand Down Expand Up @@ -168,6 +168,24 @@ public void testThrowsForNullInXml1_1Mode() {
}
}

/**
 * Checks that a NUL character set as node value in XML 1.0 replacement mode is written as the U+FFFD
 * character reference.
 */
public void testReplacesNullInXml1_0ReplacementMode() {
    final String nul = "\u0000";
    final String expectedXml = "<tag>&#xfffd;</tag>";

    writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_0_REPLACEMENT);
    writer.startNode("tag");
    writer.setValue(nul);
    writer.endNode();

    assertXmlProducedIs(expectedXml);
}

/**
 * Checks that a NUL character set as node value in XML 1.1 replacement mode is written as the U+FFFD
 * character reference.
 */
public void testReplacesNullInXml1_1ReplacementMode() {
    final String nul = "\u0000";
    final String expectedXml = "<tag>&#xfffd;</tag>";

    writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_1_REPLACEMENT);
    writer.startNode("tag");
    writer.setValue(nul);
    writer.endNode();

    assertXmlProducedIs(expectedXml);
}

public void testSupportsOnlyValidControlCharactersInXml1_0Mode() {
writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_0);
writer.startNode("tag");
Expand Down Expand Up @@ -237,6 +255,65 @@ public void testSupportsOnlyValidControlCharactersInXml1_1Mode() {
+ "&#x98;&#x99;&#x9a;&#x9b;&#x9c;&#x9d;&#x9e;&#x9f;</tag>");
}

/**
 * Writes every C0 control character, DEL and every C1 control character as a separate value in XML 1.0
 * replacement mode. The expected output has the U+FFFD character reference for each C0 control character
 * except tab, line feed and carriage return; the remaining characters appear as themselves or as their
 * own character reference.
 */
public void testReplacesInvalidControlCharactersInXml1_0ReplacementMode() {
    writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_0_REPLACEMENT);
    writer.startNode("tag");
    final String ctrl = ""
        + "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007"
        + "\u0008\u0009\n\u000b\u000c\r\u000e\u000f"
        + "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017"
        + "\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f"
        + "\u007f"
        + "\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"
        + "\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f"
        + "\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097"
        + "\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f"
        + "";
    for (int i = 0; i < ctrl.length(); i++) {
        // String.valueOf replaces the deprecated Character(char) boxing constructor.
        writer.setValue(String.valueOf(ctrl.charAt(i)));
    }
    writer.endNode();

    assertXmlProducedIs("<tag>&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;"
        + "&#xfffd;\t\n&#xfffd;&#xfffd;&#xd;&#xfffd;&#xfffd;"
        + "&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;"
        + "&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;"
        + "&#x7f;"
        + "&#x80;&#x81;&#x82;&#x83;&#x84;&#x85;&#x86;&#x87;"
        + "&#x88;&#x89;&#x8a;&#x8b;&#x8c;&#x8d;&#x8e;&#x8f;"
        + "&#x90;&#x91;&#x92;&#x93;&#x94;&#x95;&#x96;&#x97;"
        + "&#x98;&#x99;&#x9a;&#x9b;&#x9c;&#x9d;&#x9e;&#x9f;</tag>");
}

/**
 * Writes every C0 control character, DEL and every C1 control character as a separate value in XML 1.1
 * replacement mode. The expected output has the U+FFFD character reference only for NUL; the remaining
 * characters appear as themselves or as their own character reference.
 */
public void testReplacesInvalidControlCharactersInXml1_1ReplacementMode() {
    writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_1_REPLACEMENT);
    writer.startNode("tag");
    final String ctrl = ""
        + "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007"
        + "\u0008\u0009\n\u000b\u000c\r\u000e\u000f"
        + "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017"
        + "\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f"
        + "\u007f"
        + "\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"
        + "\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f"
        + "\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097"
        + "\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f"
        + "";
    for (int i = 0; i < ctrl.length(); i++) {
        // String.valueOf replaces the deprecated Character(char) boxing constructor.
        writer.setValue(String.valueOf(ctrl.charAt(i)));
    }
    writer.endNode();

    assertXmlProducedIs("<tag>&#xfffd;&#x1;&#x2;&#x3;&#x4;&#x5;&#x6;&#x7;"
        + "&#x8;\t\n&#xb;&#xc;&#xd;&#xe;&#xf;"
        + "&#x10;&#x11;&#x12;&#x13;&#x14;&#x15;&#x16;&#x17;"
        + "&#x18;&#x19;&#x1a;&#x1b;&#x1c;&#x1d;&#x1e;&#x1f;&#x7f;"
        + "&#x80;&#x81;&#x82;&#x83;&#x84;&#x85;&#x86;&#x87;"
        + "&#x88;&#x89;&#x8a;&#x8b;&#x8c;&#x8d;&#x8e;&#x8f;"
        + "&#x90;&#x91;&#x92;&#x93;&#x94;&#x95;&#x96;&#x97;"
        + "&#x98;&#x99;&#x9a;&#x9b;&#x9c;&#x9d;&#x9e;&#x9f;</tag>");
}

public void testSupportsInvalidUnicodeCharacterslInQuirksMode() {
writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_QUIRKS);
writer.startNode("tag");
Expand Down Expand Up @@ -295,6 +372,30 @@ public void testThrowsForInvalidUnicodeCharacterslInXml1_1Mode() {
assertXmlProducedIs("<tag>&#xd7ff;\ue000\ufffd</tag>");
}

/**
 * Writes the characters around the surrogate range and at the end of the BMP as separate values in XML 1.0
 * replacement mode. The expected output has the U+FFFD character reference for the two surrogate code units
 * (U+D800, U+DFFF) and for U+FFFE and U+FFFF, while U+D7FF, U+E000 and U+FFFD are preserved.
 */
public void testReplacesInvalidUnicodeCharactersInXml1_0ReplacementMode() {
    writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_0_REPLACEMENT);
    writer.startNode("tag");
    final String boundaryChars = "\ud7ff\ud800\udfff\ue000\ufffd\ufffe\uffff";
    for (int i = 0; i < boundaryChars.length(); i++) {
        // String.valueOf replaces the deprecated Character(char) boxing constructor.
        writer.setValue(String.valueOf(boundaryChars.charAt(i)));
    }
    writer.endNode();

    assertXmlProducedIs("<tag>&#xd7ff;&#xfffd;&#xfffd;\ue000\ufffd&#xfffd;&#xfffd;</tag>");
}

/**
 * Writes the characters around the surrogate range and at the end of the BMP as separate values in XML 1.1
 * replacement mode. The expected output has the U+FFFD character reference for the two surrogate code units
 * (U+D800, U+DFFF) and for U+FFFE and U+FFFF, while U+D7FF, U+E000 and U+FFFD are preserved.
 */
public void testReplacesInvalidUnicodeCharactersInXml1_1ReplacementMode() {
    writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_1_REPLACEMENT);
    writer.startNode("tag");
    final String boundaryChars = "\ud7ff\ud800\udfff\ue000\ufffd\ufffe\uffff";
    for (int i = 0; i < boundaryChars.length(); i++) {
        // String.valueOf replaces the deprecated Character(char) boxing constructor.
        writer.setValue(String.valueOf(boundaryChars.charAt(i)));
    }
    writer.endNode();

    assertXmlProducedIs("<tag>&#xd7ff;&#xfffd;&#xfffd;\ue000\ufffd&#xfffd;&#xfffd;</tag>");
}

private String replace(final String in, final char what, final String with) {
final int pos = in.indexOf(what);
if (pos == -1) {
Expand Down

0 comments on commit c2deef3

Please sign in to comment.