From 9094b646e0504bb69ceb636109db6ce47f26ccef Mon Sep 17 00:00:00 2001 From: Jeremy Long Date: Thu, 13 Nov 2025 06:45:05 -0500 Subject: [PATCH 1/2] feat: add XML 1.1 encoding resolves #73 --- .../main/java/org/owasp/encoder/Encode.java | 81 ++++++ .../main/java/org/owasp/encoder/Encoders.java | 27 ++ .../java/org/owasp/encoder/XMLEncoder.java | 100 +++++++- .../org/owasp/encoder/XML11EncoderTest.java | 242 ++++++++++++++++++ 4 files changed, 437 insertions(+), 13 deletions(-) create mode 100644 core/src/test/java/org/owasp/encoder/XML11EncoderTest.java diff --git a/core/src/main/java/org/owasp/encoder/Encode.java b/core/src/main/java/org/owasp/encoder/Encode.java index 67972d1..be5ddab 100644 --- a/core/src/main/java/org/owasp/encoder/Encode.java +++ b/core/src/main/java/org/owasp/encoder/Encode.java @@ -868,6 +868,87 @@ public static void forXmlComment(Writer out, String input) encode(Encoders.XML_COMMENT_ENCODER, out, input); } + /** + * Encoder for XML 1.1 contexts. Similar to {@link #forXml(String)} but + * follows the XML 1.1 specification which allows all control characters + * (except null) to be encoded as character references. This method encodes + * control characters in the ranges [#x1-#x8, #xB-#xC, #xE-#x1F, #x7F-#x9F] + * as character references (e.g., {@code &#x01;}), while tab, line feed, + * and carriage return are passed through unencoded. This is safe for use + * in both XML 1.1 content and attributes. + * + * @param input the input to encode + * @return the encoded result + */ + public static String forXml11(String input) { + return encode(Encoders.XML_11_ENCODER, input); + } + + /** + * See {@link #forXml11(String)} for description of encoding. This + * version writes directly to a Writer without an intervening string.
+ * + * @param out where to write encoded output + * @param input the input string to encode + * @throws IOException if thrown by writer + */ + public static void forXml11(Writer out, String input) + throws IOException + { + encode(Encoders.XML_11_ENCODER, out, input); + } + + /** + * Encoder for XML 1.1 content. Similar to {@link #forXmlContent(String)} + * but follows the XML 1.1 specification for control character handling. + * + * @param input the input to encode + * @return the encoded result + */ + public static String forXml11Content(String input) { + return encode(Encoders.XML_11_CONTENT_ENCODER, input); + } + + /** + * See {@link #forXml11Content(String)} for description of encoding. This + * version writes directly to a Writer without an intervening string. + * + * @param out where to write encoded output + * @param input the input string to encode + * @throws IOException if thrown by writer + */ + public static void forXml11Content(Writer out, String input) + throws IOException + { + encode(Encoders.XML_11_CONTENT_ENCODER, out, input); + } + + /** + * Encoder for XML 1.1 attribute content. Similar to + * {@link #forXmlAttribute(String)} but follows the XML 1.1 specification + * for control character handling. + * + * @param input the input to encode + * @return the encoded result + */ + public static String forXml11Attribute(String input) { + return encode(Encoders.XML_11_ATTRIBUTE_ENCODER, input); + } + + /** + * See {@link #forXml11Attribute(String)} for description of encoding. This + * version writes directly to a Writer without an intervening string. + * + * @param out where to write encoded output + * @param input the input string to encode + * @throws IOException if thrown by writer + */ + public static void forXml11Attribute(Writer out, String input) + throws IOException + { + encode(Encoders.XML_11_ATTRIBUTE_ENCODER, out, input); + } + /** * Encodes data for an XML CDATA section. 
On the chance that the input * contains a terminating {@code "]]>"}, it will be replaced by diff --git a/core/src/main/java/org/owasp/encoder/Encoders.java b/core/src/main/java/org/owasp/encoder/Encoders.java index 3879fd0..0e3a553 100644 --- a/core/src/main/java/org/owasp/encoder/Encoders.java +++ b/core/src/main/java/org/owasp/encoder/Encoders.java @@ -88,6 +88,18 @@ public final class Encoders { * Name of {@linkplain Encode#forXmlComment(String) XML comment} context. */ public static final String XML_COMMENT = "xml-comment"; + /** + * Name of XML 1.1 general context. + */ + public static final String XML_11 = "xml-1.1"; + /** + * Name of XML 1.1 content context. + */ + public static final String XML_11_CONTENT = "xml-1.1-content"; + /** + * Name of XML 1.1 attribute context. + */ + public static final String XML_11_ATTRIBUTE = "xml-1.1-attribute"; /** * Name of {@linkplain Encode#forCDATA(String) CDATA} context. */ @@ -160,6 +172,21 @@ public final class Encoders { */ static final XMLCommentEncoder XML_COMMENT_ENCODER = map(XML_COMMENT, new XMLCommentEncoder()); + /** + * Encoder for general XML 1.1 contexts. + */ + static final XMLEncoder XML_11_ENCODER + = map(XML_11, new XMLEncoder(XMLEncoder.Mode.ALL, XMLEncoder.Version.XML_1_1)); + /** + * Encoder for XML 1.1 content contexts. + */ + static final XMLEncoder XML_11_CONTENT_ENCODER + = map(XML_11_CONTENT, new XMLEncoder(XMLEncoder.Mode.CONTENT, XMLEncoder.Version.XML_1_1)); + /** + * Encoder for XML 1.1 attribute contexts. + */ + static final XMLEncoder XML_11_ATTRIBUTE_ENCODER + = map(XML_11_ATTRIBUTE, new XMLEncoder(XMLEncoder.Mode.ATTRIBUTE, XMLEncoder.Version.XML_1_1)); /** * Encoder for CDATA contexts. 
*/ diff --git a/core/src/main/java/org/owasp/encoder/XMLEncoder.java b/core/src/main/java/org/owasp/encoder/XMLEncoder.java index cec6205..f32ad22 100644 --- a/core/src/main/java/org/owasp/encoder/XMLEncoder.java +++ b/core/src/main/java/org/owasp/encoder/XMLEncoder.java @@ -92,6 +92,27 @@ class XMLEncoder extends Encoder { * The encoded length of a double-quotation character. */ static final int QUOT_LENGTH = 5; + /** + * The encoded length of a control character reference (e.g., &#x01;). + */ + static final int CONTROL_CHAR_REF_LENGTH = 6; + + /** + * An enum of supported XML versions for the XMLEncoder. + */ + enum Version { + /** + * XML 1.0 - control characters (except tab, lf, cr) are replaced with space. + * Valid chars: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + */ + XML_1_0, + /** + * XML 1.1 - control characters (except tab, lf, cr) are encoded as character references. + * All chars [#x1-#x10FFFF] are allowed (excluding noncharacters). + * Restricted chars [#x1-#x8, #xB-#xC, #xE-#x1F, #x7F-#x9F] must be encoded. + */ + XML_1_1 + } /** * An enum of supported "modes" of operation for the XMLEncoder. @@ -174,27 +195,45 @@ long validMask() { * implementation. */ private final Mode _mode; + /** + * The XML version for this encoder. + */ + private final Version _version; /** - * Default constructor--equivalent to XMLEncoder(Mode.ALL). + * Default constructor--equivalent to XMLEncoder(Mode.ALL, Version.XML_1_0). */ XMLEncoder() { - this(Mode.ALL); + this(Mode.ALL, Version.XML_1_0); } /** - * Creates an XMLEncoder for the specified mode constant. + * Creates an XMLEncoder for the specified mode constant with XML 1.0. * * @param mode the mode of the encoder. */ XMLEncoder(Mode mode) { + this(mode, Version.XML_1_0); + } + + /** + * Creates an XMLEncoder for the specified mode and version. + * + * @param mode the mode of the encoder. + * @param version the XML version for the encoder.
+ */ + XMLEncoder(Mode mode, Version version) { _mode = mode; + _version = version; _validMask = mode.validMask(); } @Override public int maxEncodedLength(int n) { - // "&amp;" = 5 chars. + // "&amp;" = 5 chars, "&#x01;" = 6 chars (XML 1.1 control chars) + if (_version == Version.XML_1_1) { + return n * CONTROL_CHAR_REF_LENGTH; + } return n * MAX_ENCODED_CHAR_LENGTH; } @@ -213,6 +252,7 @@ public int firstEncodedOffset(String input, int off, int len) { } } else if (ch < Character.MIN_HIGH_SURROGATE) { if (ch <= Unicode.MAX_C1_CTRL_CHAR && ch != Unicode.NEL) { + // C1 control character - needs encoding in XML 1.1 or replacement in XML 1.0 return i; // } else { // // valid @@ -314,23 +354,57 @@ protected CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean out[j++] = ';'; break; default: - // invalid character - if (j >= m) { - return overflow(input, i, output, j); + // invalid character for XML 1.0 + if (_version == Version.XML_1_1 && ch != 0) { + // In XML 1.1, encode C0 control characters (except null) as character references + if (j + CONTROL_CHAR_REF_LENGTH > m) { + return overflow(input, i, output, j); + } + out[j++] = '&'; + out[j++] = '#'; + out[j++] = 'x'; + int val = ch; + out[j++] = Character.forDigit((val >> 4) & 0xF, 16); + out[j++] = Character.forDigit(val & 0xF, 16); + out[j++] = ';'; + } else { + // XML 1.0: replace invalid character with space + // XML 1.1: null is still invalid, replace with space + if (j >= m) { + return overflow(input, i, output, j); + } + out[j++] = INVALID_CHARACTER_REPLACEMENT; } - out[j++] = INVALID_CHARACTER_REPLACEMENT; break; } } } else if (ch < Character.MIN_HIGH_SURROGATE) { - if (j >= m) { - return overflow(input, i, output, j); - } if (ch > Unicode.MAX_C1_CTRL_CHAR || ch == Unicode.NEL) { + if (j >= m) { + return overflow(input, i, output, j); + } out[j++] = ch; } else { // C1 control code - out[j++] = INVALID_CHARACTER_REPLACEMENT; + if (_version == Version.XML_1_1) { + // In XML 1.1, encode C1 control characters
(except NEL) as character references + if (j + CONTROL_CHAR_REF_LENGTH > m) { + return overflow(input, i, output, j); + } + out[j++] = '&'; + out[j++] = '#'; + out[j++] = 'x'; + int val = ch; + out[j++] = Character.forDigit((val >> 4) & 0xF, 16); + out[j++] = Character.forDigit(val & 0xF, 16); + out[j++] = ';'; + } else { + // XML 1.0: replace invalid character with space + if (j >= m) { + return overflow(input, i, output, j); + } + out[j++] = INVALID_CHARACTER_REPLACEMENT; + } } } else if (ch <= Character.MAX_HIGH_SURROGATE) { if (i + 1 < n) { @@ -389,6 +463,6 @@ protected CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean @Override public String toString() { - return "XMLEncoder(" + _mode + ")"; + return "XMLEncoder(" + _mode + ", " + _version + ")"; } } diff --git a/core/src/test/java/org/owasp/encoder/XML11EncoderTest.java b/core/src/test/java/org/owasp/encoder/XML11EncoderTest.java new file mode 100644 index 0000000..fa947fe --- /dev/null +++ b/core/src/test/java/org/owasp/encoder/XML11EncoderTest.java @@ -0,0 +1,242 @@ +// Copyright (c) 2012 Jeff Ichnowski +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above +// copyright notice, this list of conditions and the following +// disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials +// provided with the distribution. +// +// * Neither the name of the OWASP nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +// OF THE POSSIBILITY OF SUCH DAMAGE. + +package org.owasp.encoder; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; + +/** + * XML11EncoderTest -- test suite for the XML 1.1 encoder. 
+ * + * @author Jeff Ichnowski + */ +public class XML11EncoderTest extends TestCase { + + public XML11EncoderTest(String testName) { + super(testName); + } + + public static Test suite() { + TestSuite suite = new TestSuite(); + for (XMLEncoder.Mode mode : XMLEncoder.Mode.values()) { + XMLEncoder encoder = new XMLEncoder(mode, XMLEncoder.Version.XML_1_1); + EncoderTestSuiteBuilder builder = new EncoderTestSuiteBuilder(encoder, "-safe-", "-&-") + .encode("safe", "safe") + .encode("unencoded &amp; encoded", "unencoded & encoded") + .encode("valid-surrogate-pair", "\ud800\udc00", "\ud800\udc00") + .encode("missing-low-surrogate", " ", "\ud800") + .encode("missing-high-surrogate", " ", "\udc00") + .encode("valid-upper-char", "\ufffd", "\ufffd") + .encode("invalid-upper-char", " ", "\uffff") + + // XML 1.1 specific: control characters are encoded, not replaced + .encode("control-char-0x01", "&#x01;", "\u0001") + .encode("control-char-0x02", "&#x02;", "\u0002") + .encode("control-char-0x08", "&#x08;", "\u0008") + .encode("control-char-0x0B", "&#x0b;", "\u000B") + .encode("control-char-0x0C", "&#x0c;", "\u000C") + .encode("control-char-0x0E", "&#x0e;", "\u000E") + .encode("control-char-0x1F", "&#x1f;", "\u001F") + + // C1 control characters (0x7F-0x9F) should be encoded in XML 1.1 + .encode("control-char-0x7F", "&#x7f;", "\u007F") + .encode("control-char-0x80", "&#x80;", "\u0080") + .encode("control-char-0x9F", "&#x9f;", "\u009F") + + // Tab, LF, CR are still passed through unencoded + .encode("tab-char", "\t", "\t") + .encode("lf-char", "\n", "\n") + .encode("cr-char", "\r", "\r") + + // NEL (0x85) is valid and unencoded in XML 1.1 + .encode("nel-char", "\u0085", "\u0085") + + // Combined test + .encode("mixed-control-chars", "&#x01;a\t&#x7f;b\nc", "\u0001a\t\u007Fb\nc"); + + // Invalid characters: null, non-characters, surrogates should be replaced + builder.invalid(0x00, 0x00) + .invalid(Character.MIN_SURROGATE, Character.MAX_SURROGATE) + .invalid(0xfdd0, 0xfdef) + .invalid(0xfffe, 0xffff) + .invalid(0x1fffe, 0x1ffff) + .invalid(0x2fffe,
0x2ffff) + .invalid(0x3fffe, 0x3ffff) + .invalid(0x4fffe, 0x4ffff) + .invalid(0x5fffe, 0x5ffff) + .invalid(0x6fffe, 0x6ffff) + .invalid(0x7fffe, 0x7ffff) + .invalid(0x8fffe, 0x8ffff) + .invalid(0x9fffe, 0x9ffff) + .invalid(0xafffe, 0xaffff) + .invalid(0xbfffe, 0xbffff) + .invalid(0xcfffe, 0xcffff) + .invalid(0xdfffe, 0xdffff) + .invalid(0xefffe, 0xeffff) + .invalid(0xffffe, 0xfffff) + .invalid(0x10fffe, 0x10ffff); + + // Mark all characters as valid (they're allowed in XML 1.1, even if they get encoded) + builder.valid(0x01, Character.MAX_CODE_POINT); + + switch (mode) { + case ALL: + builder.encoded("&><\'\"") + .encode("&amp;", "&") + .encode("&gt;", ">") + .encode("&lt;", "<") + .encode("&#39;", "\'") + .encode("&#34;", "\""); + break; + case CONTENT: + builder.encoded("&><") + .encode("&amp;", "&") + .encode("&gt;", ">") + .encode("&lt;", "<") + .encode("\'", "\'") + .encode("\"", "\""); + break; + case ATTRIBUTE: + builder.encoded("&<\'\"") + .encode("&amp;", "&") + .encode(">", ">") + .encode("&lt;", "<") + .encode("&#39;", "\'") + .encode("&#34;", "\""); + break; + case SINGLE_QUOTED_ATTRIBUTE: + builder.encoded("&<\'") + .encode("&amp;", "&") + .encode(">", ">") + .encode("&lt;", "<") + .encode("&#39;", "\'") + .encode("\"", "\""); + break; + case DOUBLE_QUOTED_ATTRIBUTE: + builder.encoded("&<\"") + .encode("&amp;", "&") + .encode(">", ">") + .encode("&lt;", "<") + .encode("\'", "\'") + .encode("&#34;", "\""); + break; + default: + throw new AssertionError("untested mode: "+mode); + } + + suite.addTest(builder + .invalidSuite(XMLEncoder.INVALID_CHARACTER_REPLACEMENT) + .encodedSuite() + .build()); + } + return suite; + } + + /** + * Test that the public API methods work correctly for XML 1.1.
+ */ + public void testXML11PublicAPI() { + String input = "test\u0001\u0002&<>"; + + // Test forXml11 + String result = Encode.forXml11(input); + assertEquals("test&#x01;&#x02;&amp;&lt;&gt;", result); + + // Test forXml11Content + result = Encode.forXml11Content(input); + assertEquals("test&#x01;&#x02;&amp;&lt;&gt;", result); + + // Test forXml11Attribute + result = Encode.forXml11Attribute(input); + assertEquals("test&#x01;&#x02;&amp;&lt;>", result); + } + + /** + * Test that tab, lf, and cr are not encoded in XML 1.1. + */ + public void testXML11AllowedControlChars() { + String input = "a\tb\nc\rd"; + String result = Encode.forXml11(input); + assertEquals("a\tb\nc\rd", result); + } + + /** + * Test that C0 control characters (except tab, lf, cr) are encoded in XML 1.1. + */ + public void testXML11C0ControlChars() { + // Test each C0 control character + for (int i = 1; i <= 0x1F; i++) { + if (i == 0x09 || i == 0x0A || i == 0x0D) { + // Tab, LF, CR should not be encoded + continue; + } + char ch = (char) i; + String input = "a" + ch + "b"; + String result = Encode.forXml11(input); + String expected = "a&#x" + Integer.toHexString(i) + ";b"; + assertEquals("C0 control char 0x" + Integer.toHexString(i) + " should be encoded", + expected, result); + } + } + + /** + * Test that C1 control characters (except NEL) are encoded in XML 1.1. + */ + public void testXML11C1ControlChars() { + // Test DEL and C1 control characters + for (int i = 0x7F; i <= 0x9F; i++) { + if (i == 0x85) { + // NEL should not be encoded + continue; + } + char ch = (char) i; + String input = "a" + ch + "b"; + String result = Encode.forXml11(input); + String expected = "a&#x" + Integer.toHexString(i) + ";b"; + assertEquals("C1 control char 0x" + Integer.toHexString(i) + " should be encoded", + expected, result); + } + } + + /** + * Test that NEL (0x85) is not encoded in XML 1.1.
+ */ + public void testXML11NEL() { + String input = "a\u0085b"; + String result = Encode.forXml11(input); + assertEquals("a\u0085b", result); + } +} From b7cb589a1ec2ca11cf4e9da0655a7e77973589e3 Mon Sep 17 00:00:00 2001 From: Jeremy Long Date: Thu, 13 Nov 2025 07:03:04 -0500 Subject: [PATCH 2/2] fix: apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- core/src/test/java/org/owasp/encoder/XML11EncoderTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/test/java/org/owasp/encoder/XML11EncoderTest.java b/core/src/test/java/org/owasp/encoder/XML11EncoderTest.java index fa947fe..e71e609 100644 --- a/core/src/test/java/org/owasp/encoder/XML11EncoderTest.java +++ b/core/src/test/java/org/owasp/encoder/XML11EncoderTest.java @@ -206,7 +206,7 @@ public void testXML11C0ControlChars() { char ch = (char) i; String input = "a" + ch + "b"; String result = Encode.forXml11(input); - String expected = "a&#x" + Integer.toHexString(i) + ";b"; + String expected = "a&#x" + String.format("%02x", i) + ";b"; assertEquals("C0 control char 0x" + Integer.toHexString(i) + " should be encoded", expected, result); } @@ -225,8 +225,8 @@ public void testXML11C1ControlChars() { char ch = (char) i; String input = "a" + ch + "b"; String result = Encode.forXml11(input); - String expected = "a&#x" + Integer.toHexString(i) + ";b"; - assertEquals("C1 control char 0x" + Integer.toHexString(i) + " should be encoded", + String expected = "a&#x" + String.format("%02x", i) + ";b"; + assertEquals("C1 control char 0x" + String.format("%02x", i) + " should be encoded", expected, result); } }