From d6d7ced675373011278554346150485904f3a4f3 Mon Sep 17 00:00:00 2001 From: ongdisheng Date: Sun, 1 Mar 2026 12:32:43 +0000 Subject: [PATCH 1/2] [ASTERIXDB-2877][CSV] Fix multi-byte/emoji character corruption in CSV output --- .../data/nontagged/printers/PrintTools.java | 6 +- .../nontagged/printers/PrintToolsTest.java | 160 ++++++++++++++++++ 2 files changed, 163 insertions(+), 3 deletions(-) create mode 100644 asterixdb/asterix-om/src/test/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintToolsTest.java diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java index 8c5dfbbbbb4..2d608ee2eea 100644 --- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java +++ b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java @@ -312,7 +312,7 @@ public static void writeUTF8StringAsCSV(byte[] b, int s, int l, PrintStream ps, boolean shouldQuote = forceQuote; if (!shouldQuote) { // Check if the string contains any special characters that require quoting - for (int i = position; i < maxPosition; i++) { + for (int i = position; i < maxPosition; i += UTF8StringUtil.charSize(b, i)) { char c = UTF8StringUtil.charAt(b, i); if (c == quote || c == '\r' || c == '\n' || c == escape || c == delimiter) { shouldQuote = true; @@ -340,9 +340,9 @@ public static void writeUTF8StringAsCSV(byte[] b, int s, int l, PrintStream ps, continue; } - // Write the character bytes + // Write the character bytes as raw UTF-8 bytes to avoid charset encoding bugs while (sz > 0) { - ps.print(c); + ps.write(b[position]); ++position; --sz; } diff --git a/asterixdb/asterix-om/src/test/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintToolsTest.java b/asterixdb/asterix-om/src/test/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintToolsTest.java new file mode 100644 index 00000000000..957c4149c88 --- /dev/null +++ b/asterixdb/asterix-om/src/test/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintToolsTest.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.asterix.dataflow.data.nontagged.printers; + +import static org.junit.Assert.assertEquals; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; + +import org.apache.hyracks.util.string.UTF8StringUtil; +import org.junit.Test; + +public class PrintToolsTest { + + private static final char QUOTE = '"'; + private static final char ESCAPE = '"'; + private static final char DELIMITER = ','; + + public static String csvOf(String input) throws Exception { + byte[] bytes = UTF8StringUtil.writeStringToBytes(input); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + PrintStream ps = new PrintStream(baos, true, StandardCharsets.UTF_8); + PrintTools.writeUTF8StringAsCSV(bytes, 0, bytes.length, ps, QUOTE, false, ESCAPE, DELIMITER); + return baos.toString(StandardCharsets.UTF_8); + } + + public static String jsonOf(String input) throws Exception { + byte[] bytes = UTF8StringUtil.writeStringToBytes(input); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + PrintTools.writeUTF8StringAsJSON(bytes, 0, bytes.length, baos); + return baos.toString(StandardCharsets.UTF_8); + } + + @Test + public void testCsvAsciiPlain() throws Exception { + assertEquals("hello", csvOf("hello")); + } + + @Test + public void testCsvAsciiEmpty() throws Exception { + assertEquals("", csvOf("")); + } + + @Test + public void testCsvAsciiQuotingOnComma() throws Exception { + assertEquals("\"a,b\"", csvOf("a,b")); + } + + @Test + public void testCsvAsciiQuotingOnNewline() throws Exception { + assertEquals("\"a\nb\"", csvOf("a\nb")); + } + + @Test + public void testCsvAsciiEscapeQuote() throws Exception { + assertEquals("\"say \"\"hi\"\"\"", csvOf("say \"hi\"")); + } + + @Test + public void testCsvTwoByteChar() throws Exception { + assertEquals("café", csvOf("café")); + } + + @Test + public void testCsvThreeByteChar() throws Exception { + assertEquals("中文", csvOf("中文")); + } + + @Test + public void testCsvEmoji() throws Exception { + assertEquals("💪", csvOf("💪")); + } + + @Test + public void testCsvMixedEmojiString() throws Exception { + String input = "No more 💪🏻🦋"; + assertEquals(input, csvOf(input)); + } + + @Test + public void testCsvMixedMultibyteAndAscii() throws Exception { + String input = "Hello 中文 café 😀!"; + assertEquals(input, csvOf(input)); + } + + @Test + public void testCsvEmojiWithComma() throws Exception { + assertEquals("\"💪,💪\"", csvOf("💪,💪")); + } + + @Test + public void testCsvEmojiWithNewline() throws Exception { + assertEquals("\"💪\n💪\"", csvOf("💪\n💪")); + } + + @Test + public void testCsvEmojiWithQuote() throws Exception { + assertEquals("\"💪\"\"💪\"", csvOf("💪\"💪")); + } + + @Test + public void testJsonAsciiPlain() throws Exception { + assertEquals("\"hello\"", jsonOf("hello")); + } + + @Test + public void testJsonAsciiEmpty() throws Exception { + assertEquals("\"\"", jsonOf("")); + } + + @Test + public void testJsonAsciiEscapeSpecialChars() throws Exception { + assertEquals("\"line1\\nline2\"", jsonOf("line1\nline2")); + } + + @Test + public void testJsonMixedMultibyteAndAscii() throws Exception { + String input = "Hello 中文 café 😀!"; + assertEquals("\"" + input + "\"", jsonOf(input)); + } + + @Test + public void testJsonTwoByteChar() throws Exception { + assertEquals("\"café\"", jsonOf("café")); + } + + @Test + public void testJsonThreeByteChar() throws Exception { + assertEquals("\"中文\"", jsonOf("中文")); + } + + @Test + public void testJsonEmoji() throws Exception { + assertEquals("\"💪\"", jsonOf("💪")); + } + + @Test + public void testJsonMixedEmojiString() throws Exception { + String input = "No more 💪🏻🦋"; + assertEquals("\"" + input + "\"", jsonOf(input)); + } +} From 2b3ba8b2b0df27df05cad51271b3f85aed94e151 Mon Sep 17 00:00:00 2001 From: ongdisheng Date: Tue, 3 Mar 2026 15:23:28 +0000 Subject: [PATCH 2/2] expand PrintToolsTest coverage --- .../nontagged/printers/PrintToolsTest.java | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/asterixdb/asterix-om/src/test/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintToolsTest.java b/asterixdb/asterix-om/src/test/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintToolsTest.java index 957c4149c88..44497844db2 100644 --- a/asterixdb/asterix-om/src/test/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintToolsTest.java +++ b/asterixdb/asterix-om/src/test/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintToolsTest.java @@ -79,6 +79,11 @@ public void testCsvTwoByteChar() throws Exception { assertEquals("café", csvOf("café")); } + @Test + public void testCsvTwoByteCharWithComma() throws Exception { + assertEquals("\"café,café\"", csvOf("café,café")); + } + @Test public void testCsvThreeByteChar() throws Exception { assertEquals("中文", csvOf("中文")); @@ -127,14 +132,18 @@ public void testJsonAsciiEmpty() throws Exception { } @Test - public void testJsonAsciiEscapeSpecialChars() throws Exception { + public void testJsonAsciiEscapeNewline() throws Exception { assertEquals("\"line1\\nline2\"", jsonOf("line1\nline2")); } @Test - public void testJsonMixedMultibyteAndAscii() throws Exception { - String input = "Hello 中文 café 😀!"; - assertEquals("\"" + input + "\"", jsonOf(input)); + public void testJsonAsciiEscapeTab() throws Exception { + assertEquals("\"col1\\tcol2\"", jsonOf("col1\tcol2")); + } + + @Test + public void testJsonAsciiEscapeBackslash() throws Exception { + assertEquals("\"a\\\\b\"", jsonOf("a\\b")); } @Test @@ -152,6 +161,12 @@ public void testJsonEmoji() throws Exception { assertEquals("\"💪\"", jsonOf("💪")); } + @Test + public void testJsonMixedMultibyteAndAscii() throws Exception { + String input = "Hello 中文 café 😀!"; + assertEquals("\"" + input + "\"", jsonOf(input)); + } + @Test public void testJsonMixedEmojiString() throws Exception { String input = "No more 💪🏻🦋";