From 387c9f614f8348e66005cba4bcfdb2cc42b54e62 Mon Sep 17 00:00:00 2001 From: Hiroshi Nakamura Date: Thu, 19 Mar 2015 22:49:37 +0900 Subject: [PATCH] fix: broken CSV cell by backslash escaped quoteChar So far CSVRecordReader detects '\' before quoteChar (usually ") and do unexpected string truncation. - [\"] -> keep parsing the rest This commit removes this weird behavior and no longer does this string truncation. - [\"] -> add \ and " then moves to next cell This commit also removes special '\' handling for '\\"'. This conditional branch is unnecessary because it does the same thing with 'else' branch. --- .../td_import/reader/CSVRecordReader.java | 18 +++++++++- .../td_import/reader/TestCSVFileReader.java | 36 +++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/treasure_data/td_import/reader/CSVRecordReader.java b/src/main/java/com/treasure_data/td_import/reader/CSVRecordReader.java index 89704ed3..405ddd4b 100644 --- a/src/main/java/com/treasure_data/td_import/reader/CSVRecordReader.java +++ b/src/main/java/com/treasure_data/td_import/reader/CSVRecordReader.java @@ -197,14 +197,30 @@ public boolean readColumns(final List columns) throws IOException { currentRow.append(line); // update untokenized CSV row line += NEWLINE; // add newline to simplify parsing } else if (c == quoteChar) { + /* + * Removes following buggy behavior. TODO: delete this code and comment. + * + * - If the source CSV stream includes [\"] sequence in quoted column, it fails to detect end of column. + * ex. "abc\","def" should be parsed as [abc\], [def] but it was [abc"def...] + * + * New code no longer behaves like this. + * if (charIndex > 2 && line.charAt(charIndex - 2) == '\\' && line.charAt(charIndex - 1) == '\\') { + // **************************************************** + // * This means [\\"] -> [\\] and moves new column. * + // * It's the same behavior with 'else'. Unnecessary. * + // **************************************************** state = TokenizerState.NORMAL; quoteScopeStartingLine = -1; } else if (charIndex > 1 && line.charAt(charIndex - 1) == '\\') { + // ******************************************** + // * This means [\"] -> [\"] and keep parsing * + // ******************************************** currentColumn.append(c); //charIndex++; - } else if (line.charAt(charIndex + 1) == quoteChar) { + } else */ + if (line.charAt(charIndex + 1) == quoteChar) { /* * An escaped quote (""). Add a single quote, then * move the cursor so the next iteration of the loop diff --git a/src/test/java/com/treasure_data/td_import/reader/TestCSVFileReader.java b/src/test/java/com/treasure_data/td_import/reader/TestCSVFileReader.java index 4e454fcd..61a09312 100644 --- a/src/test/java/com/treasure_data/td_import/reader/TestCSVFileReader.java +++ b/src/test/java/com/treasure_data/td_import/reader/TestCSVFileReader.java @@ -241,6 +241,31 @@ public void assertContextEquals(TestCSVFileReader test) { } } + public static class Context07 implements Context { + public void createContext(TestCSVFileReader test) + throws Exception { + test.columnNames = new String[] { "time", "string1", "string2" }; + test.columnTypes = new ColumnType[] { ColumnType.LONG, ColumnType.STRING, ColumnType.STRING }; + } + + public String generateCSVText(TestCSVFileReader test) { + StringBuilder sbuf = new StringBuilder(); + sbuf.append(test.columnNames[0]).append(COMMA); + sbuf.append(test.columnNames[1]).append(COMMA); + sbuf.append(test.columnNames[2]).append(LF); + sbuf.append("\"12345\"").append(COMMA); + sbuf.append("\"str\"\"ing\\\\\"").append(COMMA); + sbuf.append("\"str\"\"ing\\\"").append(LF); + return sbuf.toString(); + } + + public void assertContextEquals(TestCSVFileReader test) { + assertArrayEquals(test.columnNames, test.reader.getColumnNames()); + assertArrayEquals(test.columnTypes, test.reader.getColumnTypes()); + assertTrue(test.reader.getSkipColumns().isEmpty()); + } + } + protected String fileName = "./file.csv"; protected int numLine; @@ -398,6 +423,17 @@ public void checkContextWhenReaderConfigurationWithTimeFormat() throws Exception checkContextWhenReaderConfiguration(context); } + @Test + public void checkContextWhenColumnEndsWithBackslash() throws Exception { + Context07 context = new Context07(); + + createPrepareConfiguration(); + createFileWriter(); + createFileReader(); + + checkContextWhenReaderConfiguration(context); + } + private void checkContextWhenReaderConfiguration(Context context) throws Exception { // create context context.createContext(this);