diff --git a/src/main/java/com/treasure_data/td_import/reader/CSVRecordReader.java b/src/main/java/com/treasure_data/td_import/reader/CSVRecordReader.java index 89704ed3..405ddd4b 100644 --- a/src/main/java/com/treasure_data/td_import/reader/CSVRecordReader.java +++ b/src/main/java/com/treasure_data/td_import/reader/CSVRecordReader.java @@ -197,14 +197,30 @@ public boolean readColumns(final List columns) throws IOException { currentRow.append(line); // update untokenized CSV row line += NEWLINE; // add newline to simplify parsing } else if (c == quoteChar) { + /* + * Removes following buggy behavior. TODO: delete this code and comment. + * + * - If the source CSV stream includes [\"] sequence in quoted column, it fails to detect end of column. + * ex. "abc\","def" should be parsed as [abc\], [def] but it was [abc"def...] + * + * New code no longer behaves like this. + * if (charIndex > 2 && line.charAt(charIndex - 2) == '\\' && line.charAt(charIndex - 1) == '\\') { + // **************************************************** + // * This means [\\"] -> [\\] and moves new column. * + // * It's the same behavior with 'else'. Unnecessary. * + // **************************************************** state = TokenizerState.NORMAL; quoteScopeStartingLine = -1; } else if (charIndex > 1 && line.charAt(charIndex - 1) == '\\') { + // ******************************************** + // * This means [\"] -> [\"] and keep parsing * + // ******************************************** currentColumn.append(c); //charIndex++; - } else if (line.charAt(charIndex + 1) == quoteChar) { + } else */ + if (line.charAt(charIndex + 1) == quoteChar) { /* * An escaped quote (""). Add a single quote, then * move the cursor so the next iteration of the loop diff --git a/src/test/java/com/treasure_data/td_import/reader/TestCSVFileReader.java b/src/test/java/com/treasure_data/td_import/reader/TestCSVFileReader.java index 4e454fcd..61a09312 100644 --- a/src/test/java/com/treasure_data/td_import/reader/TestCSVFileReader.java +++ b/src/test/java/com/treasure_data/td_import/reader/TestCSVFileReader.java @@ -241,6 +241,31 @@ public void assertContextEquals(TestCSVFileReader test) { } } + public static class Context07 implements Context { + public void createContext(TestCSVFileReader test) + throws Exception { + test.columnNames = new String[] { "time", "string1", "string2" }; + test.columnTypes = new ColumnType[] { ColumnType.LONG, ColumnType.STRING, ColumnType.STRING }; + } + + public String generateCSVText(TestCSVFileReader test) { + StringBuilder sbuf = new StringBuilder(); + sbuf.append(test.columnNames[0]).append(COMMA); + sbuf.append(test.columnNames[1]).append(COMMA); + sbuf.append(test.columnNames[2]).append(LF); + sbuf.append("\"12345\"").append(COMMA); + sbuf.append("\"str\"\"ing\\\\\"").append(COMMA); + sbuf.append("\"str\"\"ing\\\"").append(LF); + return sbuf.toString(); + } + + public void assertContextEquals(TestCSVFileReader test) { + assertArrayEquals(test.columnNames, test.reader.getColumnNames()); + assertArrayEquals(test.columnTypes, test.reader.getColumnTypes()); + assertTrue(test.reader.getSkipColumns().isEmpty()); + } + } + protected String fileName = "./file.csv"; protected int numLine; @@ -398,6 +423,17 @@ public void checkContextWhenReaderConfigurationWithTimeFormat() throws Exception checkContextWhenReaderConfiguration(context); } + @Test + public void checkContextWhenColumnEndsWithBackslash() throws Exception { + Context07 context = new Context07(); + + createPrepareConfiguration(); + createFileWriter(); + createFileReader(); + + checkContextWhenReaderConfiguration(context); + } + private void checkContextWhenReaderConfiguration(Context context) throws Exception { // create context context.createContext(this);