Commit 5a65c3b
feat(filters): add support for repeated columns to DelimitedRowFilter (#106)
This commit adds a new config property: duplicateColumnsAsArray

Resolves: GH-106

Parent: 61a890a
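For a sense of how the new property is wired up, here is a minimal, hypothetical sketch (not part of the commit) that enables it through the filter's public configure(Map) method shown in the diffs below. The keys "duplicateColumnsAsArray" and "extractColumnName" come from DelimitedRowFilterConfig; the "headers" field name and the omission of any other required properties are assumptions.

import io.streamthoughts.kafka.connect.filepulse.filter.DelimitedRowFilter;

import java.util.HashMap;
import java.util.Map;

public class DuplicateColumnsConfigSketch {

    public static void main(String[] args) {
        final Map<String, Object> props = new HashMap<>();
        // Hypothetical: infer column names from a record field named "headers".
        props.put("extractColumnName", "headers");
        // New in this commit; defaults to "false".
        props.put("duplicateColumnsAsArray", "true");

        final DelimitedRowFilter filter = new DelimitedRowFilter();
        filter.configure(props);
    }
}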

8 files changed: +180, -53 lines
checkstyle/checkstyle.xml
Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@
 
     <module name="Header">
         <property name="headerFile" value="${basedir}/checkstyle/apache-header" />
+        <property name="ignoreLines" value="1, 2"/>
     </module>
 
     <module name="TreeWalker">

checkstyle/suppressions.xml
Lines changed: 1 addition & 1 deletion

@@ -67,6 +67,6 @@
     <suppress checks="Header" files="log4j.properties"/>
 
     <suppress checks="[a-zA-Z0-9]*" files="src/main/java/io/streamthoughts/kafka/connect/filepulse/expression/parser/antlr4/*"/>
-    <suppress checks="BooleanExpressionComplexity" files="Split.java"/>
+    <suppress checks="BooleanExpressionComplexity" files="io.streamthoughts.kafka.connect.filepulse.internal.StringUtils"/>
 
</suppressions>

connect-file-pulse-api/src/main/java/io/streamthoughts/kafka/connect/filepulse/data/Schema.java
Lines changed: 11 additions & 0 deletions

@@ -135,6 +135,17 @@ static MapSchema map(final Map<String ,?> value, final Schema valueSchema) {
         return valueSchema == null ? new LazyMapSchema(value) : new MapSchema(valueSchema);
     }
 
+    /**
+     * Gets the schema for type ARRAY.
+     *
+     * @param valueSchema the {@link Schema} instance.
+     *
+     * @return the {@link Schema} instance.
+     */
+    static ArraySchema array(final Schema valueSchema) {
+        return new ArraySchema(valueSchema);
+    }
+
     /**
      * Gets the schema for type ARRAY.
      *
…/src/main/java/io/streamthoughts/kafka/connect/filepulse/internal/StringUtils.java (new file)
Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
+/*
+ * Copyright 2019-2021 StreamThoughts.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.streamthoughts.kafka.connect.filepulse.internal;
+
+public class StringUtils {
+
+    /**
+     * @see String#split(String).
+     */
+    public static boolean isFastSplit(final String regex) {
+        char ch = 0;
+        return
+            ((regex.length() == 1 && ".$|()[{^?*+\\".indexOf(ch = regex.charAt(0)) == -1) ||
+            (regex.length() == 2 &&
+                regex.charAt(0) == '\\' &&
+                (((ch = regex.charAt(1))-'0')|('9'-ch)) < 0 &&
+                ((ch-'a')|('z'-ch)) < 0 &&
+                ((ch-'A')|('Z'-ch)) < 0)) &&
+            (ch < Character.MIN_HIGH_SURROGATE ||
+                ch > Character.MAX_LOW_SURROGATE);
+    }
+}
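A quick sketch (not part of the commit) of what isFastSplit reports, assuming the new class above is on the classpath. It mirrors the fast path of String#split: a single non-metacharacter, or a backslash escape of a non-alphanumeric ASCII character, can be split without compiling a java.util.regex.Pattern.

import io.streamthoughts.kafka.connect.filepulse.internal.StringUtils;

public class FastSplitSketch {

    public static void main(String[] args) {
        // A single literal character that is not a regex metacharacter: fast path applies.
        System.out.println(StringUtils.isFastSplit(";"));   // true
        // '|' is a regex metacharacter, so a compiled Pattern is required.
        System.out.println(StringUtils.isFastSplit("|"));   // false
        // An escaped metacharacter such as "\\." is still a one-character literal.
        System.out.println(StringUtils.isFastSplit("\\.")); // true
        // "\\d" is a character class and needs real regex machinery.
        System.out.println(StringUtils.isFastSplit("\\d")); // false
    }
}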

connect-file-pulse-expression/src/main/java/io/streamthoughts/kafka/connect/filepulse/expression/function/impl/Split.java
Lines changed: 2 additions & 17 deletions

@@ -28,6 +28,7 @@
 import io.streamthoughts.kafka.connect.filepulse.expression.function.ExpressionFunction;
 import io.streamthoughts.kafka.connect.filepulse.expression.function.GenericArgument;
 import io.streamthoughts.kafka.connect.filepulse.expression.function.MissingArgumentValue;
+import io.streamthoughts.kafka.connect.filepulse.internal.StringUtils;
 
 import java.util.Arrays;
 import java.util.regex.Pattern;

@@ -57,7 +58,7 @@ public Arguments<?> prepare(final Expression[] args) {
 
         final String regex = ((ValueExpression) args[1]).value().getString();
         Object regexArgument;
-        if (isFastSplit(regex)) {
+        if (StringUtils.isFastSplit(regex)) {
             regexArgument = regex;
         } else {
             regexArgument = Pattern.compile(regex);

@@ -87,20 +88,4 @@ public TypedValue apply(Arguments<GenericArgument> args) {
         }
         return TypedValue.array(Arrays.asList(split), Type.STRING);
     }
-
-    /**
-     * @see String#split(String).
-     */
-    private static boolean isFastSplit(final String regex) {
-        char ch = 0;
-        return
-            ((regex.length() == 1 && ".$|()[{^?*+\\".indexOf(ch = regex.charAt(0)) == -1) ||
-            (regex.length() == 2 &&
-                regex.charAt(0) == '\\' &&
-                (((ch = regex.charAt(1))-'0')|('9'-ch)) < 0 &&
-                ((ch-'a')|('z'-ch)) < 0 &&
-                ((ch-'A')|('Z'-ch)) < 0)) &&
-            (ch < Character.MIN_HIGH_SURROGATE ||
-                ch > Character.MAX_LOW_SURROGATE);
-    }
 }
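To illustrate the refactoring above (a sketch, not part of the commit, assuming the new StringUtils is on the classpath): prepare() keeps the raw delimiter string whenever the fast path applies and precompiles the Pattern once otherwise, so apply() never recompiles the regex per record.

import io.streamthoughts.kafka.connect.filepulse.internal.StringUtils;

import java.util.regex.Pattern;

public class SplitArgumentSketch {

    public static void main(String[] args) {
        final String regex = "\\s+";
        // The same decision Split.prepare() makes: a raw String for the fast
        // path, a precompiled Pattern for everything else.
        final Object regexArgument = StringUtils.isFastSplit(regex)
                ? regex
                : Pattern.compile(regex);
        System.out.println(regexArgument.getClass().getSimpleName()); // Pattern
    }
}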

connect-file-pulse-filters/src/main/java/io/streamthoughts/kafka/connect/filepulse/config/DelimitedRowFilterConfig.java
Lines changed: 11 additions & 0 deletions

@@ -38,6 +38,10 @@ public class DelimitedRowFilterConfig extends CommonFilterConfig {
     public static final String READER_FIELD_TRIM_COLUMN_DOC = "Remove the leading and trailing whitespaces from all columns.";
     public static final boolean READER_FIELD_TRIM_COLUMN_DEFAULT = false;
 
+    public static final String READER_FIELD_DUPLICATE_COLUMNS_AS_ARRAY_CONFIG = "duplicateColumnsAsArray";
+    public static final String READER_FIELD_DUPLICATE_COLUMNS_AS_ARRAY_DOC = "Treat duplicate columns as an array. If false and a record contains duplicate columns an exception will be thrown.";
+    public static final String READER_FIELD_DUPLICATE_COLUMNS_AS_ARRAY_DEFAULT = "false";
+
     public static final String READER_EXTRACT_COLUMN_NAME_CONFIG = "extractColumnName";
     public static final String READER_EXTRACT_COLUMN_NAME_DOC = "Define the field from which the schema should be detected (all columns will be of type 'withMessage')";
 

@@ -72,6 +76,10 @@ public boolean isAutoGenerateColumnNames() {
         return getBoolean(READER_AUTO_GENERATE_COLUMN_NAME_CONFIG);
     }
 
+    public boolean isDuplicateColumnsAsArray() {
+        return getBoolean(READER_FIELD_DUPLICATE_COLUMNS_AS_ARRAY_CONFIG);
+    }
+
     public StructSchema schema() {
         final String columns = getString(READER_FIELD_COLUMNS_CONFIG);
         if (columns == null) return null;

@@ -114,6 +122,9 @@ public static ConfigDef configDef() {
                 .define(READER_AUTO_GENERATE_COLUMN_NAME_CONFIG, ConfigDef.Type.BOOLEAN, READER_AUTO_GENERATE_COLUMN_NAME_DEFAULT,
                         ConfigDef.Importance.HIGH, READER_AUTO_GENERATE_COLUMN_NAME_DOC)
 
+                .define(READER_FIELD_DUPLICATE_COLUMNS_AS_ARRAY_CONFIG, ConfigDef.Type.BOOLEAN, READER_FIELD_DUPLICATE_COLUMNS_AS_ARRAY_DEFAULT,
+                        ConfigDef.Importance.LOW, READER_FIELD_DUPLICATE_COLUMNS_AS_ARRAY_DOC)
+
                 .define(READER_FIELD_TRIM_COLUMN_CONFIG, ConfigDef.Type.BOOLEAN, READER_FIELD_TRIM_COLUMN_DEFAULT,
                         ConfigDef.Importance.LOW, READER_FIELD_TRIM_COLUMN_DOC);
     }
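A side note on the define(...) call above, sketched outside the commit: ConfigDef.Type.BOOLEAN parses the String default "false" used by READER_FIELD_DUPLICATE_COLUMNS_AS_ARRAY_DEFAULT, so getBoolean(...) returns a Boolean even though the constant is declared as a String.

import org.apache.kafka.common.config.AbstractConfig;
import org.apache.kafka.common.config.ConfigDef;

import java.util.Collections;

public class BooleanDefaultSketch {

    public static void main(String[] args) {
        final ConfigDef def = new ConfigDef()
                .define("duplicateColumnsAsArray", ConfigDef.Type.BOOLEAN, "false",
                        ConfigDef.Importance.LOW, "Treat duplicate columns as an array.");

        // No override supplied, so the parsed String default comes back as a Boolean.
        final AbstractConfig config = new AbstractConfig(def, Collections.emptyMap());
        System.out.println(config.getBoolean("duplicateColumnsAsArray")); // false
    }
}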

connect-file-pulse-filters/src/main/java/io/streamthoughts/kafka/connect/filepulse/filter/DelimitedRowFilter.java
Lines changed: 75 additions & 32 deletions

@@ -24,13 +24,21 @@
 import io.streamthoughts.kafka.connect.filepulse.data.Type;
 import io.streamthoughts.kafka.connect.filepulse.data.TypedField;
 import io.streamthoughts.kafka.connect.filepulse.data.TypedStruct;
+import io.streamthoughts.kafka.connect.filepulse.internal.StringUtils;
 import io.streamthoughts.kafka.connect.filepulse.reader.RecordsIterable;
 import org.apache.kafka.common.config.ConfigDef;
 import org.apache.kafka.common.config.ConfigException;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.StringJoiner;
+import java.util.function.Function;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
 
 import static io.streamthoughts.kafka.connect.filepulse.config.DelimitedRowFilterConfig.READER_AUTO_GENERATE_COLUMN_NAME_CONFIG;
 import static io.streamthoughts.kafka.connect.filepulse.config.DelimitedRowFilterConfig.READER_EXTRACT_COLUMN_NAME_CONFIG;

@@ -48,6 +56,10 @@ public class DelimitedRowFilter extends AbstractRecordFilter<DelimitedRowFilter>
 
     private StructSchema schema;
 
+    private final Map<Integer, TypedField> columnsTypesByIndex = new HashMap<>();
+
+    private Pattern pattern = null;
+
     /**
      * {@inheritDoc}
      */

@@ -59,19 +71,25 @@ public void configure(final Map<String, ?> configs) {
         if (isMandatoryConfigsMissing()) {
             StringJoiner joiner = new StringJoiner(",", "[", "]");
             final String mandatory = joiner
-                .add(READER_AUTO_GENERATE_COLUMN_NAME_CONFIG)
-                .add(READER_EXTRACT_COLUMN_NAME_CONFIG)
-                .add(READER_FIELD_COLUMNS_CONFIG).toString();
+                    .add(READER_AUTO_GENERATE_COLUMN_NAME_CONFIG)
+                    .add(READER_EXTRACT_COLUMN_NAME_CONFIG)
+                    .add(READER_FIELD_COLUMNS_CONFIG).toString();
             throw new ConfigException("At least one of those parameters should be configured " + mandatory);
         }
 
+        if (!StringUtils.isFastSplit(this.configs.delimiter())) pattern = Pattern.compile(this.configs.delimiter());
+
         this.schema = this.configs.schema();
+        if (schema != null) {
+            final List<TypedField> fields = schema.fields();
+            IntStream.range(0, fields.size()).forEach(i -> columnsTypesByIndex.put(i, fields.get(i)));
+        }
     }
 
     private boolean isMandatoryConfigsMissing() {
         return configs.schema() == null &&
-            configs.extractColumnName() == null &&
-            !configs.isAutoGenerateColumnNames();
+                configs.extractColumnName() == null &&
+                !configs.isAutoGenerateColumnNames();
     }
 
     /**

@@ -87,52 +105,70 @@ public ConfigDef configDef() {
      */
     @Override
     public RecordsIterable<TypedStruct> apply(final FilterContext context,
-                                            final TypedStruct record,
-                                            final boolean hasNext) throws FilterException {
+                                              final TypedStruct record,
+                                              final boolean hasNext) throws FilterException {
 
         final String source = record.first(DEFAULT_SOURCE_FIELD).getString();
 
-        String[] fieldValues = splitFields(source);
-        final StructSchema schema = getSchema(record, fieldValues.length);
-        final TypedStruct struct = buildStructForFields(fieldValues, schema);
+        String[] columnValues = splitColumnValues(source);
+        if (schema == null) {
+            inferSchemaFromRecord(record, columnValues.length);
+        }
+        final TypedStruct struct = buildStructForFields(columnValues);
         return RecordsIterable.of(struct);
     }
 
-    private StructSchema getSchema(final TypedStruct record, int n) {
-        if (schema != null) return schema;
-
+    private void inferSchemaFromRecord(final TypedStruct record, int numColumns) {
         schema = Schema.struct();
+
         if (configs.extractColumnName() != null) {
             final String fieldName = configs.extractColumnName();
             String field = record.first(fieldName).getString();
             if (field == null) {
                 throw new FilterException(
-                    "Can't found field for name '" + fieldName + "' to determine columns names");
+                        "Can't found field for name '" + fieldName + "' to determine columns names");
             }
-            final String[] columns = splitFields(field);
+            final List<String> columns = Arrays
+                    .stream(splitColumnValues(field))
+                    .map(String::trim)
+                    .collect(Collectors.toList());
+
+            if (configs.isDuplicateColumnsAsArray()) {
+                columns.stream()
+                    .collect(Collectors.groupingBy(Function.identity(), Collectors.<String>counting()))
+                    .entrySet()
+                    .stream()
+                    .collect(Collectors.toMap(Map.Entry::getKey, e -> {
+                        return e.getValue() > 1 ? Schema.array(DEFAULT_COLUMN_TYPE) : DEFAULT_COLUMN_TYPE;
+                    }))
+                    .forEach(schema::field);
+            } else {
+                columns.forEach(columnName -> schema.field(columnName, DEFAULT_COLUMN_TYPE));
+            }
+            IntStream.range(0, columns.size()).forEach(i -> columnsTypesByIndex.put(i, schema.field(columns.get(i))));
+            return;
+        }
 
-            for (String column : columns) {
-                schema.field(column, DEFAULT_COLUMN_TYPE);
+        if (configs.isAutoGenerateColumnNames()) {
+            for (int i = 0; i < numColumns; i++) {
+                final String fieldName = AUTO_GENERATED_COLUMN_NAME_PREFIX + (i + 1);
+                schema.field(fieldName, DEFAULT_COLUMN_TYPE);
+                columnsTypesByIndex.put(i, schema.field(fieldName));
             }
-        } else if (configs.isAutoGenerateColumnNames()) {
-            for (int i = 0; i < n; i++) {
-                schema.field(AUTO_GENERATED_COLUMN_NAME_PREFIX + (i + 1), DEFAULT_COLUMN_TYPE);
-            }
-        } else {
-            throw new FilterException("Can't found valid configuration to determine schema for input value");
+            return;
         }
-        return schema;
+
+        throw new FilterException("Can't found valid configuration to determine schema for input value");
     }
 
-    private String[] splitFields(final String value) {
-        return value.split(configs.delimiter());
+    private String[] splitColumnValues(final String value) {
+        return pattern != null ? pattern.split(value) : value.split(configs.delimiter());
    }
 
-    private TypedStruct buildStructForFields(final String[] fieldValues, final StructSchema schema) {
-        List<TypedField> fields = schema.fields();
-        if (fieldValues.length > fields.size()) {
+    private TypedStruct buildStructForFields(final String[] fieldValues) {
+        if (fieldValues.length > columnsTypesByIndex.size()) {
             throw new FilterException(
-                "Error while reading delimited input row. Too large number of fields (" + fieldValues.length + ")");
+                    "Error while reading delimited input row. Too large number of fields (" + fieldValues.length + ")");
         }
 
         TypedStruct struct = TypedStruct.create();

@@ -141,9 +177,16 @@ private TypedStruct buildStructForFields(final String[] fieldValues, final Struc
             if (configs.isTrimColumn()) {
                 fieldValue = fieldValue.trim();
             }
-            TypedField field = fields.get(i);
+            TypedField field = columnsTypesByIndex.get(i);
             final Type type = field.type();
-            struct = struct.put(field.name(), type, type.convert(fieldValue));
+            if (type == Type.ARRAY) {
+                if (!struct.exists(field.name())) {
+                    struct.put(field.name(), new ArrayList<>());
+                }
+                struct.getArray(field.name()).add(fieldValue); // it seems to be OK to use type conversion here
+            } else {
+                struct = struct.put(field.name(), type, type.convert(fieldValue));
+            }
         }
         return struct;
     }
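To make the schema-inference branch concrete, here is a self-contained sketch (not part of the commit) of the same groupingBy/counting pipeline, with plain strings standing in for the Schema objects that inferSchemaFromRecord actually builds.

import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

public class DuplicateColumnGroupingSketch {

    public static void main(String[] args) {
        // Column names parsed from a header row such as "ts;level;message;message".
        final List<String> columns = List.of("ts", "level", "message", "message");

        // Count each name, then map names seen more than once to an array type.
        final Map<String, String> inferred = columns.stream()
                .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()))
                .entrySet()
                .stream()
                .collect(Collectors.toMap(Map.Entry::getKey,
                        e -> e.getValue() > 1 ? "ARRAY<STRING>" : "STRING"));

        System.out.println(inferred); // e.g. {ts=STRING, level=STRING, message=ARRAY<STRING>}
    }
}

In the filter itself, every index of a duplicated column maps to the same ARRAY field via columnsTypesByIndex, so buildStructForFields appends each duplicate value to a single list.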
