 import io.streamthoughts.kafka.connect.filepulse.data.Type;
 import io.streamthoughts.kafka.connect.filepulse.data.TypedField;
 import io.streamthoughts.kafka.connect.filepulse.data.TypedStruct;
+import io.streamthoughts.kafka.connect.filepulse.internal.StringUtils;
 import io.streamthoughts.kafka.connect.filepulse.reader.RecordsIterable;
 import org.apache.kafka.common.config.ConfigDef;
 import org.apache.kafka.common.config.ConfigException;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.StringJoiner;
+import java.util.function.Function;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
 
 import static io.streamthoughts.kafka.connect.filepulse.config.DelimitedRowFilterConfig.READER_AUTO_GENERATE_COLUMN_NAME_CONFIG;
 import static io.streamthoughts.kafka.connect.filepulse.config.DelimitedRowFilterConfig.READER_EXTRACT_COLUMN_NAME_CONFIG;
@@ -48,6 +56,10 @@ public class DelimitedRowFilter extends AbstractRecordFilter<DelimitedRowFilter>
 
     private StructSchema schema;
 
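+    // Schema fields indexed by column position, filled from the configured schema or inferred from the first record.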
+    private final Map<Integer, TypedField> columnsTypesByIndex = new HashMap<>();
+
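+    // Pre-compiled delimiter pattern; stays null when the delimiter can be split without a regex.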
+    private Pattern pattern = null;
+
     /**
      * {@inheritDoc}
      */
@@ -59,19 +71,25 @@ public void configure(final Map<String, ?> configs) {
         if (isMandatoryConfigsMissing()) {
             StringJoiner joiner = new StringJoiner(",", "[", "]");
             final String mandatory = joiner
-                .add(READER_AUTO_GENERATE_COLUMN_NAME_CONFIG)
-                .add(READER_EXTRACT_COLUMN_NAME_CONFIG)
-                .add(READER_FIELD_COLUMNS_CONFIG).toString();
+                    .add(READER_AUTO_GENERATE_COLUMN_NAME_CONFIG)
+                    .add(READER_EXTRACT_COLUMN_NAME_CONFIG)
+                    .add(READER_FIELD_COLUMNS_CONFIG).toString();
             throw new ConfigException("At least one of those parameters should be configured " + mandatory);
         }
 
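+        // Compile the delimiter as a regex only when it cannot use String#split's fast path (see StringUtils#isFastSplit).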
+        if (!StringUtils.isFastSplit(this.configs.delimiter())) pattern = Pattern.compile(this.configs.delimiter());
+
         this.schema = this.configs.schema();
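+        // Index the configured schema's fields by position so row values can be mapped without repeated lookups.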
+        if (schema != null) {
+            final List<TypedField> fields = schema.fields();
+            IntStream.range(0, fields.size()).forEach(i -> columnsTypesByIndex.put(i, fields.get(i)));
+        }
     }
 
     private boolean isMandatoryConfigsMissing() {
         return configs.schema() == null &&
-                configs.extractColumnName() == null &&
-                !configs.isAutoGenerateColumnNames();
+               configs.extractColumnName() == null &&
+               !configs.isAutoGenerateColumnNames();
     }
 
     /**
@@ -87,52 +105,70 @@ public ConfigDef configDef() {
      */
     @Override
     public RecordsIterable<TypedStruct> apply(final FilterContext context,
-                                              final TypedStruct record,
-                                              final boolean hasNext) throws FilterException {
+                                               final TypedStruct record,
+                                               final boolean hasNext) throws FilterException {
 
         final String source = record.first(DEFAULT_SOURCE_FIELD).getString();
 
-        String[] fieldValues = splitFields(source);
-        final StructSchema schema = getSchema(record, fieldValues.length);
-        final TypedStruct struct = buildStructForFields(fieldValues, schema);
+        String[] columnValues = splitColumnValues(source);
+        if (schema == null) {
+            inferSchemaFromRecord(record, columnValues.length);
+        }
+        final TypedStruct struct = buildStructForFields(columnValues);
         return RecordsIterable.of(struct);
     }
 
-    private StructSchema getSchema(final TypedStruct record, int n) {
-        if (schema != null) return schema;
-
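+    // Infers and caches the schema from the first record when none was configured explicitly.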
+    private void inferSchemaFromRecord(final TypedStruct record, int numColumns) {
         schema = Schema.struct();
+
         if (configs.extractColumnName() != null) {
             final String fieldName = configs.extractColumnName();
             String field = record.first(fieldName).getString();
             if (field == null) {
                 throw new FilterException(
-                    "Can't found field for name '" + fieldName + "' to determine columns names");
+                        "Cannot find field named '" + fieldName + "' to determine column names");
             }
-            final String[] columns = splitFields(field);
+            final List<String> columns = Arrays
+                    .stream(splitColumnValues(field))
+                    .map(String::trim)
+                    .collect(Collectors.toList());
+
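+            // Column names that occur more than once are declared as array fields so all their values can be kept.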
+            if (configs.isDuplicateColumnsAsArray()) {
+                columns.stream()
+                        .collect(Collectors.groupingBy(Function.identity(), Collectors.<String>counting()))
+                        .entrySet()
+                        .stream()
+                        .collect(Collectors.toMap(
+                                Map.Entry::getKey,
+                                e -> e.getValue() > 1 ? Schema.array(DEFAULT_COLUMN_TYPE) : DEFAULT_COLUMN_TYPE))
+                        .forEach(schema::field);
+            } else {
+                columns.forEach(columnName -> schema.field(columnName, DEFAULT_COLUMN_TYPE));
+            }
+            IntStream.range(0, columns.size()).forEach(i -> columnsTypesByIndex.put(i, schema.field(columns.get(i))));
+            return;
+        }
 
-            for (String column : columns) {
-                schema.field(column, DEFAULT_COLUMN_TYPE);
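+        // Otherwise fall back to auto-generated positional column names (prefix + 1-based index).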
+        if (configs.isAutoGenerateColumnNames()) {
+            for (int i = 0; i < numColumns; i++) {
+                final String fieldName = AUTO_GENERATED_COLUMN_NAME_PREFIX + (i + 1);
+                schema.field(fieldName, DEFAULT_COLUMN_TYPE);
+                columnsTypesByIndex.put(i, schema.field(fieldName));
             }
-        } else if (configs.isAutoGenerateColumnNames()) {
-            for (int i = 0; i < n; i++) {
-                schema.field(AUTO_GENERATED_COLUMN_NAME_PREFIX + (i + 1), DEFAULT_COLUMN_TYPE);
-            }
-        } else {
-            throw new FilterException("Can't found valid configuration to determine schema for input value");
+            return;
         }
-        return schema;
+
+        throw new FilterException("No valid configuration found to determine the schema for the input value");
     }
 
-    private String[] splitFields(final String value) {
-        return value.split(configs.delimiter());
+    private String[] splitColumnValues(final String value) {
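+        // Use the pre-compiled pattern when the delimiter required one; otherwise String#split's fast path applies.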
+        return pattern != null ? pattern.split(value) : value.split(configs.delimiter());
     }
 
-    private TypedStruct buildStructForFields(final String[] fieldValues, final StructSchema schema) {
-        List<TypedField> fields = schema.fields();
-        if (fieldValues.length > fields.size()) {
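+    // Maps each column value onto its typed field, accumulating values for columns declared as arrays.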
+    private TypedStruct buildStructForFields(final String[] fieldValues) {
+        if (fieldValues.length > columnsTypesByIndex.size()) {
             throw new FilterException(
-                "Error while reading delimited input row. Too large number of fields (" + fieldValues.length + ")");
+                    "Error while reading delimited input row. Too many fields (" + fieldValues.length + ")");
         }
 
         TypedStruct struct = TypedStruct.create();
@@ -141,9 +177,16 @@ private TypedStruct buildStructForFields(final String[] fieldValues, final Struc
             if (configs.isTrimColumn()) {
                 fieldValue = fieldValue.trim();
             }
-            TypedField field = fields.get(i);
+            TypedField field = columnsTypesByIndex.get(i);
             final Type type = field.type();
-            struct = struct.put(field.name(), type, type.convert(fieldValue));
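+            // Array fields come from duplicated column names: append to the list instead of overwriting the value.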
+            if (type == Type.ARRAY) {
+                if (!struct.exists(field.name())) {
+                    struct.put(field.name(), new ArrayList<>());
+                }
+                struct.getArray(field.name()).add(fieldValue); // the raw string is appended; no per-element type conversion is applied
+            } else {
+                struct = struct.put(field.name(), type, type.convert(fieldValue));
+            }
         }
         return struct;
     }