@@ -274,31 +274,34 @@ void Reader::prefilterAndInitRowGroups()
274274 SchemaConverter schemer (file_metadata, options, &extended_sample_block);
275275 if (prewhere_info && !prewhere_info->remove_prewhere_column )
276276 schemer.external_columns .push_back (prewhere_info->prewhere_column_name );
277+ schemer.column_mapper = format_filter_info->column_mapper .get ();
277278 schemer.prepareForReading ();
278279 primitive_columns = std::move (schemer.primitive_columns );
279280 total_primitive_columns_in_file = schemer.primitive_column_idx ;
280281 output_columns = std::move (schemer.output_columns );
281282
282283 // / Precalculate some column index mappings.
283284
284- sample_block_to_output_columns_idx.resize (extended_sample_block.columns (), UINT64_MAX );
285+ sample_block_to_output_columns_idx.resize (extended_sample_block.columns ());
285286 for (size_t i = 0 ; i < output_columns.size (); ++i)
286287 {
287288 const auto & idx = output_columns[i].idx_in_output_block ;
288289 if (idx.has_value ())
289290 {
290- chassert (sample_block_to_output_columns_idx.at (*idx) == UINT64_MAX );
291+ chassert (! sample_block_to_output_columns_idx.at (*idx). has_value () );
291292 sample_block_to_output_columns_idx.at (*idx) = i;
292293 }
293294 }
294- chassert (std::all_of (sample_block_to_output_columns_idx.begin (), sample_block_to_output_columns_idx.end (), [](size_t x) { return x != UINT64_MAX; }));
295295
296296 if (format_filter_info->key_condition )
297297 {
298298 for (size_t idx_in_output_block : format_filter_info->key_condition ->getUsedColumns ())
299299 {
300- size_t output_idx = sample_block_to_output_columns_idx.at (idx_in_output_block);
301- const OutputColumnInfo & output_info = output_columns[output_idx];
300+ const auto & output_idx = sample_block_to_output_columns_idx.at (idx_in_output_block);
301+ if (!output_idx.has_value ())
302+ throw Exception (ErrorCodes::LOGICAL_ERROR, " KeyCondition uses PREWHERE output" );
303+ const OutputColumnInfo & output_info = output_columns[output_idx.value ()];
304+
302305 if (output_info.is_primitive )
303306 primitive_columns[output_info.primitive_start ].used_by_key_condition = idx_in_output_block;
304307 }
@@ -363,7 +366,11 @@ void Reader::prefilterAndInitRowGroups()
363366 const auto & column_conditions = static_cast <FilterInfoExt *>(format_filter_info->opaque .get ())->column_conditions ;
364367 for (const auto & [idx_in_output_block, key_condition] : column_conditions)
365368 {
366- const OutputColumnInfo & output_info = output_columns[sample_block_to_output_columns_idx.at (idx_in_output_block)];
369+ const auto & output_idx = sample_block_to_output_columns_idx.at (idx_in_output_block);
370+ if (!output_idx.has_value ())
371+ throw Exception (ErrorCodes::LOGICAL_ERROR, " Column condition uses PREWHERE output" );
372+ const OutputColumnInfo & output_info = output_columns[output_idx.value ()];
373+
367374 if (!output_info.is_primitive )
368375 continue ;
369376 primitive_columns[output_info.primitive_start ].column_index_condition = key_condition.get ();
@@ -602,44 +609,47 @@ void Reader::initializePrefetches()
602609void Reader::preparePrewhere ()
603610{
604611 PrewhereInfoPtr prewhere_info = format_filter_info->prewhere_info ;
605- if (!prewhere_info)
606- return ;
612+ if (prewhere_info)
613+ {
614+ // / TODO [parquet]: We currently run prewhere after reading all prewhere columns of the row
615+ // / subgroup, in one thread per row group. Instead, we could extract single-column conditions
616+ // / and run them after decoding the corresponding columns, in parallel.
617+ // / (Still run multi-column conditions, like `col1 = 42 or col2 = 'yes'`, after reading all columns.)
618+ // / Probably reuse tryBuildPrewhereSteps from MergeTree for splitting the expression.
607619
608- // / TODO [parquet]: We currently run prewhere after reading all prewhere columns of the row
609- // / subgroup, in one thread per row group. Instead, we could extract single-column conditions
610- // / and run them after decoding the corresponding columns, in parallel.
611- // / (Still run multi-column conditions, like `col1 = 42 or col2 = 'yes'`, after reading all columns.)
612- // / Probably reuse tryBuildPrewhereSteps from MergeTree for splitting the expression.
613620
614- // / Convert ActionsDAG to ExpressionActions.
615- ExpressionActionsSettings actions_settings;
616- if (prewhere_info->row_level_filter .has_value ())
617- {
618- ExpressionActions actions (prewhere_info->row_level_filter ->clone (), actions_settings);
621+ // / Convert ActionsDAG to ExpressionActions.
622+ ExpressionActionsSettings actions_settings;
623+ if (prewhere_info->row_level_filter .has_value ())
624+ {
625+ ExpressionActions actions (prewhere_info->row_level_filter ->clone (), actions_settings);
626+ prewhere_steps.push_back (PrewhereStep
627+ {
628+ .actions = std::move (actions),
629+ .result_column_name = prewhere_info->row_level_column_name ,
630+ });
631+ }
632+ ExpressionActions actions (prewhere_info->prewhere_actions .clone (), actions_settings);
619633 prewhere_steps.push_back (PrewhereStep
620634 {
621635 .actions = std::move (actions),
622- .result_column_name = prewhere_info->row_level_column_name
636+ .result_column_name = prewhere_info->prewhere_column_name ,
637+ .need_filter = prewhere_info->need_filter ,
623638 });
639+ if (!prewhere_info->remove_prewhere_column )
640+ prewhere_steps.back ().idx_in_output_block = sample_block->getPositionByName (prewhere_info->prewhere_column_name );
624641 }
625- ExpressionActions actions (prewhere_info->prewhere_actions .clone (), actions_settings);
626- prewhere_steps.push_back (PrewhereStep
627- {
628- .actions = std::move (actions),
629- .result_column_name = prewhere_info->prewhere_column_name ,
630- .need_filter = prewhere_info->need_filter ,
631- });
632- if (!prewhere_info->remove_prewhere_column )
633- prewhere_steps.back ().idx_in_output_block = sample_block->getPositionByName (prewhere_info->prewhere_column_name );
634-
635642 // / Look up expression inputs in extended_sample_block.
636643 for (PrewhereStep & step : prewhere_steps)
637644 {
638645 for (const auto & col : step.actions .getRequiredColumnsWithTypes ())
639646 {
640647 size_t idx_in_output_block = extended_sample_block.getPositionByName (col.name , /* case_insensitive= */ false );
641- size_t output_idx = sample_block_to_output_columns_idx.at (idx_in_output_block);
642- OutputColumnInfo & output_info = output_columns[output_idx];
648+ const auto & output_idx = sample_block_to_output_columns_idx.at (idx_in_output_block);
649+ if (!output_idx.has_value ())
650+ throw Exception (ErrorCodes::LOGICAL_ERROR, " PREWHERE appears to use its own output as input" );
651+ OutputColumnInfo & output_info = output_columns[output_idx.value ()];
652+
643653 output_info.use_prewhere = true ;
644654 bool only_for_prewhere = idx_in_output_block >= sample_block->columns ();
645655
@@ -649,7 +659,21 @@ void Reader::preparePrewhere()
649659 primitive_columns[primitive_idx].only_for_prewhere = only_for_prewhere;
650660 }
651661
652- step.input_column_idxs .push_back (output_idx);
662+ step.input_column_idxs .push_back (output_idx.value ());
663+ }
664+ }
665+
666+ // / Assert that sample_block_to_output_columns_idx is valid.
667+ for (size_t i = 0 ; i < sample_block_to_output_columns_idx.size (); ++i)
668+ {
669+ // / (`prewhere_steps` has at most two elements)
670+ size_t is_prewhere_output = std::count_if (prewhere_steps.begin (), prewhere_steps.end (),
671+ [&](const PrewhereStep & step) { return step.idx_in_output_block == i; });
672+ if (is_prewhere_output > 1 ||
673+ // / Column must appear in exactly one of {output_columns, prewhere output}.
674+ sample_block_to_output_columns_idx[i].has_value () != !is_prewhere_output)
675+ {
676+ throw Exception (ErrorCodes::LOGICAL_ERROR, " Unexpected column in sample block: {}" , extended_sample_block.getByPosition (i).name );
653677 }
654678 }
655679}
@@ -974,7 +998,8 @@ void Reader::intersectColumnIndexResultsAndInitSubgroups(RowGroup & row_group)
974998 bytes_per_row += estimateColumnMemoryBytesPerRow (row_group.columns .at (i), row_group, primitive_columns.at (i));
975999
9761000 size_t n = size_t (options.format .parquet .prefer_block_bytes / std::max (bytes_per_row, 1 .));
977- rows_per_subgroup = std::min (rows_per_subgroup, std::max (n, 1ul ));
1001+ n = std::max (n, size_t (128 )); // avoid super tiny blocks if something is wrong with stats
1002+ rows_per_subgroup = std::min (rows_per_subgroup, n);
9781003 }
9791004 chassert (rows_per_subgroup > 0 );
9801005
0 commit comments