From 1e9fd2b277e9a8de77be36682d57be4df8718dbf Mon Sep 17 00:00:00 2001 From: Thomas Rebele Date: Wed, 28 Jan 2026 15:43:07 +0100 Subject: [PATCH 01/11] HIVE-29424: CBO plans should use histogram statistics for range predicates with a CAST --- .../stats/FilterSelectivityEstimator.java | 377 +++++++++++++---- .../stats/TestFilterSelectivityEstimator.java | 400 +++++++++++++++++- 2 files changed, 690 insertions(+), 87 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java index b18c525c8849..1d778b77d8fa 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java @@ -18,10 +18,13 @@ package org.apache.hadoop.hive.ql.optimizer.calcite.stats; import java.math.BigDecimal; +import java.math.RoundingMode; import java.util.ArrayList; import java.util.Collections; import java.util.GregorianCalendar; import java.util.List; +import java.util.Objects; +import java.util.Optional; import java.util.Set; import org.apache.calcite.plan.RelOptUtil; @@ -41,6 +44,7 @@ import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.type.SqlTypeName; import org.apache.calcite.util.ImmutableBitSet; +import org.apache.commons.lang3.mutable.MutableObject; import org.apache.datasketches.kll.KllFloatsSketch; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; @@ -59,6 +63,21 @@ public class FilterSelectivityEstimator extends RexVisitorImpl { protected static final Logger LOG = LoggerFactory.getLogger(FilterSelectivityEstimator.class); + private record FloatInterval(float lower, boolean lowerInclusive, float upper, boolean upperInclusive) { + public FloatInterval getRightHalfOpenInterval() { + if (lowerInclusive && 
!upperInclusive) { + return this; + } + float newLower = lowerInclusive ? lower : Math.nextUp(lower); + float newUpper = !upperInclusive ? upper : Math.nextUp(upper); + return new FloatInterval(newLower, true, newUpper, false); + } + + public FloatInterval withValues(float lower, float upper) { + return new FloatInterval(lower, lowerInclusive, upper, upperInclusive); + } + } + private final RelNode childRel; private final double childCardinality; private final RelMetadataQuery mq; @@ -184,91 +203,284 @@ public Double visitCall(RexCall call) { return selectivity; } + /** + * If the cast can be removed, just return its operand and adjust the boundaries if necessary. + * + *

+ * In Hive, if a value cannot be represented by the cast, the result of the cast is NULL, + * and therefore cannot fulfill the predicate. So the possible range of the values + * is limited by the range of possible values of the type. + *

+ * + *

+ * Special care is taken to support the cast to DECIMAL(precision, scale): + * The cast to DECIMAL rounds the value the same way as {@link RoundingMode#HALF_UP}. + * The boundaries are adjusted accordingly. + *

+ * + * @param cast a RexCall of type {@link SqlKind#CAST} + * @param tableScan the table that provides the statistics + * @param rangeBoundaries see {@link #adjustBoundariesForDecimal(RexCall, MutableObject, MutableObject)}; might get modified + * @param typeBoundaries see {@link #adjustBoundariesForDecimal(RexCall, MutableObject, MutableObject)}; might get modified + * @return the operand if the cast can be removed, otherwise the cast itself + */ + private RexNode removeCastIfPossible(RexCall cast, HiveTableScan tableScan, + MutableObject rangeBoundaries, MutableObject typeBoundaries) { + RexNode op0 = cast.getOperands().getFirst(); + if (!(op0 instanceof RexInputRef)) { + return cast; + } + int index = ((RexInputRef) op0).getIndex(); + final List colStats = tableScan.getColStat(Collections.singletonList(index)); + if (colStats.isEmpty()) { + return cast; + } + + // we need to check that the possible values of the input to the cast are all within the type range of the cast + // otherwise the CAST introduces some modulo-like behavior (*) + ColStatistics colStat = colStats.getFirst(); + ColStatistics.Range range = colStat.getRange(); + if (range == null || range.minValue == null || Double.isNaN( + range.minValue.doubleValue()) || range.maxValue == null || Double.isNaN(range.maxValue.doubleValue())) { + return cast; + } + + SqlTypeName type = cast.getType().getSqlTypeName(); + + double min; + double max; + switch (type) { + case TINYINT, SMALLINT, INTEGER, BIGINT: + min = ((Number) type.getLimit(false, SqlTypeName.Limit.OVERFLOW, false, -1, -1)).doubleValue(); + max = ((Number) type.getLimit(true, SqlTypeName.Limit.OVERFLOW, false, -1, -1)).doubleValue(); + break; + case TIMESTAMP, DATE: + min = Long.MIN_VALUE; + max = Long.MAX_VALUE; + break; + case FLOAT: + min = -Float.MAX_VALUE; + max = Float.MAX_VALUE; + break; + case DOUBLE, DECIMAL: + min = -Double.MAX_VALUE; + max = Double.MAX_VALUE; + break; + default: + // unknown type, do not remove the cast + return 
cast; + } + + // see (*) + if (range.minValue.doubleValue() < min || range.maxValue.doubleValue() > max) { + return cast; + } + + if (type == SqlTypeName.DECIMAL) { + adjustBoundariesForDecimal(cast, rangeBoundaries, typeBoundaries); + } + + return op0; + } + + /** + * Adjust the boundaries for a DECIMAL cast. + * + * @param rangeBoundaries boundaries of the range predicate + * @param typeBoundaries if not null, will be set to the boundaries of the type range + */ + private static void adjustBoundariesForDecimal(RexCall cast, MutableObject rangeBoundaries, + MutableObject typeBoundaries) { + // values outside the representable range are cast to NULL, so adapt the boundaries + int precision = cast.getType().getPrecision(); + int scale = cast.getType().getScale(); + int digits = precision - scale; + // the cast does some rounding, i.e., CAST(99.9499 AS DECIMAL(3,1)) = 99.9 + // but CAST(99.95 AS DECIMAL(3,1)) = NULL + float adjust = (float) (5 * Math.pow(10, -(scale + 1))); + // the range of values supported by the type is interval [-typeRangeExtent, typeRangeExtent] (both inclusive) + // e.g., the typeRangeExt is 99.94999 for DECIMAL(3,1) + float typeRangeExtent = Math.nextDown((float) (Math.pow(10, digits) - adjust)); + + FloatInterval range = rangeBoundaries.getValue(); + // the resulting value of +- adjust would be rounded up, so in some cases we need to use Math.nextDown + float adjusted1 = range.lowerInclusive ? range.lower - adjust : Math.nextDown(range.lower + adjust); + float adjusted2 = range.upperInclusive ? Math.nextDown(range.upper + adjust) : range.upper - adjust; + + float lowerUniverse = range.lowerInclusive ? -typeRangeExtent : Math.nextDown(-typeRangeExtent); + float upperUniverse = range.upperInclusive ? 
typeRangeExtent : Math.nextUp(typeRangeExtent); + float lower = Math.max(adjusted1, lowerUniverse); + float upper = Math.min(adjusted2, upperUniverse); + rangeBoundaries.setValue(range.withValues(lower, upper)); + if (typeBoundaries != null) { + typeBoundaries.setValue( + new FloatInterval(lowerUniverse, range.lowerInclusive, upperUniverse, range.upperInclusive)); + } + } + private double computeRangePredicateSelectivity(RexCall call, SqlKind op) { - final boolean isLiteralLeft = call.getOperands().get(0).getKind().equals(SqlKind.LITERAL); - final boolean isLiteralRight = call.getOperands().get(1).getKind().equals(SqlKind.LITERAL); - final boolean isInputRefLeft = call.getOperands().get(0).getKind().equals(SqlKind.INPUT_REF); - final boolean isInputRefRight = call.getOperands().get(1).getKind().equals(SqlKind.INPUT_REF); + double defaultSelectivity = ((double) 1 / (double) 3); + if (!(childRel instanceof HiveTableScan)) { + return defaultSelectivity; + } - if (childRel instanceof HiveTableScan && isLiteralLeft != isLiteralRight && isInputRefLeft != isInputRefRight) { - final HiveTableScan t = (HiveTableScan) childRel; - final int inputRefIndex = ((RexInputRef) call.getOperands().get(isInputRefLeft ? 0 : 1)).getIndex(); - final List colStats = t.getColStat(Collections.singletonList(inputRefIndex)); + // search for the literal + List operands = call.getOperands(); + final Optional leftLiteral = extractLiteral(operands.get(0)); + final Optional rightLiteral = extractLiteral(operands.get(1)); + if ((leftLiteral.isPresent()) == (rightLiteral.isPresent())) { + return defaultSelectivity; + } + int literalOpIdx = leftLiteral.isPresent() ? 
0 : 1; + + // analyze the predicate + float value = leftLiteral.orElseGet(rightLiteral::get); + int boundaryIdx; + boolean openBound = op == SqlKind.LESS_THAN || op == SqlKind.GREATER_THAN; + switch (op) { + case LESS_THAN, LESS_THAN_OR_EQUAL: + boundaryIdx = literalOpIdx; + break; + case GREATER_THAN, GREATER_THAN_OR_EQUAL: + boundaryIdx = 1 - literalOpIdx; + break; + default: + return defaultSelectivity; + } + float[] boundaryValues = new float[] { Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY }; + boolean[] inclusive = new boolean[] { true, true }; + boundaryValues[boundaryIdx] = value; + inclusive[boundaryIdx] = !openBound; + MutableObject boundaries = + new MutableObject<>(new FloatInterval(boundaryValues[0], inclusive[0], boundaryValues[1], inclusive[1])); + + // extract the column index from the other operator + final HiveTableScan scan = (HiveTableScan) childRel; + int inputRefOpIndex = 1 - literalOpIdx; + RexNode node = operands.get(inputRefOpIndex); + if (node.getKind().equals(SqlKind.CAST)) { + node = removeCastIfPossible((RexCall) node, scan, boundaries, null); + } - if (!colStats.isEmpty() && isHistogramAvailable(colStats.get(0))) { - final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram())); - final Object boundValueObject = ((RexLiteral) call.getOperands().get(isLiteralLeft ? 0 : 1)).getValue(); - final SqlTypeName typeName = call.getOperands().get(isInputRefLeft ? 0 : 1).getType().getSqlTypeName(); - float value = extractLiteral(typeName, boundValueObject); - boolean closedBound = op.equals(SqlKind.LESS_THAN_OR_EQUAL) || op.equals(SqlKind.GREATER_THAN_OR_EQUAL); - - double selectivity; - if (op.equals(SqlKind.LESS_THAN_OR_EQUAL) || op.equals(SqlKind.LESS_THAN)) { - selectivity = closedBound ? lessThanOrEqualSelectivity(kll, value) : lessThanSelectivity(kll, value); - } else { - selectivity = closedBound ? 
greaterThanOrEqualSelectivity(kll, value) : greaterThanSelectivity(kll, value); - } + int inputRefIndex = -1; + if (node.getKind().equals(SqlKind.INPUT_REF)) { + inputRefIndex = ((RexInputRef) node).getIndex(); + } - // selectivity does not account for null values, we multiply for the number of non-null values (getN) - // and we divide by the total (non-null + null values) to get the overall selectivity. - // - // Example: consider a filter "col < 3", and the following table rows: - // _____ - // | col | - // |_____| - // |1 | - // |null | - // |null | - // |3 | - // |4 | - // ------- - // kll.getN() would be 3, selectivity 1/3, t.getTable().getRowCount() 5 - // so the final result would be 3 * 1/3 / 5 = 1/5, as expected. - return kll.getN() * selectivity / t.getTable().getRowCount(); - } + if (inputRefIndex < 0) { + return defaultSelectivity; + } + + final List colStats = scan.getColStat(Collections.singletonList(inputRefIndex)); + if (colStats.isEmpty() || !isHistogramAvailable(colStats.get(0))) { + return defaultSelectivity; } - return ((double) 1 / (double) 3); + + final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram())); + // convert the condition to a range val1 <= x < val2 for rangedSelectivity(...) + double rawSelectivity = rangedSelectivity(kll, boundaries.getValue()); + return scaleSelectivityToNullableValues(kll, rawSelectivity, scan); + } + + /** + * Adjust the selectivity estimate to take NULL values into account. + *

+ * The rawSelectivity does not account for null values. We multiply with the number of non-null values (getN) + * and we divide by the total number (non-null + null values) to get the overall selectivity. + *

+ * Example: consider a filter "col < 3", and the following table rows: + *

+   *  _____
+   * | col |
+   * |_____|
+   * |1    |
+   * |null |
+   * |null |
+   * |3    |
+   * |4    |
+   * -------
+   * 
+ * kll.getN() would be 3, rawSelectivity 1/3, scan.getTable().getRowCount() 5 + * so the final result would be 3 * 1/3 / 5 = 1/5, as expected. + */ + private static double scaleSelectivityToNullableValues(KllFloatsSketch kll, double rawSelectivity, + HiveTableScan scan) { + if (scan.getTable() == null) { + return rawSelectivity; + } + return kll.getN() * rawSelectivity / scan.getTable().getRowCount(); } private Double computeBetweenPredicateSelectivity(RexCall call) { - final boolean hasLiteralBool = call.getOperands().get(0).getKind().equals(SqlKind.LITERAL); - final boolean hasInputRef = call.getOperands().get(1).getKind().equals(SqlKind.INPUT_REF); - final boolean hasLiteralLeft = call.getOperands().get(2).getKind().equals(SqlKind.LITERAL); - final boolean hasLiteralRight = call.getOperands().get(3).getKind().equals(SqlKind.LITERAL); + if (!(childRel instanceof HiveTableScan)) { + return computeFunctionSelectivity(call); + } + + List operands = call.getOperands(); + final boolean hasLiteralBool = operands.get(0).getKind().equals(SqlKind.LITERAL); + Optional leftLiteral = extractLiteral(operands.get(2)); + Optional rightLiteral = extractLiteral(operands.get(3)); + + if (hasLiteralBool && leftLiteral.isPresent() && rightLiteral.isPresent()) { + final HiveTableScan scan = (HiveTableScan) childRel; + float leftValue = leftLiteral.get(); + float rightValue = rightLiteral.get(); + + final Object inverseBoolValueObject = ((RexLiteral) operands.getFirst()).getValue(); + boolean inverseBool = Boolean.parseBoolean(inverseBoolValueObject.toString()); + // when they are equal it's an equality predicate, we cannot handle it as "BETWEEN" + if (Objects.equals(leftValue, rightValue)) { + return inverseBool ? computeNotEqualitySelectivity(call) : computeFunctionSelectivity(call); + } + + MutableObject rangeBoundaries = + new MutableObject<>(new FloatInterval(leftValue, true, rightValue, true)); + MutableObject typeBoundaries = inverseBool ? 
new MutableObject<>( + new FloatInterval(Float.NEGATIVE_INFINITY, true, Float.POSITIVE_INFINITY, true)) : null; + + RexNode expr = operands.get(1); // expr to be checked by the BETWEEN + if (expr.getKind().equals(SqlKind.CAST)) { + expr = removeCastIfPossible((RexCall) expr, scan, rangeBoundaries, typeBoundaries); + } - if (childRel instanceof HiveTableScan && hasLiteralBool && hasInputRef && hasLiteralLeft && hasLiteralRight) { - final HiveTableScan t = (HiveTableScan) childRel; - final int inputRefIndex = ((RexInputRef) call.getOperands().get(1)).getIndex(); - final List colStats = t.getColStat(Collections.singletonList(inputRefIndex)); + int inputRefIndex = -1; + if (expr.getKind().equals(SqlKind.INPUT_REF)) { + inputRefIndex = ((RexInputRef) expr).getIndex(); + } + if (inputRefIndex < 0) { + return computeFunctionSelectivity(call); + } + + final List colStats = scan.getColStat(Collections.singletonList(inputRefIndex)); if (!colStats.isEmpty() && isHistogramAvailable(colStats.get(0))) { + // convert the condition to a range val1 <= x < val2 for rangedSelectivity(...) 
final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram())); - final SqlTypeName typeName = call.getOperands().get(1).getType().getSqlTypeName(); - final Object inverseBoolValueObject = ((RexLiteral) call.getOperands().get(0)).getValue(); - boolean inverseBool = Boolean.parseBoolean(inverseBoolValueObject.toString()); - final Object leftBoundValueObject = ((RexLiteral) call.getOperands().get(2)).getValue(); - float leftValue = extractLiteral(typeName, leftBoundValueObject); - final Object rightBoundValueObject = ((RexLiteral) call.getOperands().get(3)).getValue(); - float rightValue = extractLiteral(typeName, rightBoundValueObject); - // when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted + double rawSelectivity = rangedSelectivity(kll, rangeBoundaries.getValue()); if (inverseBool) { - if (rightValue == leftValue) { - return computeNotEqualitySelectivity(call); - } else if (rightValue < leftValue) { - return 1.0; - } - return 1.0 - (kll.getN() * betweenSelectivity(kll, leftValue, rightValue) / t.getTable().getRowCount()); - } - // when they are equal it's an equality predicate, we cannot handle it as "between" - if (Double.compare(leftValue, rightValue) != 0) { - return kll.getN() * betweenSelectivity(kll, leftValue, rightValue) / t.getTable().getRowCount(); + // when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted + // if there's a cast, the inversion is with respect to its codomain (range of the values of the cast) + double typeRangeSelectivity = rangedSelectivity(kll, typeBoundaries.getValue()); + rawSelectivity = typeRangeSelectivity - rawSelectivity; } + return scaleSelectivityToNullableValues(kll, rawSelectivity, scan); } } return computeFunctionSelectivity(call); } - private float extractLiteral(SqlTypeName typeName, Object boundValueObject) { + private Optional extractLiteral(RexNode node) { + if (node.getKind() != SqlKind.LITERAL) { + return Optional.empty(); + } 
+ RexLiteral literal = (RexLiteral) node; + if (literal.getValue() == null) { + return Optional.empty(); + } + return extractLiteral(literal.getTypeName(), literal.getValue()); + } + + private Optional extractLiteral(SqlTypeName typeName, Object boundValueObject) { final String boundValueString = boundValueObject.toString(); float value; @@ -299,10 +511,10 @@ private float extractLiteral(SqlTypeName typeName, Object boundValueObject) { value = ((GregorianCalendar) boundValueObject).toInstant().getEpochSecond(); break; default: - throw new IllegalStateException( - "Unsupported type for comparator selectivity evaluation using histogram: " + typeName); + LOG.warn("Unsupported type for comparator selectivity evaluation using histogram: {}", typeName); + return Optional.empty(); } - return value; + return Optional.of(value); } /** @@ -470,7 +682,7 @@ private boolean isPartitionPredicate(RexNode expr, RelNode r) { } else if (r instanceof Filter) { return isPartitionPredicate(expr, ((Filter) r).getInput()); } else if (r instanceof HiveTableScan) { - RelOptHiveTable table = (RelOptHiveTable) ((HiveTableScan) r).getTable(); + RelOptHiveTable table = (RelOptHiveTable) r.getTable(); ImmutableBitSet cols = RelOptUtil.InputFinder.bits(expr); return table.containsPartitionColumnsOnly(cols); } @@ -489,7 +701,28 @@ public Double visitLiteral(RexLiteral literal) { return null; } - private static double rangedSelectivity(KllFloatsSketch kll, float val1, float val2) { + /** + * Returns the selectivity of a predicate "val1 <= column < val2". + * @param kll the sketch + * @param boundaries the boundaries + * @return the selectivity of "val1 <= column < val2" + */ + private static double rangedSelectivity(KllFloatsSketch kll, FloatInterval boundaries) { + FloatInterval closedOpen = boundaries.getRightHalfOpenInterval(); + return rangedSelectivity(kll, closedOpen.lower, closedOpen.upper); + } + + /** + * Returns the selectivity of a predicate "val1 <= column < val2". 
+ * @param kll the sketch + * @param val1 lower bound (inclusive) + * @param val2 upper bound (exclusive) + * @return the selectivity of "val1 <= column < val2" + */ + static double rangedSelectivity(KllFloatsSketch kll, float val1, float val2) { + if (val1 >= val2) { + return 0; + } float[] splitPoints = new float[] { val1, val2 }; double[] boundaries = kll.getCDF(splitPoints, QuantileSearchCriteria.EXCLUSIVE); return boundaries[1] - boundaries[0]; @@ -574,7 +807,7 @@ public static double betweenSelectivity(KllFloatsSketch kll, float leftValue, fl "Selectivity for BETWEEN leftValue AND rightValue when the two values coincide is not supported, found: " + "leftValue = " + leftValue + " and rightValue = " + rightValue); } - return rangedSelectivity(kll, Math.nextDown(leftValue), Math.nextUp(rightValue)); + return rangedSelectivity(kll, leftValue, Math.nextUp(rightValue)); } /** diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java index 4255c756e078..e5e48c9f1b57 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java @@ -17,7 +17,6 @@ */ package org.apache.hadoop.hive.ql.optimizer.calcite.stats; -import com.google.common.collect.ImmutableList; import org.apache.calcite.jdbc.JavaTypeFactoryImpl; import org.apache.calcite.plan.RelOptCluster; import org.apache.calcite.plan.RelOptPlanner; @@ -27,11 +26,16 @@ import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeFactory; import org.apache.calcite.rex.RexBuilder; +import org.apache.calcite.rex.RexCall; +import org.apache.calcite.rex.RexLiteral; import org.apache.calcite.rex.RexNode; +import org.apache.calcite.sql.SqlBinaryOperator; +import 
org.apache.calcite.sql.SqlOperator; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.sql.type.SqlTypeName; import org.apache.calcite.tools.RelBuilder; import org.apache.calcite.util.ImmutableBitSet; +import org.apache.commons.lang3.mutable.MutableObject; import org.apache.datasketches.kll.KllFloatsSketch; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.StatisticsTestUtils; @@ -43,6 +47,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; import org.apache.hadoop.hive.ql.parse.CalcitePlanner; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; @@ -51,24 +56,73 @@ import org.mockito.Mock; import org.mockito.junit.MockitoJUnitRunner; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalTime; +import java.time.ZoneOffset; import java.util.Collections; - +import java.util.Objects; + +import static org.apache.calcite.sql.type.SqlTypeName.BIGINT; +import static org.apache.calcite.sql.type.SqlTypeName.DOUBLE; +import static org.apache.calcite.sql.type.SqlTypeName.FLOAT; +import static org.apache.calcite.sql.type.SqlTypeName.INTEGER; +import static org.apache.calcite.sql.type.SqlTypeName.SMALLINT; +import static org.apache.calcite.sql.type.SqlTypeName.TINYINT; import static org.apache.hadoop.hive.ql.optimizer.calcite.stats.FilterSelectivityEstimator.betweenSelectivity; import static org.apache.hadoop.hive.ql.optimizer.calcite.stats.FilterSelectivityEstimator.greaterThanOrEqualSelectivity; import static org.apache.hadoop.hive.ql.optimizer.calcite.stats.FilterSelectivityEstimator.greaterThanSelectivity; import static org.apache.hadoop.hive.ql.optimizer.calcite.stats.FilterSelectivityEstimator.isHistogramAvailable; import static 
org.apache.hadoop.hive.ql.optimizer.calcite.stats.FilterSelectivityEstimator.lessThanOrEqualSelectivity; import static org.apache.hadoop.hive.ql.optimizer.calcite.stats.FilterSelectivityEstimator.lessThanSelectivity; +import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; @RunWith(MockitoJUnitRunner.class) public class TestFilterSelectivityEstimator { + private static final SqlBinaryOperator GT = SqlStdOperatorTable.GREATER_THAN; + private static final SqlBinaryOperator GE = SqlStdOperatorTable.GREATER_THAN_OR_EQUAL; + private static final SqlBinaryOperator LT = SqlStdOperatorTable.LESS_THAN; + private static final SqlBinaryOperator LE = SqlStdOperatorTable.LESS_THAN_OR_EQUAL; + private static final SqlOperator BETWEEN = HiveBetween.INSTANCE; + private static final float[] VALUES = { 1, 2, 2, 2, 2, 2, 2, 2, 3, 4, 5, 6, 7 }; + private static final float[] VALUES2 = { + // rounding for DECIMAL(3,1) + // -99.95f and its two predecessors and successors + -99.95001f, -99.950005f, -99.95f, -99.94999f, -99.94998f, + // some values + 0f, 1f, 10f, + // rounding for DECIMAL(3,1) + // 99.95f and its two predecessors and successors + 99.94998f, 99.94999f, 99.95f, 99.950005f, 99.95001f, + // 100f and its two predecessors and successors + 99.999985f, 99.99999f, 100f, 100.00001f, 100.000015f, + // 100.05f and its two predecessors and successors + 100.04999f, 100.049995f, 100.05f, 100.05001f, 100.05002f, + // some values + 1_000f, 10_000f, 100_000f, 1_000_000f, 10_000_000f }; + + /** + * Both dates and timestamps are converted to epoch seconds. + *

+ * See {@link org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUnixTimeStamp#evaluate(GenericUDF.DeferredObject[])}. + */ + private static final float[] VALUES_TIME = { + timestamp("2020-11-01"), timestamp("2020-11-02"), timestamp("2020-11-03"), timestamp("2020-11-04"), + timestamp("2020-11-05T11:23:45Z"), timestamp("2020-11-06"), timestamp("2020-11-07") }; + private static final KllFloatsSketch KLL = StatisticsTestUtils.createKll(VALUES); - private static final float DELTA = Float.MIN_VALUE; + private static final KllFloatsSketch KLL2 = StatisticsTestUtils.createKll(VALUES2); + private static final KllFloatsSketch KLL_TIME = StatisticsTestUtils.createKll(VALUES_TIME); + private static final float DELTA = 1e-7f; private static final RexBuilder REX_BUILDER = new RexBuilder(new JavaTypeFactoryImpl(new HiveTypeSystemImpl())); private static final RelDataTypeFactory TYPE_FACTORY = REX_BUILDER.getTypeFactory(); + private static RelOptCluster relOptCluster; private static RexNode intMinus1; private static RexNode int0; @@ -85,7 +139,6 @@ public class TestFilterSelectivityEstimator { private static RexNode inputRef0; private static RexNode boolFalse; private static RexNode boolTrue; - private static ColStatistics stats; @Mock private RelOptSchema schemaMock; @@ -94,12 +147,14 @@ public class TestFilterSelectivityEstimator { @Mock private RelMetadataQuery mq; - private HiveTableScan tableScan; + private ColStatistics stats; private RelNode scan; + private RexNode currentInputRef; + private final MutableObject currentValues = new MutableObject<>(); @BeforeClass public static void beforeClass() { - RelDataType integerType = TYPE_FACTORY.createSqlType(SqlTypeName.INTEGER); + RelDataType integerType = TYPE_FACTORY.createSqlType(INTEGER); intMinus1 = REX_BUILDER.makeLiteral(-1, integerType, true); int0 = REX_BUILDER.makeLiteral(0, integerType, true); int1 = REX_BUILDER.makeLiteral(1, integerType, true); @@ -113,25 +168,54 @@ public static void beforeClass() { int11 = 
REX_BUILDER.makeLiteral(11, integerType, true); boolFalse = REX_BUILDER.makeLiteral(false, TYPE_FACTORY.createSqlType(SqlTypeName.BOOLEAN), true); boolTrue = REX_BUILDER.makeLiteral(true, TYPE_FACTORY.createSqlType(SqlTypeName.BOOLEAN), true); - tableType = TYPE_FACTORY.createStructType(ImmutableList.of(integerType), ImmutableList.of("f1")); + RelDataTypeFactory.Builder b = new RelDataTypeFactory.Builder(TYPE_FACTORY); + b.add("f_numeric", decimalType(38, 25)); + b.add("f_timestamp", SqlTypeName.TIMESTAMP); + b.add("f_date", SqlTypeName.DATE).build(); + tableType = b.build(); RelOptPlanner planner = CalcitePlanner.createPlanner(new HiveConf()); relOptCluster = RelOptCluster.create(planner, REX_BUILDER); + } - stats = new ColStatistics(); - stats.setHistogram(KLL.toByteArray()); + private static ColStatistics.Range rangeOf(float[] values) { + float min = Float.MAX_VALUE, max = -Float.MAX_VALUE; + for (float v : values) { + min = Math.min(min, v); + max = Math.max(max, v); + } + return new ColStatistics.Range(min, max); } @Before public void before() { + currentValues.setValue(VALUES); doReturn(tableType).when(tableMock).getRowType(); - doReturn((double) VALUES.length).when(tableMock).getRowCount(); + when(tableMock.getRowCount()).thenAnswer(a -> (double) Objects.requireNonNull(currentValues.getValue()).length); RelBuilder relBuilder = HiveRelFactories.HIVE_BUILDER.create(relOptCluster, schemaMock); - tableScan = new HiveTableScan(relOptCluster, relOptCluster.traitSetOf(HiveRelNode.CONVENTION), - tableMock, "table", null, false, false); + HiveTableScan tableScan = + new HiveTableScan(relOptCluster, relOptCluster.traitSetOf(HiveRelNode.CONVENTION), tableMock, "table", null, + false, false); scan = relBuilder.push(tableScan).build(); inputRef0 = REX_BUILDER.makeInputRef(scan, 0); + currentInputRef = inputRef0; + + stats = new ColStatistics(); + stats.setHistogram(KLL.toByteArray()); + stats.setRange(rangeOf(VALUES)); + } + + /** + * Note: call this method only at the 
beginning of a test method. + */ + private void useFieldWithValues(String fieldname, float[] values, KllFloatsSketch sketch) { + currentValues.setValue(values); + stats.setHistogram(sketch.toByteArray()); + stats.setRange(rangeOf(values)); + int fieldIndex = scan.getRowType().getFieldNames().indexOf(fieldname); + currentInputRef = REX_BUILDER.makeInputRef(scan, fieldIndex); + doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(fieldIndex)); } @Test @@ -420,7 +504,7 @@ public void testComputeRangePredicateSelectivityBetweenLeftLowerThanRight() { @Test public void testComputeRangePredicateSelectivityBetweenLeftEqualsRight() { - doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0)); + verify(tableMock, never()).getColStat(any()); doReturn(10.0).when(mq).getDistinctRowCount(scan, ImmutableBitSet.of(0), REX_BUILDER.makeLiteral(true)); RexNode filter = REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolFalse, inputRef0, int3, int3); FilterSelectivityEstimator estimator = new FilterSelectivityEstimator(scan, mq); @@ -454,7 +538,7 @@ public void testComputeRangePredicateSelectivityNotBetweenRightLowerThanLeft() { @Test public void testComputeRangePredicateSelectivityNotBetweenLeftEqualsRight() { - doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0)); + verify(tableMock, never()).getColStat(any()); RexNode filter = REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolTrue, inputRef0, int3, int3); FilterSelectivityEstimator estimator = new FilterSelectivityEstimator(scan, mq); Assert.assertEquals(1, estimator.estimateSelectivity(filter), DELTA); @@ -511,6 +595,292 @@ public void testComputeRangePredicateSelectivityNotBetweenWithNULLS() { doReturn(Collections.singletonList(stats)).when(tableMock).getColStat(Collections.singletonList(0)); RexNode filter = REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolTrue, inputRef0, int1, int3); 
FilterSelectivityEstimator estimator = new FilterSelectivityEstimator(scan, mq); - Assert.assertEquals(0.55, estimator.estimateSelectivity(filter), DELTA); + // only the values 4, 5, 6, 7 fulfill the condition NOT BETWEEN 1 AND 3 + // (the NULL values do not fulfill the condition) + Assert.assertEquals(0.2, estimator.estimateSelectivity(filter), DELTA); + } + + @Test + public void testComputeRangePredicateSelectivityWithCast() { + useFieldWithValues("f_numeric", VALUES, KLL); + checkSelectivity(3 / 13.f, ge(cast("f_numeric", TINYINT), int5)); + checkSelectivity(10 / 13.f, lt(cast("f_numeric", TINYINT), int5)); + checkSelectivity(2 / 13.f, gt(cast("f_numeric", TINYINT), int5)); + checkSelectivity(11 / 13.f, le(cast("f_numeric", TINYINT), int5)); + + checkSelectivity(12 / 13f, ge(cast("f_numeric", TINYINT), int2)); + checkSelectivity(1 / 13f, lt(cast("f_numeric", TINYINT), int2)); + checkSelectivity(5 / 13f, gt(cast("f_numeric", TINYINT), int2)); + checkSelectivity(8 / 13f, le(cast("f_numeric", TINYINT), int2)); + + // check some types + checkSelectivity(3 / 13.f, ge(cast("f_numeric", INTEGER), int5)); + checkSelectivity(3 / 13.f, ge(cast("f_numeric", SMALLINT), int5)); + checkSelectivity(3 / 13.f, ge(cast("f_numeric", BIGINT), int5)); + checkSelectivity(3 / 13.f, ge(cast("f_numeric", FLOAT), int5)); + checkSelectivity(3 / 13.f, ge(cast("f_numeric", DOUBLE), int5)); + } + + @Test + public void testComputeRangePredicateSelectivityWithCast2() { + useFieldWithValues("f_numeric", VALUES2, KLL2); + RelDataType decimal3s1 = decimalType(3, 1); + checkSelectivity(4 / 28.f, ge(cast("f_numeric", decimal3s1), literalFloat(1))); + + // values from -99.94999 to 99.94999 (both inclusive) + checkSelectivity(7 / 28.f, lt(cast("f_numeric", decimal3s1), literalFloat(100))); + checkSelectivity(7 / 28.f, le(cast("f_numeric", decimal3s1), literalFloat(100))); + checkSelectivity(0 / 28.f, gt(cast("f_numeric", decimal3s1), literalFloat(100))); + checkSelectivity(0 / 28.f, 
ge(cast("f_numeric", decimal3s1), literalFloat(100))); + + RelDataType decimal4s1 = decimalType(4, 1); + checkSelectivity(10 / 28.f, lt(cast("f_numeric", decimal4s1), literalFloat(100))); + checkSelectivity(20 / 28.f, le(cast("f_numeric", decimal4s1), literalFloat(100))); + checkSelectivity(3 / 28.f, gt(cast("f_numeric", decimal4s1), literalFloat(100))); + checkSelectivity(13 / 28.f, ge(cast("f_numeric", decimal4s1), literalFloat(100))); + + RelDataType decimal2s1 = decimalType(2, 1); + checkSelectivity(2 / 28.f, lt(cast("f_numeric", decimal2s1), literalFloat(100))); + checkSelectivity(2 / 28.f, le(cast("f_numeric", decimal2s1), literalFloat(100))); + checkSelectivity(0 / 28.f, gt(cast("f_numeric", decimal2s1), literalFloat(100))); + checkSelectivity(0 / 28.f, ge(cast("f_numeric", decimal2s1), literalFloat(100))); + + // expected: 100_000f + RelDataType decimal7s1 = decimalType(7, 1); + checkSelectivity(1 / 28.f, gt(cast("f_numeric", decimal7s1), literalFloat(10000))); + + // expected: 10_000f, 100_000f, because CAST(1_000_000 AS DECIMAL(7,1)) = NULL, and similar for even larger values + checkSelectivity(2 / 28.f, ge(cast("f_numeric", decimal7s1), literalFloat(9999))); + checkSelectivity(2 / 28.f, ge(cast("f_numeric", decimal7s1), literalFloat(10000))); + + // expected: 100_000f + checkSelectivity(1 / 28.f, gt(cast("f_numeric", decimal7s1), literalFloat(10000))); + checkSelectivity(1 / 28.f, gt(cast("f_numeric", decimal7s1), literalFloat(10001))); + + // expected 1f, 10f, 99.94998f, 99.94999f + checkSelectivity(4 / 28.f, ge(cast("f_numeric", decimal3s1), literalFloat(1))); + checkSelectivity(3 / 28.f, gt(cast("f_numeric", decimal3s1), literalFloat(1))); + // expected -99.94999f, -99.94998f, 0f, 1f + checkSelectivity(4 / 28.f, le(cast("f_numeric", decimal3s1), literalFloat(1))); + checkSelectivity(3 / 28.f, lt(cast("f_numeric", decimal3s1), literalFloat(1))); + + // the cast would apply a modulo operation to the values outside the range of the cast + // so instead a 
default selectivity should be returned + checkSelectivity(1 / 3.f, lt(cast("f_numeric", TINYINT), literalFloat(100))); + checkSelectivity(1 / 3.f, lt(cast("f_numeric", TINYINT), literalFloat(100))); + } + + private void checkTimeFieldOnMidnightTimestamps(RexNode field) { + // note: use only values from VALUES_TIME that specify a date without hh:mm:ss! + checkSelectivity(7 / 7.f, ge(field, literalTimestamp("2020-11-01"))); + checkSelectivity(5 / 7.f, ge(field, literalTimestamp("2020-11-03"))); + checkSelectivity(1 / 7.f, ge(field, literalTimestamp("2020-11-07"))); + + checkSelectivity(6 / 7.f, gt(field, literalTimestamp("2020-11-01"))); + checkSelectivity(4 / 7.f, gt(field, literalTimestamp("2020-11-03"))); + checkSelectivity(0 / 7.f, gt(field, literalTimestamp("2020-11-07"))); + + checkSelectivity(1 / 7.f, le(field, literalTimestamp("2020-11-01"))); + checkSelectivity(3 / 7.f, le(field, literalTimestamp("2020-11-03"))); + checkSelectivity(7 / 7.f, le(field, literalTimestamp("2020-11-07"))); + + checkSelectivity(0 / 7.f, lt(field, literalTimestamp("2020-11-01"))); + checkSelectivity(2 / 7.f, lt(field, literalTimestamp("2020-11-03"))); + checkSelectivity(6 / 7.f, lt(field, literalTimestamp("2020-11-07"))); + } + + private void checkTimeFieldOnIntraDayTimestamps(RexNode field) { + checkSelectivity(3 / 7.f, ge(field, literalTimestamp("2020-11-05T11:23:45Z"))); + checkSelectivity(2 / 7.f, gt(field, literalTimestamp("2020-11-05T11:23:45Z"))); + checkSelectivity(5 / 7.f, le(field, literalTimestamp("2020-11-05T11:23:45Z"))); + checkSelectivity(4 / 7.f, lt(field, literalTimestamp("2020-11-05T11:23:45Z"))); + } + + @Test + public void testComputeRangePredicateSelectivityTimestamp() { + useFieldWithValues("f_timestamp", VALUES_TIME, KLL_TIME); + checkTimeFieldOnMidnightTimestamps(currentInputRef); + checkTimeFieldOnIntraDayTimestamps(currentInputRef); + } + + @Test + public void testComputeRangePredicateSelectivityDate() { + useFieldWithValues("f_date", VALUES_TIME, 
KLL_TIME); + checkTimeFieldOnMidnightTimestamps(currentInputRef); + + // it does not make sense to compare with "2020-11-05T11:23:45Z", + // as that value would not be stored as-is in a date column, but as "2020-11-05" instead + } + + @Test + public void testComputeRangePredicateSelectivityDateWithCast() { + useFieldWithValues("f_date", VALUES_TIME, KLL_TIME); + RexNode field1 = cast("f_date", SqlTypeName.DATE); + checkTimeFieldOnMidnightTimestamps(field1); + checkTimeFieldOnIntraDayTimestamps(field1); + + RexNode field2 = cast("f_date", SqlTypeName.TIMESTAMP); + checkTimeFieldOnMidnightTimestamps(field2); + checkTimeFieldOnIntraDayTimestamps(field2); + } + + @Test + public void testComputeRangePredicateSelectivityTimestampWithCast() { + useFieldWithValues("f_timestamp", VALUES_TIME, KLL_TIME); + checkTimeFieldOnMidnightTimestamps(cast("f_timestamp", SqlTypeName.DATE)); + checkTimeFieldOnMidnightTimestamps(cast("f_timestamp", SqlTypeName.TIMESTAMP)); + } + + @Test + public void testComputeRangePredicateSelectivityBetweenWithCastDecimal2_1() { + useFieldWithValues("f_numeric", VALUES2, KLL2); + float total = VALUES2.length; + float universe = 2; // the number of values that "survive" the cast + RexNode cast = REX_BUILDER.makeCast(decimalType(2, 1), inputRef0); + checkBetweenSelectivity(0, universe, total, cast, 100f, 1000f); + checkBetweenSelectivity(1, universe, total, cast, 1f, 100f); + checkBetweenSelectivity(0, universe, total, cast, 100f, 0f); + } + + @Test + public void testComputeRangePredicateSelectivityBetweenWithCastDecimal3_1() { + useFieldWithValues("f_numeric", VALUES2, KLL2); + float total = VALUES2.length; + float universe = 7; + RexNode cast = REX_BUILDER.makeCast(decimalType(3, 1), inputRef0); + checkBetweenSelectivity(0, universe, total, cast, 100f, 1000f); + checkBetweenSelectivity(4, universe, total, cast, 1f, 100f); + checkBetweenSelectivity(0, universe, total, cast, 100f, 0f); + } + + @Test + public void 
testComputeRangePredicateSelectivityBetweenWithCastDecimal4_1() { + useFieldWithValues("f_numeric", VALUES2, KLL2); + float total = VALUES2.length; + float universe = 23; + RexNode cast = REX_BUILDER.makeCast(decimalType(4, 1), inputRef0); + // the values between -999.94999... and 999.94999... (both inclusive) pass through the cast + // the values between 99.95 and 100 are rounded up to 100, so they fulfill the BETWEEN + checkBetweenSelectivity(13, universe, total, cast, 100, 1000); + checkBetweenSelectivity(14, universe, total, cast, 1f, 100f); + checkBetweenSelectivity(0, universe, total, cast, 100f, 0f); + } + + @Test + public void testComputeRangePredicateSelectivityBetweenWithCastDecimal7_1() { + useFieldWithValues("f_numeric", VALUES2, KLL2); + float total = VALUES2.length; + float universe = 26; + RexNode cast = REX_BUILDER.makeCast(decimalType(7, 1), inputRef0); + checkBetweenSelectivity(14, universe, total, cast, 100, 1000); + checkBetweenSelectivity(14, universe, total, cast, 1f, 100f); + checkBetweenSelectivity(0, universe, total, cast, 100f, 0f); + } + + private void checkSelectivity(float expectedSelectivity, RexNode filter) { + FilterSelectivityEstimator estimator = new FilterSelectivityEstimator(scan, mq); + Assert.assertEquals(filter.toString(), expectedSelectivity, estimator.estimateSelectivity(filter), DELTA); + + // swap equation, e.g., col < 5 becomes 5 > col; selectivity stays the same + RexCall call = (RexCall) filter; + SqlOperator operator = ((RexCall) filter).getOperator(); + SqlOperator swappedOp; + if (operator == LE) { + swappedOp = GE; + } else if (operator == LT) { + swappedOp = GT; + } else if (operator == GE) { + swappedOp = LE; + } else if (operator == GT) { + swappedOp = LT; + } else if (operator == BETWEEN) { + // BETWEEN cannot be swapped + return; + } else { + throw new UnsupportedOperationException(); + } + RexNode swapped = REX_BUILDER.makeCall(swappedOp, call.getOperands().get(1), call.getOperands().get(0)); + 
Assert.assertEquals(filter.toString(), expectedSelectivity, estimator.estimateSelectivity(swapped), DELTA); + } + + private void checkBetweenSelectivity(float expectedEntries, float universe, float total, RexNode value, float lower, + float upper) { + RexNode betweenFilter = + REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolFalse, value, literalFloat(lower), literalFloat(upper)); + FilterSelectivityEstimator estimator = new FilterSelectivityEstimator(scan, mq); + String between = "BETWEEN " + lower + " AND " + upper; + float expectedSelectivity = expectedEntries / total; + String message = between + ": calcite filter " + betweenFilter.toString(); + Assert.assertEquals(message, expectedSelectivity, estimator.estimateSelectivity(betweenFilter), DELTA); + + // invert the filter to a NOT BETWEEN + RexNode invBetween = + REX_BUILDER.makeCall(HiveBetween.INSTANCE, boolTrue, value, literalFloat(lower), literalFloat(upper)); + String invMessage = "NOT " + between + ": calcite filter " + invBetween.toString(); + float invExpectedSelectivity = (universe - expectedEntries) / total; + Assert.assertEquals(invMessage, invExpectedSelectivity, estimator.estimateSelectivity(invBetween), DELTA); + } + + private RexNode cast(String fieldname, SqlTypeName typeName) { + return cast(fieldname, type(typeName)); } + + private RexNode cast(String fieldname, RelDataType type) { + int fieldIndex = scan.getRowType().getFieldNames().indexOf(fieldname); + RexNode column = REX_BUILDER.makeInputRef(scan, fieldIndex); + return REX_BUILDER.makeCast(type, column); + } + + private RexNode ge(RexNode expr, RexNode value) { + return REX_BUILDER.makeCall(GE, expr, value); + } + + private RexNode gt(RexNode expr, RexNode value) { + return REX_BUILDER.makeCall(GT, expr, value); + } + + private RexNode le(RexNode expr, RexNode value) { + return REX_BUILDER.makeCall(LE, expr, value); + } + + private RexNode lt(RexNode expr, RexNode value) { + return REX_BUILDER.makeCall(LT, expr, value); + } + + private 
static RelDataType type(SqlTypeName typeName) { + return REX_BUILDER.getTypeFactory().createSqlType(typeName); + } + + private static RelDataType decimalType(int precision, int scale) { + return REX_BUILDER.getTypeFactory().createSqlType(SqlTypeName.DECIMAL, precision, scale); + } + + private static RexLiteral literalTimestamp(String timestamp) { + return REX_BUILDER.makeLiteral(timestampMillis(timestamp), + REX_BUILDER.getTypeFactory().createSqlType(SqlTypeName.TIMESTAMP)); + } + + private static RexLiteral literalDate(String date) { + return REX_BUILDER.makeLiteral(epochDay(date), REX_BUILDER.getTypeFactory().createSqlType(SqlTypeName.DATE)); + } + + private RexNode literalFloat(float f) { + return REX_BUILDER.makeLiteral(f, type(SqlTypeName.FLOAT)); + } + + private static long timestampMillis(String timestamp) { + if (!timestamp.contains(":")) { + return LocalDate.parse(timestamp).toEpochSecond(LocalTime.MIDNIGHT, ZoneOffset.UTC) * 1000; + } + return Instant.parse(timestamp).toEpochMilli(); + } + + private static long timestamp(String timestamp) { + return timestampMillis(timestamp) / 1000; + } + + private static int epochDay(String date) { + return (int) LocalDate.parse(date).toEpochDay(); + } + } From 11e6c0dfddd4569a5d4ba19708c4bdc11925661f Mon Sep 17 00:00:00 2001 From: Stamatis Zampetakis Date: Fri, 20 Feb 2026 15:08:01 +0100 Subject: [PATCH 02/11] Decouple CAST removal and type boundary adjustment The removeCastIfPossible was doing three things: 1) Checking if a cast can be removed based on column stats 2) Removing the cast if possible 3) Adjusting the boundaries in case of DECIMAL casts After the refactoring the three actions are decoupled and each is performed individually. This leads to smaller and more self-contained methods that are easier to follow. 
--- .../stats/FilterSelectivityEstimator.java | 72 ++++++++++--------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java index 1d778b77d8fa..5977b5a3a01b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java @@ -33,6 +33,7 @@ import org.apache.calcite.rel.core.Filter; import org.apache.calcite.rel.core.Project; import org.apache.calcite.rel.metadata.RelMetadataQuery; +import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rex.RexBuilder; import org.apache.calcite.rex.RexCall; import org.apache.calcite.rex.RexInputRef; @@ -204,36 +205,31 @@ public Double visitCall(RexCall call) { } /** - * If the cast can be removed, just return its operand and adjust the boundaries if necessary. + * Return whether the expression is a removable cast based on stats and type bounds. * *

- * In Hive, if a value cannot be represented by the cast, the result of the cast is NULL, - * and therefore cannot fulfill the predicate. So the possible range of the values - * is limited by the range of possible values of the type. + * In Hive, if a value cannot be represented by the cast, the result of the cast is NULL, + * and therefore cannot fulfill the predicate. So the possible range of the values + * is limited by the range of possible values of the type. *

* - *

- * Special care is taken to support the cast to DECIMAL(precision, scale): - * The cast to DECIMAL rounds the value the same way as {@link RoundingMode#HALF_UP}. - * The boundaries are adjusted accordingly. - *

- * - * @param cast a RexCall of type {@link SqlKind#CAST} + * @param exp the expression to check * @param tableScan the table that provides the statistics - * @param rangeBoundaries see {@link #adjustBoundariesForDecimal(RexCall, MutableObject, MutableObject)}; might get modified - * @param typeBoundaries see {@link #adjustBoundariesForDecimal(RexCall, MutableObject, MutableObject)}; might get modified - * @return the operand if the cast can be removed, otherwise the cast itself + * @return true if the expression is a removable cast, false otherwise */ - private RexNode removeCastIfPossible(RexCall cast, HiveTableScan tableScan, - MutableObject rangeBoundaries, MutableObject typeBoundaries) { + private boolean isRemovableCast(RexNode exp, HiveTableScan tableScan) { + if(SqlKind.CAST != exp.getKind()) { + return false; + } + RexCall cast = (RexCall) exp; RexNode op0 = cast.getOperands().getFirst(); if (!(op0 instanceof RexInputRef)) { - return cast; + return false; } int index = ((RexInputRef) op0).getIndex(); final List colStats = tableScan.getColStat(Collections.singletonList(index)); if (colStats.isEmpty()) { - return cast; + return false; } // we need to check that the possible values of the input to the cast are all within the type range of the cast @@ -242,7 +238,7 @@ private RexNode removeCastIfPossible(RexCall cast, HiveTableScan tableScan, ColStatistics.Range range = colStat.getRange(); if (range == null || range.minValue == null || Double.isNaN( range.minValue.doubleValue()) || range.maxValue == null || Double.isNaN(range.maxValue.doubleValue())) { - return cast; + return false; } SqlTypeName type = cast.getType().getSqlTypeName(); @@ -268,32 +264,36 @@ private RexNode removeCastIfPossible(RexCall cast, HiveTableScan tableScan, break; default: // unknown type, do not remove the cast - return cast; + return false; } // see (*) if (range.minValue.doubleValue() < min || range.maxValue.doubleValue() > max) { - return cast; - } - - if (type == 
SqlTypeName.DECIMAL) { - adjustBoundariesForDecimal(cast, rangeBoundaries, typeBoundaries); + return false; } - - return op0; + return true; } /** - * Adjust the boundaries for a DECIMAL cast. + * Adjust the type boundaries if necessary. + * + *

+ * Special care is taken to support the cast to DECIMAL(precision, scale): + * The cast to DECIMAL rounds the value the same way as {@link RoundingMode#HALF_UP}. + * The boundaries are adjusted accordingly. + *

* * @param rangeBoundaries boundaries of the range predicate * @param typeBoundaries if not null, will be set to the boundaries of the type range */ - private static void adjustBoundariesForDecimal(RexCall cast, MutableObject rangeBoundaries, + private static void adjustTypeBoundaries(RelDataType type, MutableObject rangeBoundaries, MutableObject typeBoundaries) { + if (type.getSqlTypeName() != SqlTypeName.DECIMAL) { + return; + } // values outside the representable range are cast to NULL, so adapt the boundaries - int precision = cast.getType().getPrecision(); - int scale = cast.getType().getScale(); + int precision = type.getPrecision(); + int scale = type.getScale(); int digits = precision - scale; // the cast does some rounding, i.e., CAST(99.9499 AS DECIMAL(3,1)) = 99.9 // but CAST(99.95 AS DECIMAL(3,1)) = NULL @@ -358,8 +358,9 @@ private double computeRangePredicateSelectivity(RexCall call, SqlKind op) { final HiveTableScan scan = (HiveTableScan) childRel; int inputRefOpIndex = 1 - literalOpIdx; RexNode node = operands.get(inputRefOpIndex); - if (node.getKind().equals(SqlKind.CAST)) { - node = removeCastIfPossible((RexCall) node, scan, boundaries, null); + if (isRemovableCast(node, scan)) { + adjustTypeBoundaries(node.getType(), boundaries, null); + node = RexUtil.removeCast(node); } int inputRefIndex = -1; @@ -439,8 +440,9 @@ private Double computeBetweenPredicateSelectivity(RexCall call) { new FloatInterval(Float.NEGATIVE_INFINITY, true, Float.POSITIVE_INFINITY, true)) : null; RexNode expr = operands.get(1); // expr to be checked by the BETWEEN - if (expr.getKind().equals(SqlKind.CAST)) { - expr = removeCastIfPossible((RexCall) expr, scan, rangeBoundaries, typeBoundaries); + if (isRemovableCast(expr, scan)) { + adjustTypeBoundaries(expr.getType(), rangeBoundaries, typeBoundaries); + expr = RexUtil.removeCast(expr); } int inputRefIndex = -1; From 638376e2b54b591763232c30f7f32ead89a52952 Mon Sep 17 00:00:00 2001 From: Stamatis Zampetakis Date: Fri, 20 Feb 
2026 16:13:31 +0100 Subject: [PATCH 03/11] Generalize StatsUtils#isWithin and use in FilterSelectivityEstimator --- .../stats/FilterSelectivityEstimator.java | 18 +++++-------- .../hadoop/hive/ql/stats/StatsUtils.java | 26 +++++++++++++------ 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java index 5977b5a3a01b..e2013a5d30bc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java @@ -57,6 +57,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; import org.apache.hadoop.hive.ql.plan.ColStatistics; import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -232,12 +233,11 @@ private boolean isRemovableCast(RexNode exp, HiveTableScan tableScan) { return false; } - // we need to check that the possible values of the input to the cast are all within the type range of the cast - // otherwise the CAST introduces some modulo-like behavior (*) + // Check that the possible values of the input column are all within the type range of the cast + // otherwise the CAST introduces some modulo-like behavior ColStatistics colStat = colStats.getFirst(); - ColStatistics.Range range = colStat.getRange(); - if (range == null || range.minValue == null || Double.isNaN( - range.minValue.doubleValue()) || range.maxValue == null || Double.isNaN(range.maxValue.doubleValue())) { + ColStatistics.Range colRange = colStat.getRange(); + if (colRange == null) { return false; } @@ -266,12 +266,8 @@ private boolean isRemovableCast(RexNode exp, HiveTableScan tableScan) { // unknown type, do not remove the cast return 
false; } - - // see (*) - if (range.minValue.doubleValue() < min || range.maxValue.doubleValue() > max) { - return false; - } - return true; + ColStatistics.Range typeRange = new ColStatistics.Range(min, max); + return StatsUtils.isWithin(colRange, typeRange, Number::doubleValue); } /** diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index c530633fbf1c..ea29206cb963 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -37,6 +37,7 @@ import java.util.concurrent.Future; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; import java.util.stream.Collectors; import org.apache.commons.lang3.math.NumberUtils; @@ -509,7 +510,7 @@ public static boolean inferForeignKey(ColStatistics csPK, ColStatistics csFK) { if (csPK.getRange() != null && csFK.getRange() != null) { ColStatistics.Range pkRange = csPK.getRange(); ColStatistics.Range fkRange = csFK.getRange(); - return isWithin(fkRange, pkRange); + return isWithin(fkRange, pkRange, Number::longValue); } } } @@ -546,13 +547,22 @@ public static long getRangeDelta(ColStatistics.Range range) { return 0; } - private static boolean isWithin(ColStatistics.Range range1, ColStatistics.Range range2) { - if (range1.minValue != null && range2.minValue != null && range1.maxValue != null && - range2.maxValue != null) { - if (range1.minValue.longValue() >= range2.minValue.longValue() && - range1.maxValue.longValue() <= range2.maxValue.longValue()) { - return true; - } + /** + * Returns whether range r1 is fully contained within r2. The comparison is done by applying + * the converter function to the min and max values of both ranges. 
+ * @param r1 the first range + * @param r2 the second range + * @param converter the converter function to apply to the min and max values of the ranges + * @return true if r1 is fully contained within r2, false otherwise + */ + public static > boolean isWithin(ColStatistics.Range r1, ColStatistics.Range r2, + Function converter) { + if (r1.minValue != null && r2.minValue != null && r1.maxValue != null && r2.maxValue != null) { + T r1Min = converter.apply(r1.minValue); + T r1Max = converter.apply(r1.maxValue); + T r2Min = converter.apply(r2.minValue); + T r2Max = converter.apply(r2.maxValue); + return r1Min.compareTo(r2Min) >= 0 && r1Max.compareTo(r2Max) <= 0; } return false; } From ef8dc6c9310845019e74e9478653fc6d24e32412 Mon Sep 17 00:00:00 2001 From: Stamatis Zampetakis Date: Fri, 20 Feb 2026 17:02:19 +0100 Subject: [PATCH 04/11] Replace FloatInterval with Guava's Range API No need to invent new APIs when equivalent exists and used in other places in Hive/Calcite. --- .../stats/FilterSelectivityEstimator.java | 73 ++++++++++--------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java index e2013a5d30bc..266adc90b651 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java @@ -27,6 +27,8 @@ import java.util.Optional; import java.util.Set; +import com.google.common.collect.BoundType; +import com.google.common.collect.Range; import org.apache.calcite.plan.RelOptUtil; import org.apache.calcite.plan.RelOptUtil.InputReferencedVisitor; import org.apache.calcite.rel.RelNode; @@ -65,21 +67,6 @@ public class FilterSelectivityEstimator extends RexVisitorImpl { protected static final Logger LOG = 
LoggerFactory.getLogger(FilterSelectivityEstimator.class); - private record FloatInterval(float lower, boolean lowerInclusive, float upper, boolean upperInclusive) { - public FloatInterval getRightHalfOpenInterval() { - if (lowerInclusive && !upperInclusive) { - return this; - } - float newLower = lowerInclusive ? lower : Math.nextUp(lower); - float newUpper = !upperInclusive ? upper : Math.nextUp(upper); - return new FloatInterval(newLower, true, newUpper, false); - } - - public FloatInterval withValues(float lower, float upper) { - return new FloatInterval(lower, lowerInclusive, upper, upperInclusive); - } - } - private final RelNode childRel; private final double childCardinality; private final RelMetadataQuery mq; @@ -282,8 +269,8 @@ private boolean isRemovableCast(RexNode exp, HiveTableScan tableScan) { * @param rangeBoundaries boundaries of the range predicate * @param typeBoundaries if not null, will be set to the boundaries of the type range */ - private static void adjustTypeBoundaries(RelDataType type, MutableObject rangeBoundaries, - MutableObject typeBoundaries) { + private static void adjustTypeBoundaries(RelDataType type, MutableObject> rangeBoundaries, + MutableObject> typeBoundaries) { if (type.getSqlTypeName() != SqlTypeName.DECIMAL) { return; } @@ -298,19 +285,24 @@ private static void adjustTypeBoundaries(RelDataType type, MutableObject range = rangeBoundaries.getValue(); // the resulting value of +- adjust would be rounded up, so in some cases we need to use Math.nextDown - float adjusted1 = range.lowerInclusive ? range.lower - adjust : Math.nextDown(range.lower + adjust); - float adjusted2 = range.upperInclusive ? Math.nextDown(range.upper + adjust) : range.upper - adjust; + boolean lowerInclusive = BoundType.CLOSED.equals(range.lowerBoundType()); + boolean upperInclusive = BoundType.CLOSED.equals(range.upperBoundType()); + float adjusted1 = lowerInclusive ? 
range.lowerEndpoint() - adjust : Math.nextDown(range.lowerEndpoint() + adjust); + float adjusted2 = upperInclusive ? Math.nextDown(range.upperEndpoint() + adjust) : range.upperEndpoint() - adjust; - float lowerUniverse = range.lowerInclusive ? -typeRangeExtent : Math.nextDown(-typeRangeExtent); - float upperUniverse = range.upperInclusive ? typeRangeExtent : Math.nextUp(typeRangeExtent); + float lowerUniverse = lowerInclusive ? -typeRangeExtent : Math.nextDown(-typeRangeExtent); + float upperUniverse = upperInclusive ? typeRangeExtent : Math.nextUp(typeRangeExtent); float lower = Math.max(adjusted1, lowerUniverse); float upper = Math.min(adjusted2, upperUniverse); - rangeBoundaries.setValue(range.withValues(lower, upper)); + rangeBoundaries.setValue(Range.range(lower, range.lowerBoundType(), upper, range.upperBoundType())); if (typeBoundaries != null) { - typeBoundaries.setValue( - new FloatInterval(lowerUniverse, range.lowerInclusive, upperUniverse, range.upperInclusive)); + typeBoundaries.setValue(Range.range( + lowerUniverse, + range.lowerBoundType(), + upperUniverse, + range.upperBoundType())); } } @@ -344,11 +336,11 @@ private double computeRangePredicateSelectivity(RexCall call, SqlKind op) { return defaultSelectivity; } float[] boundaryValues = new float[] { Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY }; - boolean[] inclusive = new boolean[] { true, true }; + BoundType[] inclusive = new BoundType[] { BoundType.CLOSED, BoundType.CLOSED }; boundaryValues[boundaryIdx] = value; - inclusive[boundaryIdx] = !openBound; - MutableObject boundaries = - new MutableObject<>(new FloatInterval(boundaryValues[0], inclusive[0], boundaryValues[1], inclusive[1])); + inclusive[boundaryIdx] = openBound ? 
BoundType.OPEN : BoundType.CLOSED; + MutableObject> boundaries = + new MutableObject<>(Range.range(boundaryValues[0], inclusive[0], boundaryValues[1], inclusive[1])); // extract the column index from the other operator final HiveTableScan scan = (HiveTableScan) childRel; @@ -430,10 +422,15 @@ private Double computeBetweenPredicateSelectivity(RexCall call) { return inverseBool ? computeNotEqualitySelectivity(call) : computeFunctionSelectivity(call); } - MutableObject rangeBoundaries = - new MutableObject<>(new FloatInterval(leftValue, true, rightValue, true)); - MutableObject typeBoundaries = inverseBool ? new MutableObject<>( - new FloatInterval(Float.NEGATIVE_INFINITY, true, Float.POSITIVE_INFINITY, true)) : null; + // TODO: This case should never appear during planning; verify and consider removing test cases + if (leftValue > rightValue) { + // invalid range, return 0 for BETWEEN and 1 for NOT BETWEEN + return inverseBool ? 1.0 : 0.0; + } + + MutableObject> rangeBoundaries = new MutableObject<>(Range.closed(leftValue, rightValue)); + MutableObject> typeBoundaries = + inverseBool ? new MutableObject<>(Range.closed(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY)) : null; RexNode expr = operands.get(1); // expr to be checked by the BETWEEN if (isRemovableCast(expr, scan)) { @@ -705,9 +702,13 @@ public Double visitLiteral(RexLiteral literal) { * @param boundaries the boundaries * @return the selectivity of "val1 <= column < val2" */ - private static double rangedSelectivity(KllFloatsSketch kll, FloatInterval boundaries) { - FloatInterval closedOpen = boundaries.getRightHalfOpenInterval(); - return rangedSelectivity(kll, closedOpen.lower, closedOpen.upper); + private static double rangedSelectivity(KllFloatsSketch kll, Range boundaries) { + float newLower = BoundType.CLOSED.equals(boundaries.lowerBoundType()) ? boundaries.lowerEndpoint() + : Math.nextUp(boundaries.lowerEndpoint()); + float newUpper = BoundType.OPEN.equals(boundaries.upperBoundType()) ? 
boundaries.upperEndpoint() + : Math.nextUp(boundaries.upperEndpoint()); + Range closedOpen = Range.closedOpen(newLower, newUpper); + return rangedSelectivity(kll, closedOpen.lowerEndpoint(), closedOpen.upperEndpoint()); } /** From b93db3f6a81aacf3f7459168091bdcba0e1a1a4e Mon Sep 17 00:00:00 2001 From: Thomas Rebele Date: Tue, 24 Feb 2026 16:18:13 +0100 Subject: [PATCH 05/11] Avoid a MutableObject --- .../stats/TestFilterSelectivityEstimator.java | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java index e5e48c9f1b57..9e641dd42004 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java @@ -35,7 +35,6 @@ import org.apache.calcite.sql.type.SqlTypeName; import org.apache.calcite.tools.RelBuilder; import org.apache.calcite.util.ImmutableBitSet; -import org.apache.commons.lang3.mutable.MutableObject; import org.apache.datasketches.kll.KllFloatsSketch; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.StatisticsTestUtils; @@ -61,7 +60,6 @@ import java.time.LocalTime; import java.time.ZoneOffset; import java.util.Collections; -import java.util.Objects; import static org.apache.calcite.sql.type.SqlTypeName.BIGINT; import static org.apache.calcite.sql.type.SqlTypeName.DOUBLE; @@ -150,7 +148,7 @@ public class TestFilterSelectivityEstimator { private ColStatistics stats; private RelNode scan; private RexNode currentInputRef; - private final MutableObject currentValues = new MutableObject<>(); + private int currentValuesSize; @BeforeClass public static void beforeClass() { @@ -189,9 +187,9 @@ private static ColStatistics.Range rangeOf(float[] values) { @Before 
public void before() { - currentValues.setValue(VALUES); + currentValuesSize = VALUES.length; doReturn(tableType).when(tableMock).getRowType(); - when(tableMock.getRowCount()).thenAnswer(a -> (double) Objects.requireNonNull(currentValues.getValue()).length); + when(tableMock.getRowCount()).thenAnswer(a -> (double) currentValuesSize); RelBuilder relBuilder = HiveRelFactories.HIVE_BUILDER.create(relOptCluster, schemaMock); HiveTableScan tableScan = @@ -210,7 +208,7 @@ public void before() { * Note: call this method only at the beginning of a test method. */ private void useFieldWithValues(String fieldname, float[] values, KllFloatsSketch sketch) { - currentValues.setValue(values); + currentValuesSize = values.length; stats.setHistogram(sketch.toByteArray()); stats.setRange(rangeOf(values)); int fieldIndex = scan.getRowType().getFieldNames().indexOf(fieldname); @@ -860,10 +858,6 @@ private static RexLiteral literalTimestamp(String timestamp) { REX_BUILDER.getTypeFactory().createSqlType(SqlTypeName.TIMESTAMP)); } - private static RexLiteral literalDate(String date) { - return REX_BUILDER.makeLiteral(epochDay(date), REX_BUILDER.getTypeFactory().createSqlType(SqlTypeName.DATE)); - } - private RexNode literalFloat(float f) { return REX_BUILDER.makeLiteral(f, type(SqlTypeName.FLOAT)); } @@ -878,9 +872,4 @@ private static long timestampMillis(String timestamp) { private static long timestamp(String timestamp) { return timestampMillis(timestamp) / 1000; } - - private static int epochDay(String date) { - return (int) LocalDate.parse(date).toEpochDay(); - } - } From ccce3ba8f80e003bfffff73e40615ee0f3ccf711 Mon Sep 17 00:00:00 2001 From: Thomas Rebele Date: Tue, 24 Feb 2026 16:40:13 +0100 Subject: [PATCH 06/11] Fix tests --- .../stats/FilterSelectivityEstimator.java | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java index 266adc90b651..1c2e9968caad 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java @@ -296,9 +296,9 @@ private static void adjustTypeBoundaries(RelDataType type, MutableObject makeRange(float lower, BoundType lowerType, float upper, BoundType upperType) { + if (lower > upper) { + return Range.closedOpen(0f, 0f); + } + if (lower == upper && lowerType == BoundType.OPEN && upperType == BoundType.OPEN) { + return Range.closedOpen(0f, 0f); + } + + return Range.range(lower, lowerType, upper, upperType); + } + private double computeRangePredicateSelectivity(RexCall call, SqlKind op) { double defaultSelectivity = ((double) 1 / (double) 3); if (!(childRel instanceof HiveTableScan)) { @@ -422,13 +433,8 @@ private Double computeBetweenPredicateSelectivity(RexCall call) { return inverseBool ? computeNotEqualitySelectivity(call) : computeFunctionSelectivity(call); } - // TODO: This case should never appear during planning; verify and consider removing test cases - if (leftValue > rightValue) { - // invalid range, return 0 for BETWEEN and 1 for NOT BETWEEN - return inverseBool ? 1.0 : 0.0; - } - - MutableObject> rangeBoundaries = new MutableObject<>(Range.closed(leftValue, rightValue)); + MutableObject> rangeBoundaries = + new MutableObject<>(makeRange(leftValue, BoundType.CLOSED, rightValue, BoundType.CLOSED)); MutableObject> typeBoundaries = inverseBool ? 
new MutableObject<>(Range.closed(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY)) : null; From c4b2e5a1173c52ce0ea48c3d3319e9e057c5223d Mon Sep 17 00:00:00 2001 From: Thomas Rebele Date: Tue, 24 Feb 2026 17:20:55 +0100 Subject: [PATCH 07/11] Avoid mutating the arguments --- .../stats/FilterSelectivityEstimator.java | 100 ++++++++++-------- 1 file changed, 54 insertions(+), 46 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java index 1c2e9968caad..917fcff91ef4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java @@ -47,7 +47,6 @@ import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.type.SqlTypeName; import org.apache.calcite.util.ImmutableBitSet; -import org.apache.commons.lang3.mutable.MutableObject; import org.apache.datasketches.kll.KllFloatsSketch; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; @@ -258,54 +257,60 @@ private boolean isRemovableCast(RexNode exp, HiveTableScan tableScan) { } /** - * Adjust the type boundaries if necessary. + * Get the range of values that are rounded to valid values of a DECIMAL type. * - *

- * Special care is taken to support the cast to DECIMAL(precision, scale): - * The cast to DECIMAL rounds the value the same way as {@link RoundingMode#HALF_UP}. - * The boundaries are adjusted accordingly. - *

- * - * @param rangeBoundaries boundaries of the range predicate - * @param typeBoundaries if not null, will be set to the boundaries of the type range + * @param type the DECIMAL type + * @param lowerBound the lower bound type of the result + * @param upperBound the upper bound type of the result + * @return the range of the type */ - private static void adjustTypeBoundaries(RelDataType type, MutableObject> rangeBoundaries, - MutableObject> typeBoundaries) { - if (type.getSqlTypeName() != SqlTypeName.DECIMAL) { - return; - } + private static Range getRangeOfDecimalType(RelDataType type, BoundType lowerBound, BoundType upperBound) { // values outside the representable range are cast to NULL, so adapt the boundaries - int precision = type.getPrecision(); - int scale = type.getScale(); - int digits = precision - scale; + int digits = type.getPrecision() - type.getScale(); // the cast does some rounding, i.e., CAST(99.9499 AS DECIMAL(3,1)) = 99.9 // but CAST(99.95 AS DECIMAL(3,1)) = NULL - float adjust = (float) (5 * Math.pow(10, -(scale + 1))); + float adjust = (float) (5 * Math.pow(10, -(type.getScale() + 1))); // the range of values supported by the type is interval [-typeRangeExtent, typeRangeExtent] (both inclusive) // e.g., the typeRangeExt is 99.94999 for DECIMAL(3,1) float typeRangeExtent = Math.nextDown((float) (Math.pow(10, digits) - adjust)); - Range range = rangeBoundaries.getValue(); // the resulting value of +- adjust would be rounded up, so in some cases we need to use Math.nextDown - boolean lowerInclusive = BoundType.CLOSED.equals(range.lowerBoundType()); - boolean upperInclusive = BoundType.CLOSED.equals(range.upperBoundType()); - float adjusted1 = lowerInclusive ? range.lowerEndpoint() - adjust : Math.nextDown(range.lowerEndpoint() + adjust); - float adjusted2 = upperInclusive ? 
Math.nextDown(range.upperEndpoint() + adjust) : range.upperEndpoint() - adjust; - + boolean lowerInclusive = BoundType.CLOSED.equals(lowerBound); + boolean upperInclusive = BoundType.CLOSED.equals(upperBound); float lowerUniverse = lowerInclusive ? -typeRangeExtent : Math.nextDown(-typeRangeExtent); float upperUniverse = upperInclusive ? typeRangeExtent : Math.nextUp(typeRangeExtent); - float lower = Math.max(adjusted1, lowerUniverse); - float upper = Math.min(adjusted2, upperUniverse); - rangeBoundaries.setValue(makeRange(lower, range.lowerBoundType(), upper, range.upperBoundType())); - if (typeBoundaries != null) { - typeBoundaries.setValue(makeRange( - lowerUniverse, - range.lowerBoundType(), - upperUniverse, - range.upperBoundType())); - } + return makeRange(lowerUniverse, lowerBound, upperUniverse, upperBound); } + /** + * Adjust the type boundaries if necessary. + * + *

+ * Special care is taken to support the cast to DECIMAL(precision, scale): + * The cast to DECIMAL rounds the value the same way as {@link RoundingMode#HALF_UP}. + * The boundaries are adjusted accordingly. + *

+ * + * @param predicateRange boundaries of the range predicate + * @param type the DECIMAL type + * @param typeRange the boundaries of the type range + * @return the adjusted boundary + */ + private static Range adjustRangeToDecimalType(Range predicateRange, RelDataType type, + Range typeRange) { + float adjust = (float) (5 * Math.pow(10, -(type.getScale() + 1))); + // the resulting value of +- adjust would be rounded up, so in some cases we need to use Math.nextDown + boolean lowerInclusive = BoundType.CLOSED.equals(predicateRange.lowerBoundType()); + boolean upperInclusive = BoundType.CLOSED.equals(predicateRange.upperBoundType()); + float adjusted1 = lowerInclusive ? predicateRange.lowerEndpoint() - adjust + : Math.nextDown(predicateRange.lowerEndpoint() + adjust); + float adjusted2 = upperInclusive ? Math.nextDown(predicateRange.upperEndpoint() + adjust) + : predicateRange.upperEndpoint() - adjust; + float lower = Math.max(adjusted1, typeRange.lowerEndpoint()); + float upper = Math.min(adjusted2, typeRange.upperEndpoint()); + return makeRange(lower, predicateRange.lowerBoundType(), upper, predicateRange.upperBoundType()); + } + private static Range makeRange(float lower, BoundType lowerType, float upper, BoundType upperType) { if (lower > upper) { return Range.closedOpen(0f, 0f); @@ -350,15 +355,18 @@ private double computeRangePredicateSelectivity(RexCall call, SqlKind op) { BoundType[] inclusive = new BoundType[] { BoundType.CLOSED, BoundType.CLOSED }; boundaryValues[boundaryIdx] = value; inclusive[boundaryIdx] = openBound ? 
BoundType.OPEN : BoundType.CLOSED; - MutableObject> boundaries = - new MutableObject<>(Range.range(boundaryValues[0], inclusive[0], boundaryValues[1], inclusive[1])); + Range boundaries = Range.range(boundaryValues[0], inclusive[0], boundaryValues[1], inclusive[1]); // extract the column index from the other operator final HiveTableScan scan = (HiveTableScan) childRel; int inputRefOpIndex = 1 - literalOpIdx; RexNode node = operands.get(inputRefOpIndex); if (isRemovableCast(node, scan)) { - adjustTypeBoundaries(node.getType(), boundaries, null); + if (node.getType().getSqlTypeName() == SqlTypeName.DECIMAL) { + Range rangeOfDecimalType = + getRangeOfDecimalType(node.getType(), boundaries.lowerBoundType(), boundaries.upperBoundType()); + boundaries = adjustRangeToDecimalType(boundaries, node.getType(), rangeOfDecimalType); + } node = RexUtil.removeCast(node); } @@ -378,7 +386,7 @@ private double computeRangePredicateSelectivity(RexCall call, SqlKind op) { final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram())); // convert the condition to a range val1 <= x < val2 for rangedSelectivity(...) - double rawSelectivity = rangedSelectivity(kll, boundaries.getValue()); + double rawSelectivity = rangedSelectivity(kll, boundaries); return scaleSelectivityToNullableValues(kll, rawSelectivity, scan); } @@ -433,14 +441,14 @@ private Double computeBetweenPredicateSelectivity(RexCall call) { return inverseBool ? computeNotEqualitySelectivity(call) : computeFunctionSelectivity(call); } - MutableObject> rangeBoundaries = - new MutableObject<>(makeRange(leftValue, BoundType.CLOSED, rightValue, BoundType.CLOSED)); - MutableObject> typeBoundaries = - inverseBool ? new MutableObject<>(Range.closed(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY)) : null; + Range rangeBoundaries = makeRange(leftValue, BoundType.CLOSED, rightValue, BoundType.CLOSED); + Range typeBoundaries = inverseBool ? 
Range.closed(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY) : null; RexNode expr = operands.get(1); // expr to be checked by the BETWEEN if (isRemovableCast(expr, scan)) { - adjustTypeBoundaries(expr.getType(), rangeBoundaries, typeBoundaries); + typeBoundaries = + getRangeOfDecimalType(expr.getType(), rangeBoundaries.lowerBoundType(), rangeBoundaries.upperBoundType()); + rangeBoundaries = adjustRangeToDecimalType(rangeBoundaries, expr.getType(), typeBoundaries); expr = RexUtil.removeCast(expr); } @@ -457,11 +465,11 @@ private Double computeBetweenPredicateSelectivity(RexCall call) { if (!colStats.isEmpty() && isHistogramAvailable(colStats.get(0))) { // convert the condition to a range val1 <= x < val2 for rangedSelectivity(...) final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram())); - double rawSelectivity = rangedSelectivity(kll, rangeBoundaries.getValue()); + double rawSelectivity = rangedSelectivity(kll, rangeBoundaries); if (inverseBool) { // when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted // if there's a cast, the inversion is with respect to its codomain (range of the values of the cast) - double typeRangeSelectivity = rangedSelectivity(kll, typeBoundaries.getValue()); + double typeRangeSelectivity = rangedSelectivity(kll, typeBoundaries); rawSelectivity = typeRangeSelectivity - rawSelectivity; } return scaleSelectivityToNullableValues(kll, rawSelectivity, scan); From b311f227ce0bcaac2e2e0a9bdc3477c59ae49a77 Mon Sep 17 00:00:00 2001 From: Thomas Rebele Date: Tue, 24 Feb 2026 17:54:28 +0100 Subject: [PATCH 08/11] Implement review comments --- .../stats/FilterSelectivityEstimator.java | 2 +- .../stats/TestFilterSelectivityEstimator.java | 93 ++++++++----------- 2 files changed, 38 insertions(+), 57 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java index 917fcff91ef4..949990db8ab9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java @@ -332,6 +332,7 @@ private double computeRangePredicateSelectivity(RexCall call, SqlKind op) { List operands = call.getOperands(); final Optional leftLiteral = extractLiteral(operands.get(0)); final Optional rightLiteral = extractLiteral(operands.get(1)); + // ensure that there's exactly one literal if ((leftLiteral.isPresent()) == (rightLiteral.isPresent())) { return defaultSelectivity; } @@ -520,7 +521,6 @@ private Optional extractLiteral(SqlTypeName typeName, Object boundValueOb value = ((GregorianCalendar) boundValueObject).toInstant().getEpochSecond(); break; default: - LOG.warn("Unsupported type for comparator selectivity evaluation using histogram: {}", typeName); return Optional.empty(); } return Optional.of(value); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java index 9e641dd42004..3c6b9098a0cf 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java @@ -29,8 +29,7 @@ import org.apache.calcite.rex.RexCall; import org.apache.calcite.rex.RexLiteral; import org.apache.calcite.rex.RexNode; -import org.apache.calcite.sql.SqlBinaryOperator; -import org.apache.calcite.sql.SqlOperator; +import org.apache.calcite.rex.RexUtil; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.sql.type.SqlTypeName; import org.apache.calcite.tools.RelBuilder; @@ -82,12 +81,6 @@ 
@RunWith(MockitoJUnitRunner.class) public class TestFilterSelectivityEstimator { - private static final SqlBinaryOperator GT = SqlStdOperatorTable.GREATER_THAN; - private static final SqlBinaryOperator GE = SqlStdOperatorTable.GREATER_THAN_OR_EQUAL; - private static final SqlBinaryOperator LT = SqlStdOperatorTable.LESS_THAN; - private static final SqlBinaryOperator LE = SqlStdOperatorTable.LESS_THAN_OR_EQUAL; - private static final SqlOperator BETWEEN = HiveBetween.INSTANCE; - private static final float[] VALUES = { 1, 2, 2, 2, 2, 2, 2, 2, 3, 4, 5, 6, 7 }; private static final float[] VALUES2 = { // rounding for DECIMAL(3,1) @@ -599,7 +592,7 @@ public void testComputeRangePredicateSelectivityNotBetweenWithNULLS() { } @Test - public void testComputeRangePredicateSelectivityWithCast() { + public void testRangePredicateWithCast() { useFieldWithValues("f_numeric", VALUES, KLL); checkSelectivity(3 / 13.f, ge(cast("f_numeric", TINYINT), int5)); checkSelectivity(10 / 13.f, lt(cast("f_numeric", TINYINT), int5)); @@ -620,7 +613,7 @@ public void testComputeRangePredicateSelectivityWithCast() { } @Test - public void testComputeRangePredicateSelectivityWithCast2() { + public void testRangePredicateWithCast2() { useFieldWithValues("f_numeric", VALUES2, KLL2); RelDataType decimal3s1 = decimalType(3, 1); checkSelectivity(4 / 28.f, ge(cast("f_numeric", decimal3s1), literalFloat(1))); @@ -695,14 +688,26 @@ private void checkTimeFieldOnIntraDayTimestamps(RexNode field) { } @Test - public void testComputeRangePredicateSelectivityTimestamp() { + public void testRangePredicateOnTimestamp() { useFieldWithValues("f_timestamp", VALUES_TIME, KLL_TIME); checkTimeFieldOnMidnightTimestamps(currentInputRef); checkTimeFieldOnIntraDayTimestamps(currentInputRef); } @Test - public void testComputeRangePredicateSelectivityDate() { + public void testRangePredicateOnTimestampWithCast() { + useFieldWithValues("f_timestamp", VALUES_TIME, KLL_TIME); + RexNode expr1 = cast("f_timestamp", 
SqlTypeName.DATE); + checkTimeFieldOnMidnightTimestamps(expr1); + checkTimeFieldOnIntraDayTimestamps(expr1); + + RexNode expr2 = cast("f_timestamp", SqlTypeName.TIMESTAMP); + checkTimeFieldOnMidnightTimestamps(expr2); + checkTimeFieldOnIntraDayTimestamps(expr2); + } + + @Test + public void testRangePredicateOnDate() { useFieldWithValues("f_date", VALUES_TIME, KLL_TIME); checkTimeFieldOnMidnightTimestamps(currentInputRef); @@ -711,26 +716,17 @@ public void testComputeRangePredicateSelectivityDate() { } @Test - public void testComputeRangePredicateSelectivityDateWithCast() { + public void testRangePredicateOnDateWithCast() { useFieldWithValues("f_date", VALUES_TIME, KLL_TIME); - RexNode field1 = cast("f_date", SqlTypeName.DATE); - checkTimeFieldOnMidnightTimestamps(field1); - checkTimeFieldOnIntraDayTimestamps(field1); + checkTimeFieldOnMidnightTimestamps(cast("f_date", SqlTypeName.DATE)); + checkTimeFieldOnMidnightTimestamps(cast("f_date", SqlTypeName.TIMESTAMP)); - RexNode field2 = cast("f_date", SqlTypeName.TIMESTAMP); - checkTimeFieldOnMidnightTimestamps(field2); - checkTimeFieldOnIntraDayTimestamps(field2); - } - - @Test - public void testComputeRangePredicateSelectivityTimestampWithCast() { - useFieldWithValues("f_timestamp", VALUES_TIME, KLL_TIME); - checkTimeFieldOnMidnightTimestamps(cast("f_timestamp", SqlTypeName.DATE)); - checkTimeFieldOnMidnightTimestamps(cast("f_timestamp", SqlTypeName.TIMESTAMP)); + // it does not make sense to compare with "2020-11-05T11:23:45Z", + // as that value would not be stored as-is in a date column, but as "2020-11-05" instead } @Test - public void testComputeRangePredicateSelectivityBetweenWithCastDecimal2_1() { + public void testBetweenWithCastDecimal2s1() { useFieldWithValues("f_numeric", VALUES2, KLL2); float total = VALUES2.length; float universe = 2; // the number of values that "survive" the cast @@ -741,10 +737,10 @@ public void testComputeRangePredicateSelectivityBetweenWithCastDecimal2_1() { } @Test - public void 
testComputeRangePredicateSelectivityBetweenWithCastDecimal3_1() { + public void testBetweenWithCastDecimal3s1() { useFieldWithValues("f_numeric", VALUES2, KLL2); float total = VALUES2.length; - float universe = 7; + float universe = 7; // the number of values that "survive" the cast RexNode cast = REX_BUILDER.makeCast(decimalType(3, 1), inputRef0); checkBetweenSelectivity(0, universe, total, cast, 100f, 1000f); checkBetweenSelectivity(4, universe, total, cast, 1f, 100f); @@ -752,10 +748,10 @@ public void testComputeRangePredicateSelectivityBetweenWithCastDecimal3_1() { } @Test - public void testComputeRangePredicateSelectivityBetweenWithCastDecimal4_1() { + public void testBetweenWithCastDecimal4s1() { useFieldWithValues("f_numeric", VALUES2, KLL2); float total = VALUES2.length; - float universe = 23; + float universe = 23; // the number of values that "survive" the cast RexNode cast = REX_BUILDER.makeCast(decimalType(4, 1), inputRef0); // the values between -999.94999... and 999.94999... 
(both inclusive) pass through the cast // the values between 99.95 and 100 are rounded up to 100, so they fulfill the BETWEEN @@ -765,10 +761,10 @@ public void testComputeRangePredicateSelectivityBetweenWithCastDecimal4_1() { } @Test - public void testComputeRangePredicateSelectivityBetweenWithCastDecimal7_1() { + public void testBetweenWithCastDecimal7s1() { useFieldWithValues("f_numeric", VALUES2, KLL2); float total = VALUES2.length; - float universe = 26; + float universe = 26; // the number of values that "survive" the cast RexNode cast = REX_BUILDER.makeCast(decimalType(7, 1), inputRef0); checkBetweenSelectivity(14, universe, total, cast, 100, 1000); checkBetweenSelectivity(14, universe, total, cast, 1f, 100f); @@ -779,26 +775,11 @@ private void checkSelectivity(float expectedSelectivity, RexNode filter) { FilterSelectivityEstimator estimator = new FilterSelectivityEstimator(scan, mq); Assert.assertEquals(filter.toString(), expectedSelectivity, estimator.estimateSelectivity(filter), DELTA); - // swap equation, e.g., col < 5 becomes 5 > col; selectivity stays the same - RexCall call = (RexCall) filter; - SqlOperator operator = ((RexCall) filter).getOperator(); - SqlOperator swappedOp; - if (operator == LE) { - swappedOp = GE; - } else if (operator == LT) { - swappedOp = GT; - } else if (operator == GE) { - swappedOp = LE; - } else if (operator == GT) { - swappedOp = LT; - } else if (operator == BETWEEN) { - // BETWEEN cannot be swapped - return; - } else { - throw new UnsupportedOperationException(); + // convert "col OP value" to "value INVERSE_OP col", and check it + RexNode inverted = RexUtil.invert(REX_BUILDER, (RexCall) filter); + if (inverted != null) { + Assert.assertEquals(filter.toString(), expectedSelectivity, estimator.estimateSelectivity(inverted), DELTA); } - RexNode swapped = REX_BUILDER.makeCall(swappedOp, call.getOperands().get(1), call.getOperands().get(0)); - Assert.assertEquals(filter.toString(), expectedSelectivity, 
estimator.estimateSelectivity(swapped), DELTA); } private void checkBetweenSelectivity(float expectedEntries, float universe, float total, RexNode value, float lower, @@ -830,19 +811,19 @@ private RexNode cast(String fieldname, RelDataType type) { } private RexNode ge(RexNode expr, RexNode value) { - return REX_BUILDER.makeCall(GE, expr, value); + return REX_BUILDER.makeCall(SqlStdOperatorTable.GREATER_THAN_OR_EQUAL, expr, value); } private RexNode gt(RexNode expr, RexNode value) { - return REX_BUILDER.makeCall(GT, expr, value); + return REX_BUILDER.makeCall(SqlStdOperatorTable.GREATER_THAN, expr, value); } private RexNode le(RexNode expr, RexNode value) { - return REX_BUILDER.makeCall(LE, expr, value); + return REX_BUILDER.makeCall(SqlStdOperatorTable.LESS_THAN_OR_EQUAL, expr, value); } private RexNode lt(RexNode expr, RexNode value) { - return REX_BUILDER.makeCall(LT, expr, value); + return REX_BUILDER.makeCall(SqlStdOperatorTable.LESS_THAN, expr, value); } private static RelDataType type(SqlTypeName typeName) { From 1c6cccf71143ae0122b257ad5b8f9b94af5c71c3 Mon Sep 17 00:00:00 2001 From: Thomas Rebele Date: Wed, 25 Feb 2026 00:10:54 +0100 Subject: [PATCH 09/11] Comments --- .../calcite/stats/FilterSelectivityEstimator.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java index 949990db8ab9..ec69f43b33ec 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java @@ -308,9 +308,14 @@ private static Range adjustRangeToDecimalType(Range predicateRange : predicateRange.upperEndpoint() - adjust; float lower = Math.max(adjusted1, typeRange.lowerEndpoint()); float upper = Math.min(adjusted2, 
typeRange.upperEndpoint()); + // the boundaries might result in an invalid range (e.g., left > right) + // in that case the predicate does not select anything, and we return an empty range return makeRange(lower, predicateRange.lowerBoundType(), upper, predicateRange.upperBoundType()); } + /** + * If the arguments lead to a valid range, it is returned, otherwise an empty array is returned. + */ private static Range makeRange(float lower, BoundType lowerType, float upper, BoundType upperType) { if (lower > upper) { return Range.closedOpen(0f, 0f); @@ -386,7 +391,6 @@ private double computeRangePredicateSelectivity(RexCall call, SqlKind op) { } final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram())); - // convert the condition to a range val1 <= x < val2 for rangedSelectivity(...) double rawSelectivity = rangedSelectivity(kll, boundaries); return scaleSelectivityToNullableValues(kll, rawSelectivity, scan); } @@ -464,7 +468,6 @@ private Double computeBetweenPredicateSelectivity(RexCall call) { final List colStats = scan.getColStat(Collections.singletonList(inputRefIndex)); if (!colStats.isEmpty() && isHistogramAvailable(colStats.get(0))) { - // convert the condition to a range val1 <= x < val2 for rangedSelectivity(...) final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram())); double rawSelectivity = rangedSelectivity(kll, rangeBoundaries); if (inverseBool) { @@ -717,6 +720,7 @@ public Double visitLiteral(RexLiteral literal) { * @return the selectivity of "val1 <= column < val2" */ private static double rangedSelectivity(KllFloatsSketch kll, Range boundaries) { + // convert the condition to a range val1 <= x < val2 float newLower = BoundType.CLOSED.equals(boundaries.lowerBoundType()) ? boundaries.lowerEndpoint() : Math.nextUp(boundaries.lowerEndpoint()); float newUpper = BoundType.OPEN.equals(boundaries.upperBoundType()) ? 
boundaries.upperEndpoint() From fbd71162e132c31d83c63dafbda42984e9f640c8 Mon Sep 17 00:00:00 2001 From: Thomas Rebele Date: Wed, 25 Feb 2026 10:20:49 +0100 Subject: [PATCH 10/11] Compare boundaries directly --- .../stats/FilterSelectivityEstimator.java | 4 +-- .../hadoop/hive/ql/stats/StatsUtils.java | 26 ++++++------------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java index ec69f43b33ec..6c0d7ef7606c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java @@ -252,8 +252,8 @@ private boolean isRemovableCast(RexNode exp, HiveTableScan tableScan) { // unknown type, do not remove the cast return false; } - ColStatistics.Range typeRange = new ColStatistics.Range(min, max); - return StatsUtils.isWithin(colRange, typeRange, Number::doubleValue); + // are all values of the input column accepted by the cast? 
+ return min < colRange.minValue.doubleValue() && colRange.maxValue.doubleValue() < max; } /** diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index ea29206cb963..c530633fbf1c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -37,7 +37,6 @@ import java.util.concurrent.Future; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Function; import java.util.stream.Collectors; import org.apache.commons.lang3.math.NumberUtils; @@ -510,7 +509,7 @@ public static boolean inferForeignKey(ColStatistics csPK, ColStatistics csFK) { if (csPK.getRange() != null && csFK.getRange() != null) { ColStatistics.Range pkRange = csPK.getRange(); ColStatistics.Range fkRange = csFK.getRange(); - return isWithin(fkRange, pkRange, Number::longValue); + return isWithin(fkRange, pkRange); } } } @@ -547,22 +546,13 @@ public static long getRangeDelta(ColStatistics.Range range) { return 0; } - /** - * Returns whether range r1 is fully contained within r2. The comparison is done by applying - * the converter function to the min and max values of both ranges. 
- * @param r1 the first range - * @param r2 the second range - * @param converter the converter function to apply to the min and max values of the ranges - * @return true if r1 is fully contained within r2, false otherwise - */ - public static > boolean isWithin(ColStatistics.Range r1, ColStatistics.Range r2, - Function converter) { - if (r1.minValue != null && r2.minValue != null && r1.maxValue != null && r2.maxValue != null) { - T r1Min = converter.apply(r1.minValue); - T r1Max = converter.apply(r1.maxValue); - T r2Min = converter.apply(r2.minValue); - T r2Max = converter.apply(r2.maxValue); - return r1Min.compareTo(r2Min) >= 0 && r1Max.compareTo(r2Max) <= 0; + private static boolean isWithin(ColStatistics.Range range1, ColStatistics.Range range2) { + if (range1.minValue != null && range2.minValue != null && range1.maxValue != null && + range2.maxValue != null) { + if (range1.minValue.longValue() >= range2.minValue.longValue() && + range1.maxValue.longValue() <= range2.maxValue.longValue()) { + return true; + } } return false; } From fc88104427db3d7a3f93eef2e5a968699b39609e Mon Sep 17 00:00:00 2001 From: Thomas Rebele Date: Wed, 25 Feb 2026 13:32:15 +0100 Subject: [PATCH 11/11] Fix test and Sonar Qube warnings --- .../optimizer/calcite/stats/FilterSelectivityEstimator.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java index 6c0d7ef7606c..257a1e1747ec 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java @@ -58,7 +58,6 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; import org.apache.hadoop.hive.ql.plan.ColStatistics; import 
org.apache.hadoop.hive.ql.session.SessionState; -import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -223,7 +222,7 @@ private boolean isRemovableCast(RexNode exp, HiveTableScan tableScan) { // otherwise the CAST introduces some modulo-like behavior ColStatistics colStat = colStats.getFirst(); ColStatistics.Range colRange = colStat.getRange(); - if (colRange == null) { + if (colRange == null || colRange.minValue == null || colRange.maxValue == null) { return false; } @@ -311,7 +310,7 @@ private static Range adjustRangeToDecimalType(Range predicateRange // the boundaries might result in an invalid range (e.g., left > right) // in that case the predicate does not select anything, and we return an empty range return makeRange(lower, predicateRange.lowerBoundType(), upper, predicateRange.upperBoundType()); - } + } /** * If the arguments lead to a valid range, it is returned, otherwise an empty array is returned.