diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java index ea192e8af9aa..b1a93ffcaaf1 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java @@ -187,7 +187,8 @@ public static Object processReduceSinkToHashJoin(ReduceSinkOperator parentRS, Ma ExprNodeDesc realCol = parentRS.getColumnExprMap().get(prefix + "." + keyCol); ColStatistics cs = StatsUtils.getColStatisticsFromExpression(context.conf, stats, realCol); - if (cs == null || cs.getCountDistint() <= 0) { + if (cs == null || cs.getCountDistint() < 0) { + // unknown: same fallback as old "no stats / overloaded NDV=0" path maxKeyCount = Long.MAX_VALUE; break; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetHashGroupByMinReduction.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetHashGroupByMinReduction.java index bbd474b842f8..06a1a5ba8849 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetHashGroupByMinReduction.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SetHashGroupByMinReduction.java @@ -69,8 +69,8 @@ public Object process(Node nd, Stack stack, Statistics parentStats = groupByOperator.getParentOperators().get(0).getStatistics(); long ndvProduct = StatsUtils.computeNDVGroupingColumns( colStats, parentStats, true); - // if ndvProduct is 0 then column stats state must be partial and we are missing - if (ndvProduct == 0) { + if (ndvProduct < 0) { + // unknown product - same fallback as old "overloaded NDV=0" path return null; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java index f5431fa34934..aaa851b7942b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java @@ -90,6 +90,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; @@ -924,7 +925,8 @@ private long computeMaxWriters() { * Computes the partition cardinality based on column NDV statistics. * @return positive value = estimated cardinality, 0 = no partition columns, -1 = stats unavailable */ - private long computePartCardinality(List partitionPos, + @VisibleForTesting + long computePartCardinality(List partitionPos, List, ExprNodeDesc>> customPartitionExprs, Statistics tStats, Operator fsParent, ArrayList allRSCols) { @@ -935,7 +937,8 @@ private long computePartCardinality(List partitionPos, for (Integer idx : partitionPos) { ColumnInfo ci = fsParent.getSchema().getSignature().get(idx); ColStatistics partStats = tStats.getColumnStatisticsFromColName(ci.getInternalName()); - if (partStats == null) { + // countDistinct < 0 means "unknown" - same path as missing stats + if (partStats == null || partStats.getCountDistint() < 0) { return -1; } partCardinality *= partStats.getCountDistint(); @@ -950,7 +953,8 @@ private long computePartCardinality(List partitionPos, // implementations on UDFs (e.g. iceberg_bucket reports min(inputNDV, numBuckets)) ColStatistics exprStats = StatsUtils.getColStatisticsFromExpression( this.parseCtx.getConf(), tStats, resolved); - if (exprStats == null) { + // countDistinct < 0 means "unknown" - same path as missing stats + if (exprStats == null || exprStats.getCountDistint() < 0) { return -1; } partCardinality *= exprStats.getCountDistint(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistinctRowCount.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistinctRowCount.java index b0f40d0d815e..eeb599e848dd 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistinctRowCount.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistinctRowCount.java @@ -22,6 +22,7 @@ import org.apache.calcite.rel.convert.Converter; import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rel.core.Spool; +import com.google.common.annotations.VisibleForTesting; import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider; import org.apache.calcite.rel.metadata.RelMdDistinctRowCount; import org.apache.calcite.rel.metadata.RelMdUtil; @@ -50,7 +51,8 @@ public class HiveRelMdDistinctRowCount extends RelMdDistinctRowCount { ReflectiveRelMetadataProvider.reflectiveSource( BuiltInMethod.DISTINCT_ROW_COUNT.method, new HiveRelMdDistinctRowCount()); - private HiveRelMdDistinctRowCount() { + @VisibleForTesting + HiveRelMdDistinctRowCount() { } public Double getDistinctRowCount(HiveTableScan htRel, RelMetadataQuery mq, ImmutableBitSet groupKey, @@ -60,6 +62,9 @@ public Double getDistinctRowCount(HiveTableScan htRel, RelMetadataQuery mq, Immu List colStats = htRel.getColStat(projIndxLst); Double noDistinctRows = 1.0; for (ColStatistics cStat : colStats) { + if (cStat.getCountDistint() <= 0) { + return 0.0; + } noDistinctRows *= cStat.getCountDistint(); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index abfe6170217e..28bc2623a6a6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -557,18 +557,25 @@ private long evaluateInExpr(Statistics stats, ExprNodeDesc pred, long currNumRow factor *= children.size() - 1; } for (int i = 0; i < columnStats.size(); i++) { - long dvs = columnStats.get(i) == null ? 0 : columnStats.get(i).getCountDistint(); + ColStatistics cs = columnStats.get(i); + long dvs = cs == null ? -1L : cs.getCountDistint(); if (dvs == 0) { - factor *= 0.5; - continue; + // verified zero distinct values: IN cannot match any row + factor = 0; + break; } - // (num of distinct vals for col in IN clause / num of distinct vals for col ) - double columnFactor = 1.0 / dvs; - if (!multiColumn) { - columnFactor *= estimateIntersectionSize(aspCtx.getConf(), columnStats.get(i), values.get(i)); + if (dvs < 0) { + // missing stats or unknown NDV + factor *= 0.5; + } else { + // (num of distinct vals for col in IN clause / num of distinct vals for col ) + double columnFactor = 1.0 / dvs; + if (!multiColumn) { + columnFactor *= estimateIntersectionSize(aspCtx.getConf(), columnStats.get(i), values.get(i)); + } + // max can be 1, even when ndv is larger in IN clause than in column stats + factor *= Math.min(columnFactor, 1.0); } - // max can be 1, even when ndv is larger in IN clause than in column stats - factor *= Math.min(columnFactor, 1.0); } // Clamp at 1 to be sure that we don't get out of range. @@ -1317,9 +1324,7 @@ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child, ColStatistics cs = stats.getColumnStatisticsFromColName(colName); if (cs != null) { - long dvs = cs.getCountDistint(); - numRows = dvs == 0 ? numRows / 2 : Math.round((double) numRows / dvs); - return numRows; + return rowsAfterEqualityFilter(numRows, cs.getCountDistint()); } } else if (leaf instanceof ExprNodeColumnDesc) { ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) leaf; @@ -1338,9 +1343,7 @@ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child, ColStatistics cs = stats.getColumnStatisticsFromColName(colName); if (cs != null) { - long dvs = cs.getCountDistint(); - numRows = dvs == 0 ? numRows / 2 : Math.round((double) numRows / dvs); - return numRows; + return rowsAfterEqualityFilter(numRows, cs.getCountDistint()); } } } @@ -1380,6 +1383,16 @@ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child, return numRows / 2; } + private static long rowsAfterEqualityFilter(long numRows, long dvs) { + if (dvs < 0) { + return numRows / 2; + } + if (dvs == 0) { + return 0; + } + return Math.round((double) numRows / dvs); + } + } /** @@ -1518,14 +1531,12 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, // compute product of distinct values of grouping columns long ndvProduct = StatsUtils.computeNDVGroupingColumns(colStats, parentStats, false); - // if ndvProduct is 0 then column stats state must be partial and we are missing - // column stats for a group by column - if (ndvProduct == 0) { + if (ndvProduct < 0) { + // unknown - missing column stats or unknown NDV on a grouping column ndvProduct = parentNumRows / 2; if (LOG.isDebugEnabled()) { - LOG.debug("STATS-" + gop.toString() + ": ndvProduct became 0 as some column does not" + - " have stats. ndvProduct changed to: " + ndvProduct); + LOG.debug("STATS-{}: ndvProduct unknown; falling back to {}", gop, ndvProduct); } } final long maxColumnNDV = colStats.stream() @@ -1720,6 +1731,10 @@ static void computeAggregateColumnMinMax(ColStatistics cs, HiveConf conf, Aggreg long valuesCount = agg.getDistinct() ? parentCS.getCountDistint() : parentStats.getNumRows() - numNulls; + // countDistinct < 0 would produce a Range with a negative maxValue + if (agg.getDistinct() && valuesCount < 0) { + return; + } Range range = parentCS.getRange(); // Get the aggregate function matching the name in the query. GenericUDAFResolver udaf = @@ -1819,9 +1834,24 @@ private boolean checkMapSideAggregation(GroupByOperator gop, // estimate size of key from column statistics long avgKeySize = 0; + // lazily computed on first unknown NDV (null = not yet looked up) + Long parentNumRows = null; for (ColStatistics cs : colStats) { if (cs != null) { - numEstimatedRows = StatsUtils.safeMult(numEstimatedRows, cs.getCountDistint()); + long ndv = cs.getCountDistint(); + if (ndv < 0) { + if (parentNumRows == null) { + // unknown NDV: fall back to parentNumRows / 2, matching the heuristic + // used elsewhere in this file when GROUP BY cardinality cannot be computed + Statistics parentStats = gop.getParentOperators().get(0).getStatistics(); + parentNumRows = (parentStats != null) ? parentStats.getNumRows() : -1L; + } + if (parentNumRows <= 0) { + return false; + } + ndv = parentNumRows / 2; + } + numEstimatedRows = StatsUtils.safeMult(numEstimatedRows, ndv); avgKeySize += Math.ceil(cs.getAvgColLen()); } } @@ -2227,7 +2257,8 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, return null; } - private long calculateUnmatchedRowsForOuter(HiveConf conf, long inputRowCount, + @VisibleForTesting + long calculateUnmatchedRowsForOuter(HiveConf conf, long inputRowCount, List joinKeys, Statistics statistics, long distinctUnmatched) { // Extract the ndv from each of the columns involved in the join List distinctVals = new ArrayList<>(); @@ -2248,14 +2279,15 @@ private long calculateUnmatchedRowsForOuter(HiveConf conf, long inputRowCount, distinctVal = StatsUtils.addWithExpDecay(distinctVals); } } - // If we have a greater number of unmatched values than number of distinct values, - // we just return the number of rows in the input as we can assume there are no - // matches - if (distinctUnmatched >= distinctVal) { + // distinctVal <= 0 covers unknown (<0) and verified-zero (==0) cases; the latter means + // no key value matches anything, so every input row is unmatched in an outer join. + // distinctUnmatched < 0 (unknown) is treated conservatively the same way. + // If unmatched >= distinctVal, all rows can be assumed unmatched. + if (distinctVal <= 0 || distinctUnmatched < 0 || distinctUnmatched >= distinctVal) { return inputRowCount; } // Otherwise, divide the number of input rows by the number of distinct values - // and divide by the number of distinct values unmatched + // and multiply by the number of distinct values unmatched return StatsUtils.safeMult(inputRowCount / distinctVal, distinctUnmatched); } @@ -2604,7 +2636,8 @@ void updateNumNulls(ColStatistics colStats, long leftUnmatchedRows, long rightUn colStats.setNumNulls(newNumNulls); } - private void updateColStats(HiveConf conf, Statistics stats, long leftUnmatchedRows, long rightUnmatchedRows, + @VisibleForTesting + void updateColStats(HiveConf conf, Statistics stats, long leftUnmatchedRows, long rightUnmatchedRows, long newNumRows, CommonJoinOperator jop, Map rowCountParents) { if (newNumRows < 0) { @@ -2632,26 +2665,29 @@ private void updateColStats(HiveConf conf, Statistics stats, long leftUnmatchedR int pos = jop.getConf().getReversedExprs().get(cs.getColumnName()); long oldDV = cs.getCountDistint(); - boolean useCalciteForNdvReadjustment - = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_JOIN_NDV_READJUSTMENT); - long newDV = oldDV; - if (useCalciteForNdvReadjustment) { - Double approxNdv = RelMdUtil.numDistinctVals(oldDV * 1.0, newNumRows * 1.0); - Preconditions.checkNotNull(approxNdv, "approximate NDV is null"); - newDV = approxNdv.longValue(); - } else { - long oldRowCount = rowCountParents.get(pos); - double ratio = (double) newNumRows / (double) oldRowCount; - - // if ratio is greater than 1, then number of rows increases. This can happen - // when some operators like GROUPBY duplicates the input rows in which case - // number of distincts should not change. Update the distinct count only when - // the output number of rows is less than input number of rows. - if (ratio <= 1.0) { - newDV = (long) Math.ceil(ratio * oldDV); + // countDistinct < 0 means "unknown" + if (oldDV >= 0) { + boolean useCalciteForNdvReadjustment + = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_JOIN_NDV_READJUSTMENT); + long newDV = oldDV; + if (useCalciteForNdvReadjustment) { + Double approxNdv = RelMdUtil.numDistinctVals(oldDV * 1.0, newNumRows * 1.0); + Preconditions.checkNotNull(approxNdv, "approximate NDV is null"); + newDV = approxNdv.longValue(); + } else { + long oldRowCount = rowCountParents.get(pos); + double ratio = (double) newNumRows / (double) oldRowCount; + + // if ratio is greater than 1, then number of rows increases. This can happen + // when some operators like GROUPBY duplicates the input rows in which case + // number of distincts should not change. Update the distinct count only when + // the output number of rows is less than input number of rows. + if (ratio < 1.0) { + newDV = (long) Math.ceil(ratio * oldDV); + } } + cs.setCountDistint(newDV); } - cs.setCountDistint(newDV); updateNumNulls(cs, leftUnmatchedRows, rightUnmatchedRows, newNumRows, pos, jop); } stats.setColumnStats(colStats); @@ -2718,7 +2754,8 @@ private long computeFinalRowCount(List rowCountParents, long interimRowCou return result; } - private long computeRowCountAssumingInnerJoin(List rowCountParents, long denom, + @VisibleForTesting + long computeRowCountAssumingInnerJoin(List rowCountParents, long denom, CommonJoinOperator join) { double factor = 0.0d; long result = 1; @@ -2734,7 +2771,12 @@ private long computeRowCountAssumingInnerJoin(List rowCountParents, long d } } - denom = denom == 0 ? 1 : denom; + // denom < 0 (unknown) and denom == 0 (verified-zero join key, cardinality formula + // degenerate) both fall back to "no constraint" rather than producing a negative + // factor or div-by-zero + if (denom <= 0) { + denom = 1; + } factor = (double) max / (double) denom; for (int i = 0; i < rowCountParents.size(); i++) { @@ -2786,6 +2828,9 @@ private long getDenominatorForUnmatchedRows(List distinctVals) { if (distinctVals.isEmpty()) { return 2; } + if (StatsUtils.containsUnknownNDV(distinctVals)) { + return -1L; + } // simple join from 2 relations: denom = min(v1, v2) if (distinctVals.size() <= 2) { @@ -2826,6 +2871,9 @@ private long getDenominator(List distinctVals) { // denominator is 2. return 2; } + if (StatsUtils.containsUnknownNDV(distinctVals)) { + return -1L; + } // simple join from 2 relations: denom = max(v1, v2) if (distinctVals.size() <= 2) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java index d672b7acfc22..ab048a94dc29 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java @@ -260,7 +260,11 @@ public void addToColumnStats(List colStats) { } else { existing.setNumNulls(StatsUtils.safeAdd(existing.getNumNulls(), cs.getNumNulls())); } - existing.setCountDistint(Math.max(existing.getCountDistint(), cs.getCountDistint())); + if (cs.getCountDistint() < 0 || existing.getCountDistint() < 0) { + existing.setCountDistint(-1); + } else { + existing.setCountDistint(Math.max(existing.getCountDistint(), cs.getCountDistint())); + } } } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 55f9d0c1e158..cba53f5df4ee 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -813,7 +813,7 @@ public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String col } else if (colTypeLowerCase.equals(serdeConstants.STRING_TYPE_NAME) || colTypeLowerCase.startsWith(serdeConstants.CHAR_TYPE_NAME) || colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) { - cs.setCountDistint(csd.getStringStats().getNumDVs()); + cs.setCountDistint(csd.getStringStats().isSetNumDVs() ? csd.getStringStats().getNumDVs() : -1); cs.setNumNulls(csd.getStringStats().getNumNulls()); cs.setAvgColLen(csd.getStringStats().getAvgColLen()); cs.setBitVectors(csd.getStringStats().getBitVectors()); @@ -837,9 +837,12 @@ public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String col } else if (colTypeLowerCase.equals(serdeConstants.BINARY_TYPE_NAME)) { cs.setAvgColLen(csd.getBinaryStats().getAvgColLen()); cs.setNumNulls(csd.getBinaryStats().getNumNulls()); + // BinaryColumnStatsData has no numDVs field - the metastore does not track NDV + // for binary columns, so it is genuinely unknown + cs.setCountDistint(-1); } else if (colTypeLowerCase.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) { cs.setAvgColLen(JavaDataModel.get().lengthOfTimestamp()); - cs.setCountDistint(csd.getTimestampStats().getNumDVs()); + cs.setCountDistint(csd.getTimestampStats().isSetNumDVs() ? csd.getTimestampStats().getNumDVs() : -1); cs.setNumNulls(csd.getTimestampStats().getNumNulls()); Long lowVal = (csd.getTimestampStats().getLowValue() != null) ? csd.getTimestampStats().getLowValue() .getSecondsSinceEpoch() : null; @@ -852,7 +855,7 @@ public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String col cs.setAvgColLen(JavaDataModel.get().lengthOfTimestamp()); } else if (colTypeLowerCase.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) { cs.setAvgColLen(JavaDataModel.get().lengthOfDecimal()); - cs.setCountDistint(csd.getDecimalStats().getNumDVs()); + cs.setCountDistint(csd.getDecimalStats().isSetNumDVs() ? csd.getDecimalStats().getNumDVs() : -1); cs.setNumNulls(csd.getDecimalStats().getNumNulls()); Decimal highValue = csd.getDecimalStats().getHighValue(); Decimal lowValue = csd.getDecimalStats().getLowValue(); @@ -871,7 +874,7 @@ public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String col cs.setHistogram(csd.getDecimalStats().getHistogram()); } else if (colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) { cs.setAvgColLen(JavaDataModel.get().lengthOfDate()); - cs.setCountDistint(csd.getDateStats().getNumDVs()); + cs.setCountDistint(csd.getDateStats().isSetNumDVs() ? csd.getDateStats().getNumDVs() : -1); cs.setNumNulls(csd.getDateStats().getNumNulls()); Long lowVal = (csd.getDateStats().getLowValue() != null) ? csd.getDateStats().getLowValue() .getDaysSinceEpoch() : null; @@ -900,7 +903,7 @@ public static void fillColumnStatisticsData(ColumnStatisticsData data, ColStatis private static void fillColStatisticsFromLongStatsData(ColStatistics cs, LongColumnStatsData longStats, double avgColLen) { - cs.setCountDistint(longStats.getNumDVs()); + cs.setCountDistint(longStats.isSetNumDVs() ? longStats.getNumDVs() : -1); cs.setNumNulls(longStats.getNumNulls()); cs.setAvgColLen(avgColLen); Long lowVal = longStats.isSetLowValue() ? longStats.getLowValue() : null; @@ -912,7 +915,7 @@ private static void fillColStatisticsFromLongStatsData(ColStatistics cs, LongCol private static void fillColStatisticsFromDoubleStatsData(ColStatistics cs, DoubleColumnStatsData doubleStats, double avgColLen) { - cs.setCountDistint(doubleStats.getNumDVs()); + cs.setCountDistint(doubleStats.isSetNumDVs() ? doubleStats.getNumDVs() : -1); cs.setNumNulls(doubleStats.getNumNulls()); cs.setAvgColLen(avgColLen); Double lowVal = doubleStats.isSetLowValue() ? doubleStats.getLowValue() : null; @@ -1690,6 +1693,9 @@ public static Long addWithExpDecay (List distinctVals) { // Exponential back-off for NDVs. // 1) Descending order sort of NDVs // 2) denominator = NDV1 * (NDV2 ^ (1/2)) * (NDV3 ^ (1/4))) * .... + if (containsUnknownNDV(distinctVals)) { + return -1L; + } distinctVals.sort(Collections.reverseOrder()); long denom = distinctVals.get(0); @@ -1716,6 +1722,10 @@ private static long getNDVFor(ExprNodeGenericFuncDesc engfd, long numRows, Stati for (String col : engfd.getCols()) { ColStatistics stats = parentStats.getColumnStatisticsFromColName(col); if (stats != null) { + // countDistinct < 0 means "unknown" + if (stats.getCountDistint() < 0) { + return -1L; + } ndvs.add(stats.getCountDistint()); } } @@ -2036,20 +2046,23 @@ public static void updateStats(Statistics stats, long newNumRows, for (ColStatistics cs : colStats) { long oldDV = cs.getCountDistint(); if (affectedColumns.contains(cs.getColumnName())) { - long newDV = oldDV; - - // if ratio is greater than 1, then number of rows increases. This can happen - // when some operators like GROUPBY duplicates the input rows in which case - // number of distincts should not change. Update the distinct count only when - // the output number of rows is less than input number of rows. - if (ratio <= 1.0) { - newDV = (long) Math.ceil(ratio * oldDV); - } - cs.setCountDistint(newDV); cs.setFilterColumn(); - oldDV = newDV; + // countDistinct < 0 means "unknown" - skip the NDV math + if (oldDV >= 0) { + long newDV = oldDV; + + // if ratio is greater than 1, then number of rows increases. This can happen + // when some operators like GROUPBY duplicates the input rows in which case + // number of distincts should not change. Update the distinct count only when + // the output number of rows is less than input number of rows. + if (ratio <= 1.0) { + newDV = (long) Math.ceil(ratio * oldDV); + } + cs.setCountDistint(newDV); + oldDV = newDV; + } } - if (oldDV > newNumRows) { + if (oldDV >= 0 && oldDV > newNumRows) { cs.setCountDistint(newNumRows); } // numNulls < 0 means "unknown" - preserve the sentinel value @@ -2080,7 +2093,8 @@ public static void scaleColStatistics(List colStats, double facto if (cs.getNumNulls() >= 0) { cs.setNumNulls(StatsUtils.safeMult(cs.getNumNulls(), factor)); } - if (factor < 1.0) { + // countDistinct < 0 means "unknown" - preserve the sentinel value + if (factor < 1.0 && cs.getCountDistint() >= 0) { final double newNDV = Math.ceil(cs.getCountDistint() * factor); cs.setCountDistint(newNDV > Long.MAX_VALUE ? Long.MAX_VALUE : (long) newNDV); } @@ -2092,7 +2106,8 @@ public static long computeNDVGroupingColumns(List colStats, Stati List ndvValues = extractNDVGroupingColumns(colStats, parentStats); if (ndvValues == null) { - return 0L; + // unknown: a grouping column has NDV<0 or stats are missing on a partial state + return -1L; } if (ndvValues.isEmpty()) { // No grouping columns, one row @@ -2112,6 +2127,11 @@ private static List extractNDVGroupingColumns(List colStats for (ColStatistics cs : colStats) { if (cs != null) { long ndv = cs.getCountDistint(); + // countDistinct < 0 means "unknown" - signal it like a missing entry + if (ndv < 0) { + ndvValues = null; + break; + } if (cs.getNumNulls() > 0) { ndv = StatsUtils.safeAdd(ndv, 1); } @@ -2134,4 +2154,13 @@ private static List extractNDVGroupingColumns(List colStats return ndvValues; } + + /** + * Returns true if any value in the given list is the negative NDV "unknown" + * sentinel established by HIVE-29438 / HIVE-29625. Used by aggregators that + * must propagate unknown when any contributor is unknown. + */ + public static boolean containsUnknownNDV(List distinctVals) { + return distinctVals.stream().anyMatch(v -> v < 0); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java index 4de2867de7c0..2a3e16c48235 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java @@ -41,7 +41,9 @@ public void add(ColStatistics stat) { if (stat.getAvgColLen() > result.getAvgColLen()) { result.setAvgColLen(stat.getAvgColLen()); } - if (stat.getCountDistint() > result.getCountDistint()) { + if (stat.getCountDistint() < 0 || result.getCountDistint() < 0) { + result.setCountDistint(-1); + } else if (stat.getCountDistint() > result.getCountDistint()) { result.setCountDistint(stat.getCountDistint()); } if (stat.getNumNulls() < 0 || result.getNumNulls() < 0) { diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestReduceSinkMapJoinProc.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestReduceSinkMapJoinProc.java new file mode 100644 index 000000000000..58329a228027 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestReduceSinkMapJoinProc.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; +import static org.mockito.Mockito.when; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Stream; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.CompilationOpContext; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.MapJoinOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.parse.GenTezProcContext; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.plan.BaseWork; +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.MapJoinDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; +import org.apache.hadoop.hive.ql.stats.StatsUtils; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.mockito.MockedStatic; + +class TestReduceSinkMapJoinProc { + + // A null ndv row represents StatsUtils.getColStatisticsFromExpression returning null. + @ParameterizedTest(name = "{0}") + @MethodSource("keyCountFromNdvCases") + void testProcessReduceSinkToHashJoinKeyCountFromNdv( + String scenarioName, Long ndv, long parentRows, long expectedKeyCount) throws Exception { + invokeAndAssertKeyCount(ndv == null ? null : buildColStat(ndv), parentRows, expectedKeyCount); + } + + private static Stream keyCountFromNdvCases() { + // Behavior recap: + // Initial keyCount = parentRows = stats.getNumRows() + // For each key col: + // if cs == null or cs.getCountDistint() < 0 -> maxKeyCount = MAX_VALUE; break <-- HIVE-29625 change + // else maxKeyCount *= cs.getCountDistint() + // keyCount = min(maxKeyCount, keyCount) + // if keyCount == 0 -> keyCount = 1 + // joinConf.getParentKeyCounts().put(pos, keyCount) [only if keyCount != MAX_VALUE] + return Stream.of( + // NDV > 0 and below parentRows -> keyCount = NDV + Arguments.of("knownPositiveBelowParent", 10L, 1000L, 10L), + // NDV > 0 but above parentRows -> capped at parentRows + Arguments.of("knownPositiveAboveParent", 5000L, 1000L, 1000L), + // NDV = 0 (verified zero under HIVE-29625) -> maxKeyCount = 0 -> keyCount = 0 -> bumped to 1 + Arguments.of("verifiedZeroBumpsToOne", 0L, 1000L, 1L), + // NDV = -1 (unknown under HIVE-29625) -> maxKeyCount = MAX_VALUE -> keyCount = parentRows + Arguments.of("unknownFallsBackToParent", -1L, 1000L, 1000L), + // cs == null (no derivable stat) -> shares the MAX_VALUE fallback path + Arguments.of("nullColStatsFallsBackToParent", null, 1000L, 1000L) + ); + } + + // Shared harness: build GenTezProcContext + mocked operators, run the method, read the keyCount put(). + private static void invokeAndAssertKeyCount( + ColStatistics csForKey, long parentRows, long expectedKeyCount) throws Exception { + + // ---- Operator chain mocks ---- + ReduceSinkOperator parentRS = mock(ReduceSinkOperator.class); + MapJoinOperator mapJoinOp = mock(MapJoinOperator.class); + ReduceSinkDesc rsConf = mock(ReduceSinkDesc.class); + MapJoinDesc joinConf = mock(MapJoinDesc.class); + Statistics rsStats = mock(Statistics.class); + ExprNodeDesc keyExpr = mock(ExprNodeDesc.class); + BaseWork parentWork = mock(BaseWork.class); + + when(parentRS.getConf()).thenReturn(rsConf); + when(parentRS.getStatistics()).thenReturn(rsStats); + when(parentRS.getCompilationOpContext()).thenReturn(new CompilationOpContext()); + Map columnExprMap = new HashMap<>(); + columnExprMap.put(Utilities.ReduceField.KEY.toString() + ".k0", keyExpr); + when(parentRS.getColumnExprMap()).thenReturn(columnExprMap); + Operator upstreamParent = mock(Operator.class); + when(upstreamParent.getSchema()).thenReturn(new RowSchema(Collections.emptyList())); + when(parentRS.getParentOperators()).thenReturn(Arrays.asList(upstreamParent)); + List> childOps = new ArrayList<>(); + childOps.add(mapJoinOp); + when(parentRS.getChildOperators()).thenReturn(childOps); + + when(mapJoinOp.getConf()).thenReturn(joinConf); + + when(rsConf.getOutputKeyColumnNames()).thenReturn(Arrays.asList("k0")); + + when(joinConf.isBucketMapJoin()).thenReturn(false); + when(joinConf.isDynamicPartitionHashJoin()).thenReturn(false); + Map parentKeyCounts = new LinkedHashMap<>(); + when(joinConf.getParentKeyCounts()).thenReturn(parentKeyCounts); + when(joinConf.getParentToInput()).thenReturn(new LinkedHashMap<>()); + when(joinConf.getParentDataSizes()).thenReturn(new LinkedHashMap<>()); + Map> keyExprMap = new HashMap<>(); + keyExprMap.put((byte) 0, Collections.emptyList()); + when(joinConf.getKeys()).thenReturn(keyExprMap); + + when(rsStats.getNumRows()).thenReturn(parentRows); + when(rsStats.getDataSize()).thenReturn(8000L); + + when(parentWork.getName()).thenReturn("parent_work"); + + // ---- Real GenTezProcContext (constructor sets up all the maps for us) ---- + HiveConf conf = new HiveConf(); + ParseContext parseCtx = mock(ParseContext.class); + Context ctx = mock(Context.class); + when(parseCtx.getContext()).thenReturn(ctx); + when(ctx.getSequencer()).thenReturn(new AtomicInteger()); + GenTezProcContext context = new GenTezProcContext( + conf, parseCtx, Collections.emptyList(), new ArrayList<>(), + Collections.emptySet(), Collections.emptySet()); + + context.childToWorkMap.put(parentRS, Arrays.asList(parentWork)); + context.mapJoinParentMap.put(mapJoinOp, Arrays.asList(parentRS)); + + // ---- Stub StatsUtils.getColStatisticsFromExpression to return our chosen colStat ---- + try (MockedStatic stub = mockStatic(StatsUtils.class)) { + stub.when(() -> StatsUtils.getColStatisticsFromExpression( + any(HiveConf.class), any(Statistics.class), any(ExprNodeDesc.class))) + .thenReturn(csForKey); + + ReduceSinkMapJoinProc.processReduceSinkToHashJoin(parentRS, mapJoinOp, context); + } + + Long actual = parentKeyCounts.get(0); + assertEquals(expectedKeyCount, actual.longValue()); + } + + private static ColStatistics buildColStat(long ndv) { + ColStatistics cs = new ColStatistics("k0", "int"); + cs.setCountDistint(ndv); + return cs; + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestSetHashGroupByMinReduction.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestSetHashGroupByMinReduction.java new file mode 100644 index 000000000000..6c7c2145382b --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestSetHashGroupByMinReduction.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyFloat; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.util.Arrays; +import java.util.Collections; +import java.util.function.Consumer; +import java.util.stream.Stream; + +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.plan.GroupByDesc.Mode; +import org.apache.hadoop.hive.ql.plan.Statistics; +import org.apache.hadoop.hive.ql.plan.Statistics.State; +import org.apache.hadoop.hive.ql.stats.StatsUtils; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.mockito.MockedStatic; + +class TestSetHashGroupByMinReduction { + + // Default-reduction tuple used across all tests. Picked so the known-positive case + // (ndvProduct=100, numRows=1000) produces a factor (0.9) strictly below the default + // (0.99), triggering setMinReductionHashAggr. + private static final float DEFAULT_MIN_REDUCTION = 0.99f; + private static final float DEFAULT_MIN_REDUCTION_LOWER_BOUND = 0.1f; + + @ParameterizedTest(name = "{0}") + @MethodSource("ndvProductCases") + void testProcessByNdvProduct(String name, long ndvProduct, boolean expectSetCall) + throws SemanticException { + GroupByOperator op = setupCompleteHashGroupBy(); + GroupByDesc desc = op.getConf(); + + try (MockedStatic stub = mockStatic(StatsUtils.class)) { + stub.when(() -> StatsUtils.computeNDVGroupingColumns(any(), any(), eq(true))) + .thenReturn(ndvProduct); + + Object result = new SetHashGroupByMinReduction().process(op, null, null); + + assertNull(result, "process() always returns null sentinel"); + verify(desc, expectSetCall ? atLeastOnce() : never()).setMinReductionHashAggr(anyFloat()); + } + } + + private static Stream ndvProductCases() { + return Stream.of( + Arguments.of("unknownNDVEarlyReturns", -1L, false), + Arguments.of("verifiedZeroFactorTooHigh", 0L, false), + Arguments.of("knownPositiveBelowDefault", 100L, true) + ); + } + + @ParameterizedTest(name = "{0}") + @MethodSource("earlyReturnGateCases") + void testProcessEarlyReturnsOnUnsupportedState(String name, Consumer flipGate) + throws SemanticException { + GroupByOperator op = setupCompleteHashGroupBy(); + flipGate.accept(op); + + Object result = new SetHashGroupByMinReduction().process(op, null, null); + + assertNull(result); + verify(op.getConf(), never()).setMinReductionHashAggr(anyFloat()); + } + + private static Stream earlyReturnGateCases() { + return Stream.of( + Arguments.of("modeNotHash", + (Consumer) op -> + when(op.getConf().getMode()).thenReturn(Mode.MERGEPARTIAL)), + Arguments.of("basicStatsIncomplete", + (Consumer) op -> + when(op.getStatistics().getBasicStatsState()).thenReturn(State.PARTIAL)), + Arguments.of("columnStatsIncomplete", + (Consumer) op -> + when(op.getStatistics().getColumnStatsState()).thenReturn(State.PARTIAL)) + ); + } + + // Passes all early-return gates; empty keys make the colStats loop a no-op. + private static GroupByOperator setupCompleteHashGroupBy() { + GroupByOperator op = mock(GroupByOperator.class); + GroupByDesc desc = mock(GroupByDesc.class); + Statistics stats = mock(Statistics.class); + Operator parent = mock(Operator.class); + Statistics parentStats = mock(Statistics.class); + RowSchema schema = mock(RowSchema.class); + + when(op.getConf()).thenReturn(desc); + when(op.getStatistics()).thenReturn(stats); + when(op.getSchema()).thenReturn(schema); + when(schema.getSignature()).thenReturn(Collections.emptyList()); + + when(desc.getMode()).thenReturn(Mode.HASH); + when(desc.getKeys()).thenReturn(Collections.emptyList()); + when(desc.getMinReductionHashAggr()).thenReturn(DEFAULT_MIN_REDUCTION); + when(desc.getMinReductionHashAggrLowerBound()).thenReturn(DEFAULT_MIN_REDUCTION_LOWER_BOUND); + + when(stats.getBasicStatsState()).thenReturn(State.COMPLETE); + when(stats.getColumnStatsState()).thenReturn(State.COMPLETE); + + when(parent.getStatistics()).thenReturn(parentStats); + when(parentStats.getNumRows()).thenReturn(1000L); + + when(op.getParentOperators()).thenReturn(Arrays.asList(parent)); + + return op; + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestSortedDynPartitionOptimizer.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestSortedDynPartitionOptimizer.java new file mode 100644 index 000000000000..3fbd063619e3 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestSortedDynPartitionOptimizer.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; +import static org.mockito.Mockito.when; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.function.Function; +import java.util.stream.Stream; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; +import org.apache.hadoop.hive.ql.stats.StatsUtils; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.mockito.MockedStatic; + +class TestSortedDynPartitionOptimizer { + + @ParameterizedTest(name = "{0}") + @MethodSource("computePartCardinalityColumnCases") + void testComputePartCardinalityColumnBranch( + String scenarioName, long[] ndvs, boolean firstStatNull, long expected) { + SortedDynPartitionOptimizer.SortedDynamicPartitionProc proc = newProc(null); + + Statistics tStats = mock(Statistics.class); + Operator fsParent = mock(FileSinkOperator.class); + RowSchema schema = mock(RowSchema.class); + when(fsParent.getSchema()).thenReturn(schema); + + ColStatistics[] colStats = buildColStats(ndvs, firstStatNull, "p"); + List sig = new ArrayList<>(); + List partitionPos = new ArrayList<>(); + for (int i = 0; i < ndvs.length; i++) { + String colName = "p" + i; + ColumnInfo ci = mock(ColumnInfo.class); + when(ci.getInternalName()).thenReturn(colName); + sig.add(ci); + when(tStats.getColumnStatisticsFromColName(colName)).thenReturn(colStats[i]); + partitionPos.add(i); + } + when(schema.getSignature()).thenReturn(sig); + + long result = proc.computePartCardinality( + partitionPos, Collections.emptyList(), tStats, fsParent, new ArrayList<>()); + + assertEquals(expected, result, scenarioName); + } + + private static Stream computePartCardinalityColumnCases() { + return Stream.of( + // All known positive NDVs - product computed + Arguments.of("twoPositiveColumns", new long[] {10L, 5L}, false, 50L), + Arguments.of("singlePositiveColumn", new long[] {42L}, false, 42L), + Arguments.of("threeColumnsCompound", new long[] {3L, 4L, 5L}, false, 60L), + // HIVE-29625: NDV<0 is unknown - returns -1 + Arguments.of("unknownNDVShortCircuits", new long[] {10L, -1L, 5L}, false, -1L), + Arguments.of("firstUnknownShortCircuits", new long[] {-1L, 10L}, false, -1L), + Arguments.of("singleUnknownColumn", new long[] {-1L}, false, -1L), + // Verified zero NDV (HIVE-29625 disambiguation) - falls through to multiplication + Arguments.of("verifiedZeroProducesZero", new long[] {10L, 0L, 5L}, false, 0L), + Arguments.of("singleVerifiedZero", new long[] {0L}, false, 0L), + // Missing stats (partStats == null) - returns -1 + Arguments.of("nullStatsShortCircuits", new long[] {0L, 10L}, true, -1L) + ); + } + + @ParameterizedTest(name = "{0}") + @MethodSource("computePartCardinalityExprCases") + void testComputePartCardinalityCustomExprBranch( + String scenarioName, long[] ndvs, boolean firstStatNull, long expected) { + HiveConf conf = new HiveConf(); + ParseContext parseCtx = mock(ParseContext.class); + when(parseCtx.getConf()).thenReturn(conf); + SortedDynPartitionOptimizer.SortedDynamicPartitionProc proc = newProc(parseCtx); + + Statistics tStats = mock(Statistics.class); + Operator fsParent = mock(FileSinkOperator.class); + ArrayList allRSCols = new ArrayList<>(); + List, ExprNodeDesc>> exprs = new ArrayList<>(); + List resolvedExprs = new ArrayList<>(); + for (int i = 0; i < ndvs.length; i++) { + ExprNodeDesc resolved = mock(ExprNodeDesc.class); + resolvedExprs.add(resolved); + exprs.add(cols -> resolved); + } + + ColStatistics[] colStats = buildColStats(ndvs, firstStatNull, "e"); + try (MockedStatic stub = mockStatic(StatsUtils.class)) { + for (int i = 0; i < ndvs.length; i++) { + final int idx = i; + stub.when(() -> StatsUtils.getColStatisticsFromExpression(eq(conf), eq(tStats), eq(resolvedExprs.get(idx)))) + .thenReturn(colStats[idx]); + } + + long result = proc.computePartCardinality( + Collections.emptyList(), exprs, tStats, fsParent, allRSCols); + + assertEquals(expected, result, scenarioName); + } + } + + private static Stream computePartCardinalityExprCases() { + return Stream.of( + Arguments.of("singleKnownExpr", new long[] {7L}, false, 7L), + Arguments.of("twoKnownExprsMultiply", new long[] {3L, 4L}, false, 12L), + // HIVE-29625: NDV<0 from expression stats short-circuits + Arguments.of("unknownExprStatsShortCircuit", new long[] {5L, -1L}, false, -1L), + Arguments.of("firstExprUnknown", new long[] {-1L, 5L}, false, -1L), + // Verified zero from expression stats - falls through to multiplication + Arguments.of("verifiedZeroExprProducesZero", new long[] {5L, 0L}, false, 0L), + // Null expression stats (StatsUtils returned null) - returns -1 + Arguments.of("nullExprStatsShortCircuits", new long[] {0L, 5L}, true, -1L) + ); + } + + @Test + void testComputePartCardinalityBothEmptyReturnsZero() { + SortedDynPartitionOptimizer.SortedDynamicPartitionProc proc = newProc(null); + long result = proc.computePartCardinality( + Collections.emptyList(), Collections.emptyList(), + mock(Statistics.class), mock(FileSinkOperator.class), new ArrayList<>()); + assertEquals(0L, result, "Both partitionPos and customPartitionExprs empty -> 0"); + } + + private static SortedDynPartitionOptimizer.SortedDynamicPartitionProc newProc(ParseContext parseCtx) { + SortedDynPartitionOptimizer outer = new SortedDynPartitionOptimizer(); + return outer.new SortedDynamicPartitionProc(parseCtx); + } + + // First entry is null when firstStatNull is true; simulates a missing stat for either branch. + private static ColStatistics[] buildColStats(long[] ndvs, boolean firstStatNull, String prefix) { + ColStatistics[] result = new ColStatistics[ndvs.length]; + for (int i = 0; i < ndvs.length; i++) { + if (i == 0 && firstStatNull) { + result[i] = null; + } else { + ColStatistics cs = new ColStatistics(prefix + i, "int"); + cs.setCountDistint(ndvs[i]); + result[i] = cs; + } + } + return result; + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestHiveRelMdDistinctRowCount.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestHiveRelMdDistinctRowCount.java new file mode 100644 index 000000000000..c49b6db33b28 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestHiveRelMdDistinctRowCount.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer.calcite.stats; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Stream; + +import org.apache.calcite.rel.metadata.RelMetadataQuery; +import org.apache.calcite.util.ImmutableBitSet; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class TestHiveRelMdDistinctRowCount { + + @ParameterizedTest(name = "{0}") + @MethodSource("getDistinctRowCountCases") + void testGetDistinctRowCountHiveTableScan( + String scenarioName, long[] ndvs, double rowCount, double expected) { + HiveTableScan htRel = mock(HiveTableScan.class); + RelMetadataQuery mq = mock(RelMetadataQuery.class); + + when(htRel.getColStat(any())).thenReturn(buildColStats(ndvs)); + when(mq.getRowCount(htRel)).thenReturn(rowCount); + + HiveRelMdDistinctRowCount provider = new HiveRelMdDistinctRowCount(); + Double result = provider.getDistinctRowCount(htRel, mq, ImmutableBitSet.of(0), null); + + assertEquals(expected, result); + } + + private static Stream getDistinctRowCountCases() { + return Stream.of( + // All positive, product fits under row count + Arguments.of("allPositiveProductUnderRowCount", new long[] {10L, 5L}, 1000.0, 50.0), + Arguments.of("singlePositive", new long[] {42L}, 1000.0, 42.0), + // Product exceeds row count -> capped + Arguments.of("productCappedAtRowCount", new long[] {2000L, 50L}, 1000.0, 1000.0), + Arguments.of("singlePositiveExceedsRowCount", new long[] {5000L}, 1000.0, 1000.0), + // Verified-zero NDV in any column triggers the <=0 early-exit + Arguments.of("verifiedZeroInAnyColumn", new long[] {10L, 0L, 5L}, 1000.0, 0.0), + Arguments.of("verifiedZeroFirstShortCircuits", new long[] {0L, 10L}, 1000.0, 0.0), + Arguments.of("verifiedZeroAlone", new long[] {0L}, 1000.0, 0.0), + // Unknown NDV (-1) in any column triggers the <=0 early-exit (HIVE-29625) + Arguments.of("unknownInAnyColumn", new long[] {10L, -1L, 5L}, 1000.0, 0.0), + Arguments.of("unknownFirstShortCircuits", new long[] {-1L, 10L}, 1000.0, 0.0), + Arguments.of("unknownAlone", new long[] {-1L}, 1000.0, 0.0), + // Mixed verified-zero and unknown: both produce 0.0 regardless of order + Arguments.of("unknownThenVerifiedZero", new long[] {-1L, 0L}, 1000.0, 0.0), + Arguments.of("verifiedZeroThenUnknown", new long[] {0L, -1L}, 1000.0, 0.0), + // Empty column list - loop doesn't execute, fall through to Math.min(1.0, rowCount) + Arguments.of("emptyColStatsFallsThroughTo1", new long[] {}, 1000.0, 1.0), + Arguments.of("emptyColStatsCappedByLowRowCount", new long[] {}, 0.5, 0.5) + ); + } + + private static List buildColStats(long[] ndvs) { + List stats = new ArrayList<>(); + for (int i = 0; i < ndvs.length; i++) { + ColStatistics cs = new ColStatistics("c" + i, "int"); + cs.setCountDistint(ndvs[i]); + stats.add(cs); + } + return stats; + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/stats/annotation/TestStatsRulesProcFactory.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/stats/annotation/TestStatsRulesProcFactory.java index 4d9d351af8f1..6e646a6ad502 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/stats/annotation/TestStatsRulesProcFactory.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/stats/annotation/TestStatsRulesProcFactory.java @@ -23,14 +23,25 @@ import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.common.type.Timestamp; import org.apache.hadoop.hive.metastore.StatisticsTestUtils; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.tez.DagUtils; +import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.ColStatistics; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan; @@ -39,23 +50,37 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.ql.plan.AggregationDesc; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; -import org.junit.Test; +import org.apache.hadoop.yarn.api.records.Resource; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.mockito.ArgumentCaptor; +import org.mockito.MockedStatic; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Stream; import org.apache.hadoop.hive.ql.exec.CommonJoinOperator; import org.apache.hadoop.hive.ql.plan.JoinCondDesc; import org.apache.hadoop.hive.ql.plan.JoinDesc; +import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; import static org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory.FilterStatsRule.extractFloatFromLiteralValue; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertThrows; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; -public class TestStatsRulesProcFactory { +class TestStatsRulesProcFactory { private final static String COL_NAME = "col1"; private final static ExprNodeDesc COL_EXPR = new ExprNodeColumnDesc( @@ -66,7 +91,7 @@ public class TestStatsRulesProcFactory { private final static long[] VALUES = { 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 4L, 5L, 6L, 7L }; @Test - public void testComparisonRowCountZeroNonNullValues() throws SemanticException { + void testComparisonRowCountZeroNonNullValues() throws SemanticException { long numNulls = 2; long[] values = {}; Statistics stats = createStatistics(values, numNulls); @@ -80,7 +105,7 @@ public void testComparisonRowCountZeroNonNullValues() throws SemanticException { } @Test - public void testComparisonRowCountInvalidKll() throws SemanticException { + void testComparisonRowCountInvalidKll() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); stats.getColumnStats().get(0).setHistogram(null); @@ -102,253 +127,304 @@ public void testComparisonRowCountInvalidKll() throws SemanticException { assertEquals((VALUES.length + numNulls) / 3, numRows); } - @Test - public void testComparisonRowCountLessThan() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPLessThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(3))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); + /** + * HIVE-29625: IN-filter row-count estimate by NDV of the column. + * unknown NDV (-1) -> factor *= 0.5 per IN value, currNumRows = round(rows * 0.5) + * verified-zero NDV (0) -> factor = 0, no rows match + */ + @ParameterizedTest(name = "{0}") + @MethodSource("evaluateInExprCases") + void testEvaluateInExprByNDV(String name, long ndvOverride, long expectedRows) + throws SemanticException { + Statistics stats = createStatistics(VALUES, 0); + stats.getColumnStats().get(0).setCountDistint(ndvOverride); - assertEquals(8, numRows); - } + AnnotateStatsProcCtx ctx = spy(new AnnotateStatsProcCtx(null)); + when(ctx.getConf()).thenReturn(new HiveConf()); - @Test - public void testComparisonRowCountLessThanMin() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); + ExprNodeDesc inExpr = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, + new GenericUDFIn(), + Arrays.asList(COL_EXPR, createExprNodeConstantDesc(1), createExprNodeConstantDesc(2))); - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPLessThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(1))); long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); + stats, inExpr, ctx, Arrays.asList(COL_NAME), null, VALUES.length); - assertEquals(0, numRows); + assertEquals(expectedRows, numRows); } - @Test - public void testComparisonRowCountLessThanBelowMin() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPLessThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(0))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); - - assertEquals(0, numRows); + private static Stream evaluateInExprCases() { + return Stream.of( + Arguments.of("unknownNDVAppliesHalfFactor", -1L, Math.round(VALUES.length * 0.5)), + Arguments.of("verifiedZeroReturnsZero", 0L, 0L) + ); } - @Test - public void testComparisonRowCountLessThanMax() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); + /** + * HIVE-29625: col = const row-count estimate by NDV of the column. + * unknown NDV (-1) -> numRows/2 (13/2 = 6) + * verified-zero NDV (0) -> 0 rows match + * known NDV (n) -> uniform distribution numRows/n (13/7 = 1 ~ rounded to 2) + */ + @ParameterizedTest(name = "{0}") + @MethodSource("evaluateEqualCases") + void testEvaluateEqualByNDV(String name, long ndvOverride, long expectedRows) + throws SemanticException { + Statistics stats = createStatistics(VALUES, 0); + stats.getColumnStats().get(0).setCountDistint(ndvOverride); - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPLessThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(7))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); + AnnotateStatsProcCtx ctx = spy(new AnnotateStatsProcCtx(null)); + when(ctx.getConf()).thenReturn(new HiveConf()); - assertEquals(12, numRows); - } + ExprNodeDesc eqExpr = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, + new GenericUDFOPEqual(), + Arrays.asList(COL_EXPR, createExprNodeConstantDesc(1))); - @Test - public void testComparisonRowCountLessThanAboveMax() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPLessThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(8))); long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); + stats, eqExpr, ctx, Arrays.asList(COL_NAME), null, VALUES.length); - assertEquals(13, numRows); + assertEquals(expectedRows, numRows); } - @Test - public void testComparisonRowCountEqualOrLessThan() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPEqualOrLessThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(3))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); - - assertEquals(9, numRows); + private static Stream evaluateEqualCases() { + return Stream.of( + Arguments.of("unknownNDVUsesHalfRows", -1L, 6L), + Arguments.of("verifiedZeroReturnsZero", 0L, 0L), + Arguments.of("knownNDVUsesUniformDistribution", 7L, 2L) + ); } - @Test - public void testComparisonRowCountEqualOrLessThanMin() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPEqualOrLessThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(1))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); - - assertEquals(1, numRows); + @ParameterizedTest(name = "{0}") + @MethodSource("groupByFinalCases") + void testGroupByStatsRuleFinalCardinality(String name, long keyNdv, long expectedRows) throws SemanticException { + assertGroupByFinalCardinality(keyNdv, expectedRows); } - @Test - public void testComparisonRowCountEqualOrLessThanBelowMin() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPEqualOrLessThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(0))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); - - assertEquals(0, numRows); + private static Stream groupByFinalCases() { + return Stream.of( + Arguments.of("ndvUnknownAppliesFallback", -1L, 500L), + Arguments.of("ndvVerifiedZeroFlowsThroughClampedToOne", 0L, 1L), + Arguments.of("ndvKnownUsesProduct", 10L, 10L) + ); } - @Test - public void testComparisonRowCountEqualOrLessThanMax() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPEqualOrLessThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(7))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); + @ParameterizedTest(name = "{0}") + @MethodSource("groupByHashCases") + void testCheckMapSideAggregationHashCardinality(String name, long keyNdv, long expectedRows) + throws SemanticException { + assertGroupByHashCardinality(keyNdv, expectedRows); + } - assertEquals(13, numRows); + private static Stream groupByHashCases() { + return Stream.of( + Arguments.of("ndvUnknownFallsBackToHalfParent", -1L, 500L), + Arguments.of("ndvKnownUsesProduct", 100L, 100L) + ); } - @Test - public void testComparisonRowCountEqualOrLessThanAboveMax() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPEqualOrLessThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(8))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); + private void assertGroupByHashCardinality(long keyNdv, long expectedRows) throws SemanticException { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setBasicStatsState(Statistics.State.COMPLETE); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + ColStatistics keyCol = new ColStatistics("k", "int"); + keyCol.setCountDistint(keyNdv); + keyCol.setNumNulls(0); + parentStats.setColumnStats(Collections.singletonList(keyCol)); + + @SuppressWarnings("unchecked") + Operator parent = mock(Operator.class); + when(parent.getStatistics()).thenReturn(parentStats); + when(parent.getParentOperators()).thenReturn(Collections.emptyList()); + + GroupByDesc gbyDesc = mock(GroupByDesc.class); + when(gbyDesc.getMode()).thenReturn(GroupByDesc.Mode.HASH); + when(gbyDesc.getAggregators()).thenReturn(Collections.emptyList()); + when(gbyDesc.isGroupingSetsPresent()).thenReturn(false); + ExprNodeColumnDesc keyExpr = new ExprNodeColumnDesc(TypeInfoFactory.intTypeInfo, "k", "table", false); + when(gbyDesc.getKeys()).thenReturn(Collections.singletonList(keyExpr)); + + GroupByOperator gop = mock(GroupByOperator.class); + when(gop.getParentOperators()).thenReturn(Collections.singletonList(parent)); + when(gop.getConf()).thenReturn(gbyDesc); + Map colExprMap = new HashMap<>(); + colExprMap.put("_col0", keyExpr); + when(gop.getColumnExprMap()).thenReturn(colExprMap); + RowSchema rs = mock(RowSchema.class); + ColumnInfo colInfo = new ColumnInfo("_col0", TypeInfoFactory.intTypeInfo, "table", false); + when(rs.getSignature()).thenReturn(Collections.singletonList(colInfo)); + when(rs.getColumnInfo("_col0")).thenReturn(colInfo); + when(gop.getSchema()).thenReturn(rs); + + Context context = mock(Context.class); + HiveConf conf = new HiveConf(); + conf.setBoolVar(HiveConf.ConfVars.HIVE_QUERY_REEXECUTION_ENABLED, false); + when(context.getConf()).thenReturn(conf); + ParseContext pctx = mock(ParseContext.class); + when(pctx.getContext()).thenReturn(context); + AnnotateStatsProcCtx ctx = spy(new AnnotateStatsProcCtx(null)); + when(ctx.getConf()).thenReturn(conf); + when(ctx.getParseContext()).thenReturn(pctx); + + // checkMapSideAggregation calls DagUtils.getContainerResource(conf) to compute + // the available hash-aggregation memory. Stub it to a generous 1024 MB so the + // estimated hash table size stays well under the threshold and hashAgg is selected. + try (MockedStatic dagMock = mockStatic(DagUtils.class)) { + Resource res = mock(Resource.class); + when(res.getMemorySize()).thenReturn(1024L); + dagMock.when(() -> DagUtils.getContainerResource(any())).thenReturn(res); + + new StatsRulesProcFactory.GroupByStatsRule().process(gop, null, ctx, (Object[]) null); + } - assertEquals(13, numRows); + ArgumentCaptor captor = ArgumentCaptor.forClass(Statistics.class); + verify(gop).setStatistics(captor.capture()); + assertEquals(expectedRows, captor.getValue().getNumRows()); } - @Test - public void testComparisonRowCountGreaterThan() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPGreaterThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(5))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); + private void assertGroupByFinalCardinality(long keyNdv, long expectedRows) throws SemanticException { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setBasicStatsState(Statistics.State.COMPLETE); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + ColStatistics keyCol = new ColStatistics("k", "int"); + keyCol.setCountDistint(keyNdv); + keyCol.setNumNulls(0); + parentStats.setColumnStats(Collections.singletonList(keyCol)); - assertEquals(2, numRows); - } + @SuppressWarnings("unchecked") + Operator parent = mock(Operator.class); + when(parent.getStatistics()).thenReturn(parentStats); + when(parent.getParentOperators()).thenReturn(Collections.emptyList()); + + GroupByDesc gbyDesc = mock(GroupByDesc.class); + when(gbyDesc.getMode()).thenReturn(GroupByDesc.Mode.FINAL); + when(gbyDesc.getAggregators()).thenReturn(Collections.emptyList()); + when(gbyDesc.isGroupingSetsPresent()).thenReturn(false); + ExprNodeColumnDesc keyExpr = new ExprNodeColumnDesc(TypeInfoFactory.intTypeInfo, "k", "table", false); + when(gbyDesc.getKeys()).thenReturn(Collections.singletonList(keyExpr)); + + GroupByOperator gop = mock(GroupByOperator.class); + when(gop.getParentOperators()).thenReturn(Collections.singletonList(parent)); + when(gop.getConf()).thenReturn(gbyDesc); + Map colExprMap = new HashMap<>(); + colExprMap.put("_col0", keyExpr); + when(gop.getColumnExprMap()).thenReturn(colExprMap); + RowSchema rs = mock(RowSchema.class); + ColumnInfo colInfo = new ColumnInfo("_col0", TypeInfoFactory.intTypeInfo, "table", false); + when(rs.getSignature()).thenReturn(Collections.singletonList(colInfo)); + when(rs.getColumnInfo("_col0")).thenReturn(colInfo); + when(gop.getSchema()).thenReturn(rs); + + Context context = mock(Context.class); + HiveConf conf = new HiveConf(); + conf.setBoolVar(HiveConf.ConfVars.HIVE_QUERY_REEXECUTION_ENABLED, false); + when(context.getConf()).thenReturn(conf); + ParseContext pctx = mock(ParseContext.class); + when(pctx.getContext()).thenReturn(context); + AnnotateStatsProcCtx ctx = spy(new AnnotateStatsProcCtx(null)); + when(ctx.getConf()).thenReturn(conf); + when(ctx.getParseContext()).thenReturn(pctx); - @Test - public void testComparisonRowCountGreaterThanMin() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPGreaterThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(1))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); + new StatsRulesProcFactory.GroupByStatsRule().process(gop, null, ctx, (Object[]) null); - assertEquals(12, numRows); + ArgumentCaptor captor = ArgumentCaptor.forClass(Statistics.class); + verify(gop).setStatistics(captor.capture()); + assertEquals(expectedRows, captor.getValue().getNumRows()); } - @Test - public void testComparisonRowCountGreaterThanBelowMin() throws SemanticException { + @ParameterizedTest(name = "{0}") + @MethodSource("comparisonRowCountLessThanCases") + void testComparisonRowCountLessThan(String name, int constant, long expected) throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); + ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPGreaterThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(0))); + new GenericUDFOPLessThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(constant))); long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); - assertEquals(13, numRows); + assertEquals(expected, numRows); } - @Test - public void testComparisonRowCountGreaterThanMax() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPGreaterThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(7))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); - - assertEquals(0, numRows); + private static Stream comparisonRowCountLessThanCases() { + return Stream.of( + Arguments.of("midRange", 3, 8L), + Arguments.of("equalToMin", 1, 0L), + Arguments.of("belowMin", 0, 0L), + Arguments.of("equalToMax", 7, 12L), + Arguments.of("aboveMax", 8, 13L) + ); } - @Test - public void testComparisonRowCountGreaterThanAboveMax() throws SemanticException { + @ParameterizedTest(name = "{0}") + @MethodSource("comparisonRowCountEqualOrLessThanCases") + void testComparisonRowCountEqualOrLessThan(String name, int constant, long expected) throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPGreaterThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(8))); + new GenericUDFOPEqualOrLessThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(constant))); long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); - assertEquals(0, numRows); + assertEquals(expected, numRows); } - @Test - public void testComparisonRowCountEqualOrGreaterThan() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPEqualOrGreaterThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(5))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); - - assertEquals(3, numRows); + private static Stream comparisonRowCountEqualOrLessThanCases() { + return Stream.of( + Arguments.of("midRange", 3, 9L), + Arguments.of("equalToMin", 1, 1L), + Arguments.of("belowMin", 0, 0L), + Arguments.of("equalToMax", 7, 13L), + Arguments.of("aboveMax", 8, 13L) + ); } - @Test - public void testComparisonRowCountEqualOrGreaterThanMin() throws SemanticException { + @ParameterizedTest(name = "{0}") + @MethodSource("comparisonRowCountGreaterThanCases") + void testComparisonRowCountGreaterThan(String name, int constant, long expected) throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPEqualOrGreaterThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(1))); + new GenericUDFOPGreaterThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(constant))); long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); - assertEquals(13, numRows); + assertEquals(expected, numRows); } - @Test - public void testComparisonRowCountEqualOrGreaterThanBelowMin() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPEqualOrGreaterThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(0))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); - - assertEquals(13, numRows); + private static Stream comparisonRowCountGreaterThanCases() { + return Stream.of( + Arguments.of("midRange", 5, 2L), + Arguments.of("equalToMin", 1, 12L), + Arguments.of("belowMin", 0, 13L), + Arguments.of("equalToMax", 7, 0L), + Arguments.of("aboveMax", 8, 0L) + ); } - @Test - public void testComparisonRowCountEqualOrGreaterThanMax() throws SemanticException { + @ParameterizedTest(name = "{0}") + @MethodSource("comparisonRowCountEqualOrGreaterThanCases") + void testComparisonRowCountEqualOrGreaterThan(String name, int constant, long expected) throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPEqualOrGreaterThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(7))); + new GenericUDFOPEqualOrGreaterThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(constant))); long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); - assertEquals(1, numRows); + assertEquals(expected, numRows); } - @Test - public void testComparisonRowCountEqualOrGreaterThanBeyondMax() throws SemanticException { - long numNulls = 2; - Statistics stats = createStatistics(VALUES, numNulls); - ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, - new GenericUDFOPEqualOrGreaterThan(), Arrays.asList(COL_EXPR, createExprNodeConstantDesc(8))); - long numRows = new StatsRulesProcFactory.FilterStatsRule().evaluateExpression( - stats, exprNodeDesc, STATS_PROC_CTX, Collections.emptyList(), null, VALUES.length + numNulls); - - assertEquals(0, numRows); + private static Stream comparisonRowCountEqualOrGreaterThanCases() { + return Stream.of( + Arguments.of("midRange", 5, 3L), + Arguments.of("equalToMin", 1, 13L), + Arguments.of("belowMin", 0, 13L), + Arguments.of("equalToMax", 7, 1L), + Arguments.of("aboveMax", 8, 0L) + ); } @Test - public void testComparisonRowCountEqualOrLessThanWhenMinEqualMax() throws SemanticException { + void testComparisonRowCountEqualOrLessThanWhenMinEqualMax() throws SemanticException { long[] values = { 1L, 1L }; long numNulls = 2; Statistics stats = createStatistics(values, numNulls); @@ -362,7 +438,7 @@ public void testComparisonRowCountEqualOrLessThanWhenMinEqualMax() throws Semant } @Test - public void testComparisonRowCountEqualOrGreaterThanWhenMinEqualMax() throws SemanticException { + void testComparisonRowCountEqualOrGreaterThanWhenMinEqualMax() throws SemanticException { long[] values = { 1L, 1L }; long numNulls = 2; Statistics stats = createStatistics(values, numNulls); @@ -376,7 +452,7 @@ public void testComparisonRowCountEqualOrGreaterThanWhenMinEqualMax() throws Sem } @Test - public void testBetween() throws SemanticException { + void testBetween() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -390,7 +466,7 @@ public void testBetween() throws SemanticException { } @Test - public void testLiteralExtraction() { + void testLiteralExtraction() { final double DELTA = 1e-5; assertEquals((float) 100, @@ -420,7 +496,7 @@ public void testLiteralExtraction() { } @Test - public void testLiteralExtractionFailures() { + void testLiteralExtractionFailures() { // make sure the correct exceptions are raised so that we can default to standard computation String[] types = {"int", "tinyint", "smallint", "bigint", "date", "timestamp", "float", "double"}; for (String type : types) { @@ -437,7 +513,7 @@ public void testLiteralExtractionFailures() { } @Test - public void testBetweenLeftLowerThanMin() throws SemanticException { + void testBetweenLeftLowerThanMin() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -457,7 +533,7 @@ public void testBetweenLeftLowerThanMin() throws SemanticException { } @Test - public void testBetweenLeftLowerThanMinRightHigherThanMax() throws SemanticException { + void testBetweenLeftLowerThanMinRightHigherThanMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -471,7 +547,7 @@ public void testBetweenLeftLowerThanMinRightHigherThanMax() throws SemanticExcep } @Test - public void testBetweenRightHigherThanMax() throws SemanticException { + void testBetweenRightHigherThanMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -491,7 +567,7 @@ public void testBetweenRightHigherThanMax() throws SemanticException { } @Test - public void testBetweenRightLowerThanMin() throws SemanticException { + void testBetweenRightLowerThanMin() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -505,7 +581,7 @@ public void testBetweenRightLowerThanMin() throws SemanticException { } @Test - public void testBetweenLeftHigherThanMax() throws SemanticException { + void testBetweenLeftHigherThanMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -519,7 +595,7 @@ public void testBetweenLeftHigherThanMax() throws SemanticException { } @Test - public void testBetweenLeftEqualMax() throws SemanticException { + void testBetweenLeftEqualMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -533,7 +609,7 @@ public void testBetweenLeftEqualMax() throws SemanticException { } @Test - public void testNotBetween() throws SemanticException { + void testNotBetween() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -558,7 +634,7 @@ public void testNotBetween() throws SemanticException { } @Test - public void testNotBetweenLowerThanMinHigherThanMax() throws SemanticException { + void testNotBetweenLowerThanMinHigherThanMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -572,7 +648,7 @@ public void testNotBetweenLowerThanMinHigherThanMax() throws SemanticException { } @Test - public void testNotBetweenLeftEqualsRight() throws SemanticException { + void testNotBetweenLeftEqualsRight() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -586,7 +662,7 @@ public void testNotBetweenLeftEqualsRight() throws SemanticException { } @Test - public void testNotBetweenRightLowerThanLeft() throws SemanticException { + void testNotBetweenRightLowerThanLeft() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -648,7 +724,7 @@ private static ColStatistics createColStatistics( * Without the fix, valuesCount = numRows - (-1) = numRows + 1 (wrong). */ @Test - public void testComputeAggregateColumnMinMaxWithUnknownNumNulls() throws SemanticException { + void testComputeAggregateColumnMinMaxWithUnknownNumNulls() throws SemanticException { ColStatistics cs = new ColStatistics("_col0", "bigint"); HiveConf conf = new HiveConf(); @@ -676,14 +752,14 @@ public void testComputeAggregateColumnMinMaxWithUnknownNumNulls() throws Semanti // Verify: With the fix, COUNT Range should be (0, 100) // numNulls=-1 is treated as 0, so valuesCount = 100 - 0 = 100 // Without the fix, valuesCount = 100 - (-1) = 101 (WRONG) - assertNotNull("Range should be set on COUNT column", cs.getRange()); - assertEquals("COUNT min should be 0", 0L, ((Number) cs.getRange().minValue).longValue()); - assertEquals("COUNT max should be 100 (numRows), not 101", - 100L, ((Number) cs.getRange().maxValue).longValue()); + assertNotNull(cs.getRange(), "Range should be set on COUNT column"); + assertEquals(0L, ((Number) cs.getRange().minValue).longValue(), "COUNT min should be 0"); + assertEquals(100L, ((Number) cs.getRange().maxValue).longValue(), + "COUNT max should be 100 (numRows), not 101"); } @Test - public void testComputeAggregateColumnMinMaxWithKnownNumNulls() throws SemanticException { + void testComputeAggregateColumnMinMaxWithKnownNumNulls() throws SemanticException { ColStatistics cs = new ColStatistics("_col0", "bigint"); HiveConf conf = new HiveConf(); @@ -708,10 +784,58 @@ public void testComputeAggregateColumnMinMaxWithKnownNumNulls() throws SemanticE cs, conf, agg, "bigint", parentStats); // With known numNulls=20, valuesCount = 100 - 20 = 80 - assertNotNull("Range should be set", cs.getRange()); + assertNotNull(cs.getRange(), "Range should be set"); assertEquals(0L, ((Number) cs.getRange().minValue).longValue()); - assertEquals("COUNT max should be 80 (numRows - numNulls)", - 80L, ((Number) cs.getRange().maxValue).longValue()); + assertEquals(80L, ((Number) cs.getRange().maxValue).longValue(), + "COUNT max should be 80 (numRows - numNulls)"); + } + + /** + * HIVE-29625: COUNT(DISTINCT col) uses parentCS.getCountDistint() as the max range. + * When NDV is unknown (-1) the new guard short-circuits before building a Range with + * negative maxValue. When NDV is known, Range is set to [0, NDV]. + */ + @ParameterizedTest(name = "{0}") + @MethodSource("computeAggregateColumnMinMaxDistinctCases") + void testComputeAggregateColumnMinMaxDistinctByNDV( + String name, long parentNDV, Long expectedMax) throws SemanticException { + ColStatistics cs = new ColStatistics("_col0", "bigint"); + HiveConf conf = new HiveConf(); + + ColStatistics parentColStats = new ColStatistics("val", "int"); + parentColStats.setNumNulls(0); + parentColStats.setCountDistint(parentNDV); + parentColStats.setRange(1, 100); + + Statistics parentStats = new Statistics(100, 400, 400, 400); + parentStats.addToColumnStats(Collections.singletonList(parentColStats)); + + ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc( + TypeInfoFactory.intTypeInfo, "val", "t", false); + AggregationDesc agg = new AggregationDesc(); + agg.setGenericUDAFName("count"); + agg.setParameters(Collections.singletonList(colExpr)); + agg.setDistinct(true); + agg.setMode(GenericUDAFEvaluator.Mode.COMPLETE); + + StatsRulesProcFactory.GroupByStatsRule.computeAggregateColumnMinMax( + cs, conf, agg, "bigint", parentStats); + + if (expectedMax == null) { + assertNull(cs.getRange(), "Range should NOT be set when DISTINCT NDV is unknown"); + } else { + assertNotNull(cs.getRange(), "Range should be set when DISTINCT NDV is known"); + assertEquals(0L, ((Number) cs.getRange().minValue).longValue()); + assertEquals(expectedMax.longValue(), ((Number) cs.getRange().maxValue).longValue(), + "COUNT DISTINCT max should equal the parent NDV"); + } + } + + private static Stream computeAggregateColumnMinMaxDistinctCases() { + return Stream.of( + Arguments.of("unknownNDVReturnsEarlyNoRange", -1L, null), + Arguments.of("knownNDVSetsRangeUpToNDV", 50L, 50L) + ); } /** @@ -720,7 +844,7 @@ public void testComputeAggregateColumnMinMaxWithKnownNumNulls() throws SemanticE * Without the fix, LEFT_OUTER_JOIN would calculate: newNumNulls = oldNumNulls + leftUnmatchedRows = -1 + 100 = 99 */ @Test - public void testUpdateNumNullsPreservesUnknownNumNulls() { + void testUpdateNumNullsPreservesUnknownNumNulls() { StatsRulesProcFactory.JoinStatsRule joinStatsRule = new StatsRulesProcFactory.JoinStatsRule(); // Create ColStatistics with numNulls = -1 (unknown) @@ -749,7 +873,101 @@ public void testUpdateNumNullsPreservesUnknownNumNulls() { joinStatsRule.updateNumNulls(colStats, 100L, 100L, 1000L, 0L, mockJop); // Assert that numNulls is still -1 (unchanged) - assertEquals("Unknown numNulls (-1) should be preserved after updateNumNulls", - -1L, colStats.getNumNulls()); + assertEquals(-1L, colStats.getNumNulls(), + "Unknown numNulls (-1) should be preserved after updateNumNulls"); + } + + @ParameterizedTest(name = "{0}") + @MethodSource("calculateUnmatchedRowsForOuterCases") + void testCalculateUnmatchedRowsForOuter( + String name, long ndv, long distinctUnmatched, long expected) { + assertCalculateUnmatchedRowsForOuter(ndv, distinctUnmatched, expected); + } + + private static Stream calculateUnmatchedRowsForOuterCases() { + return Stream.of( + Arguments.of("distinctValUnknownReturnsInputRowCount", -1L, 5L, 100L), + Arguments.of("distinctValVerifiedZeroReturnsInputRowCount", 0L, 5L, 100L), + Arguments.of("distinctUnmatchedUnknownReturnsInputRowCount", 10L, -1L, 100L), + Arguments.of("distinctUnmatchedExceedsReturnsInputRowCount", 10L, 15L, 100L), + Arguments.of("normalCaseDivides", 10L, 2L, 20L) + ); + } + + @ParameterizedTest(name = "{0}") + @MethodSource("computeRowCountAssumingInnerJoinCases") + void testComputeRowCountAssumingInnerJoin(String name, long denom, long expected) { + assertComputeRowCountAssumingInnerJoin(denom, expected); + } + + private static Stream computeRowCountAssumingInnerJoinCases() { + return Stream.of( + Arguments.of("denomPositiveDivides", 10L, 2000L), + Arguments.of("denomZeroClampsToOne", 0L, 20000L), + Arguments.of("denomNegativeClampsToOne", -1L, 20000L) + ); + } + + @ParameterizedTest(name = "{0}") + @MethodSource("updateColStatsCases") + void testUpdateColStats(String name, long initialNdv, long expectedNdv) { + ColStatistics cs = new ColStatistics("k", "int"); + cs.setCountDistint(initialNdv); + cs.setNumNulls(0); + Statistics stats = new Statistics(1000, 8000, 0, 0); + stats.setColumnStats(Collections.singletonList(cs)); + + Map reversedExprs = new HashMap<>(); + reversedExprs.put("k", (byte) 0); + JoinCondDesc joinCond = mock(JoinCondDesc.class); + when(joinCond.getType()).thenReturn(JoinDesc.INNER_JOIN); + JoinDesc joinDesc = mock(JoinDesc.class); + when(joinDesc.getReversedExprs()).thenReturn(reversedExprs); + when(joinDesc.getConds()).thenReturn(new JoinCondDesc[]{joinCond}); + when(joinDesc.getJoinKeys()).thenReturn(new ExprNodeDesc[][]{}); + @SuppressWarnings("unchecked") + CommonJoinOperator jop = mock(CommonJoinOperator.class); + when(jop.getConf()).thenReturn(joinDesc); + RowSchema schema = mock(RowSchema.class); + when(schema.getColumnNames()).thenReturn(Collections.singletonList("k")); + when(schema.getSignature()).thenReturn(Collections.emptyList()); + when(jop.getSchema()).thenReturn(schema); + Map rowCountParents = new HashMap<>(); + rowCountParents.put(0, 1000L); + HiveConf conf = new HiveConf(); + conf.setBoolVar(HiveConf.ConfVars.HIVE_STATS_JOIN_NDV_READJUSTMENT, false); + + new StatsRulesProcFactory.JoinStatsRule().updateColStats( + conf, stats, 0L, 0L, 500L, jop, rowCountParents); + + assertEquals(expectedNdv, cs.getCountDistint()); + } + + private static Stream updateColStatsCases() { + return Stream.of( + Arguments.of("unknownNdvSkipsMath", -1L, -1L), + Arguments.of("knownNdvScaledByRatio", 100L, 50L) + ); + } + + private void assertComputeRowCountAssumingInnerJoin(long denom, long expected) { + StatsRulesProcFactory.JoinStatsRule rule = new StatsRulesProcFactory.JoinStatsRule(); + long actual = rule.computeRowCountAssumingInnerJoin(Arrays.asList(100L, 200L), denom, null); + assertEquals(expected, actual); + } + + private void assertCalculateUnmatchedRowsForOuter(long ndv, long distinctUnmatched, long expected) { + HiveConf conf = new HiveConf(); + ColStatistics cs = new ColStatistics("k", "int"); + cs.setCountDistint(ndv); + cs.setNumNulls(0); + Statistics stats = new Statistics(100, 400, 0, 0); + stats.setColumnStats(Collections.singletonList(cs)); + + StatsRulesProcFactory.JoinStatsRule rule = new StatsRulesProcFactory.JoinStatsRule(); + long actual = rule.calculateUnmatchedRowsForOuter( + conf, 100L, Collections.singletonList("k"), stats, distinctUnmatched); + + assertEquals(expected, actual); } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java index c009472fed0a..679bf28734ed 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java @@ -25,6 +25,7 @@ import java.lang.reflect.Field; import java.lang.reflect.Modifier; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Set; @@ -32,13 +33,16 @@ import org.apache.commons.lang3.reflect.FieldUtils; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData; import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.Date; import org.apache.hadoop.hive.metastore.api.DateColumnStatsData; +import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData; import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; +import org.apache.hadoop.hive.metastore.api.StringColumnStatsData; import org.apache.hadoop.hive.metastore.api.Timestamp; import org.apache.hadoop.hive.metastore.api.TimestampColumnStatsData; import org.apache.hadoop.hive.ql.plan.ColStatistics; @@ -285,6 +289,204 @@ void testStatisticsAddToColumnStatsPropagatesUnknownFromExisting() { assertEquals(-1, merged.getNumNulls(), "Unknown numNulls (-1) should be propagated when existing is unknown"); } + @ParameterizedTest(name = "{0}") + @MethodSource("addToColumnStatsCountDistinctCases") + void testStatisticsAddToColumnStatsCountDistinctMerge( + String scenarioName, long existingNdv, long incomingNdv, long expectedMergedNdv) { + Statistics stats = new Statistics(1000, 8000, 0, 0); + ColStatistics existing = createColStats("col1", existingNdv, 0); + stats.setColumnStats(Collections.singletonList(existing)); + + ColStatistics incoming = createColStats("col1", incomingNdv, 0); + stats.addToColumnStats(Collections.singletonList(incoming)); + + ColStatistics merged = stats.getColumnStatisticsFromColName("col1"); + assertEquals(expectedMergedNdv, merged.getCountDistint(), + "countDistinct after merge"); + } + + private static Stream addToColumnStatsCountDistinctCases() { + return Stream.of( + Arguments.of("incomingUnknownPropagates", 5L, -1L, -1L), + Arguments.of("existingUnknownPropagates", -1L, 5L, -1L), + Arguments.of("bothUnknownStaysUnknown", -1L, -1L, -1L), + Arguments.of("maxPicksIncomingWhenHigher", 3L, 7L, 7L), + Arguments.of("maxPicksExistingWhenHigher", 7L, 3L, 7L) + ); + } + + @ParameterizedTest(name = "{0}") + @MethodSource("containsUnknownNDVCases") + void testContainsUnknownNDV(String scenarioName, List input, boolean expected) { + assertEquals(expected, StatsUtils.containsUnknownNDV(input), + "containsUnknownNDV(" + input + ")"); + } + + private static Stream containsUnknownNDVCases() { + return Stream.of( + Arguments.of("allPositive", Arrays.asList(1L, 2L, 3L), false), + Arguments.of("containsZero_NotUnknown", Arrays.asList(1L, 0L, 3L), false), + Arguments.of("singleUnknown", Arrays.asList(1L, -1L, 3L), true), + Arguments.of("allUnknown", Arrays.asList(-1L, -1L, -1L), true), + Arguments.of("firstIsUnknown_ShortCircuit", Arrays.asList(-1L, 2L, 3L), true), + Arguments.of("emptyList", Collections.emptyList(), false) + ); + } + + @Test + void testAddWithExpDecayReturnsUnknownWhenAnyInputIsUnknown() { + Long result = StatsUtils.addWithExpDecay(Arrays.asList(10L, -1L, 5L)); + assertEquals(-1L, result, "addWithExpDecay should propagate unknown NDV (-1) when present"); + } + + @Test + void testAddWithExpDecayComputesWhenAllInputsKnown() { + Long result = StatsUtils.addWithExpDecay(Arrays.asList(100L, 25L)); + // Exponential decay: 100 * 25^(1/2) = 100 * 5 = 500. + assertEquals(500L, result, "addWithExpDecay should return the exponential-decay denominator for known inputs"); + } + + @ParameterizedTest(name = "{0}") + @MethodSource("computeNDVGroupingColumnsCases") + void testComputeNDVGroupingColumns(String scenarioName, List colStats, + Statistics.State parentColStatsState, boolean expDecay, long expected) { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(parentColStatsState); + + long result = StatsUtils.computeNDVGroupingColumns(colStats, parentStats, expDecay); + + assertEquals(expected, result, scenarioName); + } + + private static Stream computeNDVGroupingColumnsCases() { + return Stream.of( + Arguments.of("allKnownReturnsProduct", + Arrays.asList(makeColStat("c1", 10), makeColStat("c2", 20)), + Statistics.State.COMPLETE, false, 200L), + Arguments.of("unknownColumnReturnsMinusOne", + Arrays.asList(makeColStat("c1", 10), makeColStat("c2", -1)), + Statistics.State.COMPLETE, false, -1L), + Arguments.of("emptyColumnsReturnsOne", + Collections.emptyList(), + Statistics.State.COMPLETE, false, 1L), + Arguments.of("nullColStatWithCompleteParentSkipped", + Arrays.asList(null, makeColStat("c2", 10)), + Statistics.State.COMPLETE, false, 10L), + Arguments.of("nullColStatWithPartialParentReturnsMinusOne", + Arrays.asList((ColStatistics) null), + Statistics.State.PARTIAL, false, -1L), + Arguments.of("expDecayWithKnownInputs", + Arrays.asList(makeColStat("c1", 100), makeColStat("c2", 25)), + Statistics.State.COMPLETE, true, 500L), + Arguments.of("expDecayWithUnknownPropagates", + Arrays.asList(makeColStat("c1", 100), makeColStat("c2", -1)), + Statistics.State.COMPLETE, true, -1L) + ); + } + + private static ColStatistics makeColStat(String name, long ndv) { + ColStatistics cs = new ColStatistics(name, "string"); + cs.setCountDistint(ndv); + cs.setNumNulls(0); + return cs; + } + + @ParameterizedTest(name = "{0}") + @MethodSource("getColStatisticsUnsetNumDVsCases") + void testGetColStatisticsReturnsUnknownNDVWhenNumDVsNotSet( + String typeName, ColumnStatisticsData data) { + ColumnStatisticsObj cso = new ColumnStatisticsObj(); + cso.setColName("test_col"); + cso.setColType(typeName); + cso.setStatsData(data); + + ColStatistics cs = StatsUtils.getColStatistics(cso, "test_col"); + + assertNotNull(cs, "ColStatistics should not be null for " + typeName); + assertEquals(-1, cs.getCountDistint(), + "When numDVs is unset for " + typeName + ", NDV should be -1"); + } + + private static Stream getColStatisticsUnsetNumDVsCases() { + LongColumnStatsData longStats = new LongColumnStatsData(); + longStats.setNumNulls(10); + // numDVs NOT set + + DoubleColumnStatsData doubleStats = new DoubleColumnStatsData(); + doubleStats.setNumNulls(10); + + StringColumnStatsData stringStats = new StringColumnStatsData(); + stringStats.setNumNulls(10); + stringStats.setAvgColLen(5.0); + stringStats.setMaxColLen(20); + + BinaryColumnStatsData binaryStats = new BinaryColumnStatsData(); + binaryStats.setNumNulls(10); + binaryStats.setAvgColLen(5.0); + binaryStats.setMaxColLen(20); + + TimestampColumnStatsData timestampStats = new TimestampColumnStatsData(); + timestampStats.setNumNulls(10); + + DecimalColumnStatsData decimalStats = new DecimalColumnStatsData(); + decimalStats.setNumNulls(10); + + DateColumnStatsData dateStats = new DateColumnStatsData(); + dateStats.setNumNulls(10); + + return Stream.of( + Arguments.of(serdeConstants.BIGINT_TYPE_NAME, wrapLong(longStats)), + Arguments.of(serdeConstants.DOUBLE_TYPE_NAME, wrapDouble(doubleStats)), + Arguments.of(serdeConstants.STRING_TYPE_NAME, wrapString(stringStats)), + Arguments.of(serdeConstants.BINARY_TYPE_NAME, wrapBinary(binaryStats)), + Arguments.of(serdeConstants.TIMESTAMP_TYPE_NAME, wrapTimestamp(timestampStats)), + Arguments.of(serdeConstants.DECIMAL_TYPE_NAME, wrapDecimal(decimalStats)), + Arguments.of(serdeConstants.DATE_TYPE_NAME, wrapDate(dateStats)) + ); + } + + private static ColumnStatisticsData wrapLong(LongColumnStatsData s) { + ColumnStatisticsData d = new ColumnStatisticsData(); + d.setLongStats(s); + return d; + } + + private static ColumnStatisticsData wrapDouble(DoubleColumnStatsData s) { + ColumnStatisticsData d = new ColumnStatisticsData(); + d.setDoubleStats(s); + return d; + } + + private static ColumnStatisticsData wrapString(StringColumnStatsData s) { + ColumnStatisticsData d = new ColumnStatisticsData(); + d.setStringStats(s); + return d; + } + + private static ColumnStatisticsData wrapBinary(BinaryColumnStatsData s) { + ColumnStatisticsData d = new ColumnStatisticsData(); + d.setBinaryStats(s); + return d; + } + + private static ColumnStatisticsData wrapTimestamp(TimestampColumnStatsData s) { + ColumnStatisticsData d = new ColumnStatisticsData(); + d.setTimestampStats(s); + return d; + } + + private static ColumnStatisticsData wrapDecimal(DecimalColumnStatsData s) { + ColumnStatisticsData d = new ColumnStatisticsData(); + d.setDecimalStats(s); + return d; + } + + private static ColumnStatisticsData wrapDate(DateColumnStatsData s) { + ColumnStatisticsData d = new ColumnStatisticsData(); + d.setDateStats(s); + return d; + } + @Test void testGetColStatisticsBooleanWithUnknownNumTrues() { ColumnStatisticsObj cso = new ColumnStatisticsObj(); @@ -465,6 +667,53 @@ void testUpdateStatsPreservesUnknownNumNulls() { assertEquals(-1, updated.getNumNulls(), "Unknown numNulls (-1) should be preserved after scaling"); } + @Test + void testUpdateStatsMarksFilteredColumnEvenWhenNDVUnknown() { + // HIVE-29625: setFilterColumn() is now called unconditionally for affected columns, + // even when NDV is unknown (-1). The NDV math is skipped but the filter mark applies. + Statistics stats = new Statistics(1000, 8000, 0, 0); + ColStatistics cs = createColStats("col1", -1, 0); // unknown NDV + stats.setColumnStats(Collections.singletonList(cs)); + + StatsUtils.updateStats(stats, 500, true, null, Collections.singleton("col1")); + + ColStatistics updated = stats.getColumnStats().get(0); + assertEquals(true, updated.isFilteredColumn(), + "Filter-column flag should be set even when NDV is unknown"); + assertEquals(-1, updated.getCountDistint(), + "Unknown NDV (-1) should be preserved when affected column has no NDV"); + } + + @Test + void testUpdateStatsRecomputesNDVWhenAffectedAndKnown() { + // Regression check: when NDV is known and ratio <= 1.0, the NDV math still runs + // inside the new oldDV >= 0 guard. + Statistics stats = new Statistics(1000, 8000, 0, 0); + ColStatistics cs = createColStats("col1", 100, 0); // known NDV + stats.setColumnStats(Collections.singletonList(cs)); + + StatsUtils.updateStats(stats, 500, true, null, Collections.singleton("col1")); + + ColStatistics updated = stats.getColumnStats().get(0); + assertEquals(true, updated.isFilteredColumn(), + "Filter-column flag should be set for affected column with known NDV"); + // ratio = 500/1000 = 0.5 -> newDV = ceil(0.5 * 100) = 50 + assertEquals(50, updated.getCountDistint(), + "Known NDV should be scaled by the row-count ratio"); + } + + @Test + void testScaleColStatisticsPreservesUnknownCountDistint() { + // HIVE-29625: when factor < 1.0 and NDV is unknown (-1), the sentinel is preserved. + ColStatistics cs = createColStats("col1", -1, 0); // unknown NDV + List colStats = Collections.singletonList(cs); + + StatsUtils.scaleColStatistics(colStats, 0.5); + + assertEquals(-1, colStats.get(0).getCountDistint(), + "Unknown NDV (-1) should be preserved when factor < 1.0"); + } + @Test void testScaleColStatisticsPreservesUnknownNumNulls() { ColStatistics cs = createColStats("col1", 100, -1); // unknown numNulls diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java index 98bc589e40d3..752907f755d8 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java @@ -20,8 +20,13 @@ import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.stream.Stream; + import org.apache.hadoop.hive.ql.plan.ColStatistics; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; class TestPessimisticStatCombiner { @@ -136,6 +141,31 @@ void testCombineBothUnknownNumNulls() { assertEquals(-1, combined.getNumNulls(), "Both unknown should result in unknown (-1)"); } + @ParameterizedTest(name = "{0}") + @MethodSource("combineCountDistinctCases") + void testCombineCountDistinctMerge(String scenarioName, long stat1Ndv, long stat2Ndv, long expectedNdv) { + ColStatistics stat1 = createStat("col1", "int", stat1Ndv, 5, 4.0); + ColStatistics stat2 = createStat("col2", "int", stat2Ndv, 10, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(expectedNdv, combined.getCountDistint(), + "countDistinct after PROPAGATE combine"); + } + + private static Stream combineCountDistinctCases() { + return Stream.of( + Arguments.of("firstUnknownPropagates", -1L, 50L, -1L), + Arguments.of("secondUnknownPropagates", 50L, -1L, -1L), + Arguments.of("bothUnknownStaysUnknown", -1L, -1L, -1L), + Arguments.of("picksHigherWhenSecondHigher", 30L, 50L, 50L), + Arguments.of("keepsHigherWhenFirstHigher", 50L, 30L, 50L) + ); + } + @Test void testCombineBothUnknownNumTruesAndNumFalses() { ColStatistics stat1 = createStat("col1", "boolean", 2, 5, 1.0); diff --git a/ql/src/test/results/clientpositive/llap/parquet_types_non_dictionary_encoding_vectorization.q.out b/ql/src/test/results/clientpositive/llap/parquet_types_non_dictionary_encoding_vectorization.q.out index d4d9cb53e2b9..0b708705624b 100644 --- a/ql/src/test/results/clientpositive/llap/parquet_types_non_dictionary_encoding_vectorization.q.out +++ b/ql/src/test/results/clientpositive/llap/parquet_types_non_dictionary_encoding_vectorization.q.out @@ -2414,13 +2414,13 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 150 Data size: 1960 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: binary) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: binary) - Statistics: Num rows: 1 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 150 Data size: 1960 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: bigint) Execution mode: vectorized, llap LLAP IO: all inputs (cache only) @@ -2432,16 +2432,16 @@ STAGE PLANS: keys: KEY._col0 (type: binary) mode: mergepartial outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 75 Data size: 1000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: hex(_col0) (type: string), _col1 (type: bigint), _col0 (type: binary) outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 232 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 75 Data size: 14800 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col2 (type: binary) null sort order: z sort order: + - Statistics: Num rows: 1 Data size: 232 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 75 Data size: 14800 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col0 (type: string), _col1 (type: bigint) Reducer 3 Execution mode: vectorized, llap @@ -2449,10 +2449,10 @@ STAGE PLANS: Select Operator expressions: VALUE._col0 (type: string), VALUE._col1 (type: bigint) outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 75 Data size: 14400 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 75 Data size: 14400 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/llap/vector_binary_join_groupby.q.out b/ql/src/test/results/clientpositive/llap/vector_binary_join_groupby.q.out index 41bc14e5e354..8e8d1d548e0e 100644 --- a/ql/src/test/results/clientpositive/llap/vector_binary_join_groupby.q.out +++ b/ql/src/test/results/clientpositive/llap/vector_binary_join_groupby.q.out @@ -137,7 +137,7 @@ STAGE PLANS: TableScan alias: t1 filterExpr: bin is not null (type: boolean) - probeDecodeDetails: cacheKey:HASH_MAP_MAPJOIN_30_container, bigKeyColName:bin, smallTablePos:1, keyRatio:0.0 + probeDecodeDetails: cacheKey:HASH_MAP_MAPJOIN_30_container, bigKeyColName:bin, smallTablePos:1, keyRatio:100.0 Statistics: Num rows: 100 Data size: 34084 Basic stats: COMPLETE Column stats: COMPLETE TableScan Vectorization: native: true