From 217c58eec153f0f6c93a1e18b154b1f513905806 Mon Sep 17 00:00:00 2001 From: Shubham Sharma Date: Sat, 4 Apr 2026 10:45:43 -0400 Subject: [PATCH 1/3] HIVE-29516: Fix NPE in StatsUtils.updateStats when column statistics unavailable Check column stats availability before passing useColStats=true in TezCompiler.removeSemijoinOptimizationByBenefit() to avoid NPE when column statistics are not present. --- .../hadoop/hive/ql/parse/TezCompiler.java | 3 +- .../semijoin_stats_missing_colstats.q | 19 +++ .../semijoin_stats_missing_colstats.q.out | 160 ++++++++++++++++++ 3 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q create mode 100644 ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java index 2a947c5e0eed..ede30bfb946d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java @@ -1977,8 +1977,9 @@ private void removeSemijoinOptimizationByBenefit(OptimizeTezProcContext procCtx) LOG.debug("Old stats for {}: {}", roi.filterOperator, roi.filterStats); LOG.debug("Number of rows reduction: {}/{}", newNumRows, roi.filterStats.getNumRows()); } + boolean useColStats = roi.filterStats.getColumnStats() != null; StatsUtils.updateStats(roi.filterStats, newNumRows, - true, roi.filterOperator, roi.colNames); + useColStats, roi.filterOperator, roi.colNames); if (LOG.isDebugEnabled()) { LOG.debug("New stats for {}: {}", roi.filterOperator, roi.filterStats); } diff --git a/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q b/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q new file mode 100644 index 000000000000..dc4984960ec0 --- /dev/null +++ b/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q @@ -0,0 +1,19 @@ +-- HIVE-29516: Verify that query compilation succeeds when column statistics +-- are missing during semijoin optimization in removeSemijoinOptimizationByBenefit. + +set hive.tez.dynamic.partition.pruning=true; +set hive.tez.dynamic.semijoin.reduction=true; +set hive.tez.bigtable.minsize.semijoin.reduction=1; +set hive.tez.min.bloom.filter.entries=1; +set hive.tez.bloom.filter.factor=1.0f; +set hive.auto.convert.join=false; + +create table t1_nocolstats (id int, val string); +create table t2_nocolstats (id int, val string); + +alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000'); +alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000'); + +explain +select t1.id, t1.val, t2.val +from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id; diff --git a/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out b/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out new file mode 100644 index 000000000000..4e8b0385fe19 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out @@ -0,0 +1,160 @@ +PREHOOK: query: create table t1_nocolstats (id int, val string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t1_nocolstats +POSTHOOK: query: create table t1_nocolstats (id int, val string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t1_nocolstats +PREHOOK: query: create table t2_nocolstats (id int, val string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t2_nocolstats +POSTHOOK: query: create table t2_nocolstats (id int, val string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t2_nocolstats +PREHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t1_nocolstats +PREHOOK: Output: default@t1_nocolstats +POSTHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t1_nocolstats +POSTHOOK: Output: default@t1_nocolstats +PREHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t2_nocolstats +PREHOOK: Output: default@t2_nocolstats +POSTHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t2_nocolstats +POSTHOOK: Output: default@t2_nocolstats +PREHOOK: query: explain +select t1.id, t1.val, t2.val +from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_nocolstats +PREHOOK: Input: default@t2_nocolstats +#### A masked pattern was here #### +POSTHOOK: query: explain +select t1.id, t1.val, t2.val +from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_nocolstats +POSTHOOK: Input: default@t2_nocolstats +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Reducer 4 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) + Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t1 + filterExpr: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean) + Statistics: Num rows: 100000000 Data size: 17860000188 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean) + Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: int), val (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 3 + Map Operator Tree: + TableScan + alias: t2 + filterExpr: id is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 178788 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: id is not null (type: boolean) + Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: int), val (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=950) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, 1, expectedEntries=950) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink From 5439f62f659547c46104de0a9007a0ce64e24f4c Mon Sep 17 00:00:00 2001 From: Shubham Sharma Date: Sun, 5 Apr 2026 13:25:40 -0400 Subject: [PATCH 2/3] HIVE-29516 Fix whitespace related to test failure --- .../llap/semijoin_stats_missing_colstats.q.out | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out b/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out index 4e8b0385fe19..8a82a858f96d 100644 --- a/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out +++ b/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out @@ -58,7 +58,7 @@ STAGE PLANS: Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: - Map 1 + Map 1 Map Operator Tree: TableScan alias: t1 @@ -80,7 +80,7 @@ STAGE PLANS: value expressions: _col1 (type: string) Execution mode: vectorized, llap LLAP IO: all inputs - Map 3 + Map 3 Map Operator Tree: TableScan alias: t2 @@ -111,13 +111,13 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - null sort order: - sort order: + null sort order: + sort order: Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) Execution mode: vectorized, llap LLAP IO: all inputs - Reducer 2 + Reducer 2 Execution mode: llap Reduce Operator Tree: Merge Join Operator @@ -139,7 +139,7 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - Reducer 4 + Reducer 4 Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator @@ -148,8 +148,8 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - null sort order: - sort order: + null sort order: + sort order: Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) @@ -158,3 +158,4 @@ STAGE PLANS: limit: -1 Processor Tree: ListSink + From 7328e0f5a0a5cc1157433ce5ca23956904ae5270 Mon Sep 17 00:00:00 2001 From: Stamatis Zampetakis Date: Fri, 10 Apr 2026 18:00:57 +0200 Subject: [PATCH 3/3] HIVE-29516: Modify test to hit NPE and minimize it In order to trigger the NPE there are two important requirements: * `hive.stats.fetch.column.stats` must be set to false * the big table must be partitioned All configuration properties that are not necessary or using the default values are dropped. --- .../semijoin_removal_missing_colstats.q | 11 ++ .../semijoin_stats_missing_colstats.q | 19 --- ...> semijoin_removal_missing_colstats.q.out} | 120 +++++++++--------- 3 files changed, 74 insertions(+), 76 deletions(-) create mode 100644 ql/src/test/queries/clientpositive/semijoin_removal_missing_colstats.q delete mode 100644 ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q rename ql/src/test/results/clientpositive/llap/{semijoin_stats_missing_colstats.q.out => semijoin_removal_missing_colstats.q.out} (50%) diff --git a/ql/src/test/queries/clientpositive/semijoin_removal_missing_colstats.q b/ql/src/test/queries/clientpositive/semijoin_removal_missing_colstats.q new file mode 100644 index 000000000000..d867451d4e6f --- /dev/null +++ b/ql/src/test/queries/clientpositive/semijoin_removal_missing_colstats.q @@ -0,0 +1,11 @@ +-- HIVE-29516: NPE in StatsUtils.updateStats when removing semijoin by benefit and column statistics are missing +set hive.stats.fetch.column.stats=false; + +create table big (id int, val string) partitioned by (bday int); +alter table big add partition (bday=20260410); +alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000'); + +create table small (id int, val string); +alter table small update statistics set ('numRows' = '1000'); + +explain select big.val, small.val from big join small on big.id = small.id; diff --git a/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q b/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q deleted file mode 100644 index dc4984960ec0..000000000000 --- a/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q +++ /dev/null @@ -1,19 +0,0 @@ --- HIVE-29516: Verify that query compilation succeeds when column statistics --- are missing during semijoin optimization in removeSemijoinOptimizationByBenefit. - -set hive.tez.dynamic.partition.pruning=true; -set hive.tez.dynamic.semijoin.reduction=true; -set hive.tez.bigtable.minsize.semijoin.reduction=1; -set hive.tez.min.bloom.filter.entries=1; -set hive.tez.bloom.filter.factor=1.0f; -set hive.auto.convert.join=false; - -create table t1_nocolstats (id int, val string); -create table t2_nocolstats (id int, val string); - -alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000'); -alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000'); - -explain -select t1.id, t1.val, t2.val -from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id; diff --git a/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out b/ql/src/test/results/clientpositive/llap/semijoin_removal_missing_colstats.q.out similarity index 50% rename from ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out rename to ql/src/test/results/clientpositive/llap/semijoin_removal_missing_colstats.q.out index 8a82a858f96d..dc6e3bc3faf2 100644 --- a/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out +++ b/ql/src/test/results/clientpositive/llap/semijoin_removal_missing_colstats.q.out @@ -1,48 +1,54 @@ -PREHOOK: query: create table t1_nocolstats (id int, val string) +PREHOOK: query: create table big (id int, val string) partitioned by (bday int) PREHOOK: type: CREATETABLE PREHOOK: Output: database:default -PREHOOK: Output: default@t1_nocolstats -POSTHOOK: query: create table t1_nocolstats (id int, val string) +PREHOOK: Output: default@big +POSTHOOK: query: create table big (id int, val string) partitioned by (bday int) POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default -POSTHOOK: Output: default@t1_nocolstats -PREHOOK: query: create table t2_nocolstats (id int, val string) +POSTHOOK: Output: default@big +PREHOOK: query: alter table big add partition (bday=20260410) +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@big +POSTHOOK: query: alter table big add partition (bday=20260410) +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@big +POSTHOOK: Output: default@big@bday=20260410 +PREHOOK: query: alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000') +PREHOOK: type: ALTERTABLE_UPDATEPARTSTATS +PREHOOK: Input: default@big +PREHOOK: Output: default@big@bday=20260410 +POSTHOOK: query: alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000') +POSTHOOK: type: ALTERTABLE_UPDATEPARTSTATS +POSTHOOK: Input: default@big +POSTHOOK: Input: default@big@bday=20260410 +POSTHOOK: Output: default@big@bday=20260410 +PREHOOK: query: create table small (id int, val string) PREHOOK: type: CREATETABLE PREHOOK: Output: database:default -PREHOOK: Output: default@t2_nocolstats -POSTHOOK: query: create table t2_nocolstats (id int, val string) +PREHOOK: Output: default@small +POSTHOOK: query: create table small (id int, val string) POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default -POSTHOOK: Output: default@t2_nocolstats -PREHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000') +POSTHOOK: Output: default@small +PREHOOK: query: alter table small update statistics set ('numRows' = '1000') PREHOOK: type: ALTERTABLE_UPDATETABLESTATS -PREHOOK: Input: default@t1_nocolstats -PREHOOK: Output: default@t1_nocolstats -POSTHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000') +PREHOOK: Input: default@small +PREHOOK: Output: default@small +POSTHOOK: query: alter table small update statistics set ('numRows' = '1000') POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS -POSTHOOK: Input: default@t1_nocolstats -POSTHOOK: Output: default@t1_nocolstats -PREHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000') -PREHOOK: type: ALTERTABLE_UPDATETABLESTATS -PREHOOK: Input: default@t2_nocolstats -PREHOOK: Output: default@t2_nocolstats -POSTHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000') -POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS -POSTHOOK: Input: default@t2_nocolstats -POSTHOOK: Output: default@t2_nocolstats -PREHOOK: query: explain -select t1.id, t1.val, t2.val -from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id +POSTHOOK: Input: default@small +POSTHOOK: Output: default@small +PREHOOK: query: explain select big.val, small.val from big join small on big.id = small.id PREHOOK: type: QUERY -PREHOOK: Input: default@t1_nocolstats -PREHOOK: Input: default@t2_nocolstats +PREHOOK: Input: default@big +PREHOOK: Input: default@big@bday=20260410 +PREHOOK: Input: default@small #### A masked pattern was here #### -POSTHOOK: query: explain -select t1.id, t1.val, t2.val -from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id +POSTHOOK: query: explain select big.val, small.val from big join small on big.id = small.id POSTHOOK: type: QUERY -POSTHOOK: Input: default@t1_nocolstats -POSTHOOK: Input: default@t2_nocolstats +POSTHOOK: Input: default@big +POSTHOOK: Input: default@big@bday=20260410 +POSTHOOK: Input: default@small #### A masked pattern was here #### STAGE DEPENDENCIES: Stage-1 is a root stage @@ -61,59 +67,59 @@ STAGE PLANS: Map 1 Map Operator Tree: TableScan - alias: t1 - filterExpr: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean) - Statistics: Num rows: 100000000 Data size: 17860000188 Basic stats: COMPLETE Column stats: NONE + alias: big + filterExpr: (id is not null and id BETWEEN DynamicValue(RS_7_small_id_min) AND DynamicValue(RS_7_small_id_max) and in_bloom_filter(id, DynamicValue(RS_7_small_id_bloom_filter))) (type: boolean) + Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean) - Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE + predicate: (id is not null and id BETWEEN DynamicValue(RS_7_small_id_min) AND DynamicValue(RS_7_small_id_max) and in_bloom_filter(id, DynamicValue(RS_7_small_id_bloom_filter))) (type: boolean) + Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: id (type: int), val (type: string) outputColumnNames: _col0, _col1 - Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: int) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE value expressions: _col1 (type: string) Execution mode: vectorized, llap LLAP IO: all inputs Map 3 Map Operator Tree: TableScan - alias: t2 + alias: small filterExpr: id is not null (type: boolean) - Statistics: Num rows: 1000 Data size: 178788 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: NONE Filter Operator predicate: id is not null (type: boolean) - Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE Select Operator expressions: id (type: int), val (type: string) outputColumnNames: _col0, _col1 - Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE Reduce Output Operator key expressions: _col0 (type: int) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE value expressions: _col1 (type: string) Select Operator expressions: _col0 (type: int) outputColumnNames: _col0 - Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE Group By Operator - aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=950) + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000000) minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE Reduce Output Operator null sort order: sort order: - Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) Execution mode: vectorized, llap LLAP IO: all inputs @@ -126,15 +132,15 @@ STAGE PLANS: keys: 0 _col0 (type: int) 1 _col0 (type: int) - outputColumnNames: _col0, _col1, _col3 - Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE + outputColumnNames: _col1, _col3 + Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE Select Operator - expressions: _col0 (type: int), _col1 (type: string), _col3 (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE + expressions: _col1 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -143,14 +149,14 @@ STAGE PLANS: Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator - aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, 1, expectedEntries=950) + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, 1, expectedEntries=1000000) mode: final outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE Reduce Output Operator null sort order: sort order: - Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) Stage: Stage-0