From 217c58eec153f0f6c93a1e18b154b1f513905806 Mon Sep 17 00:00:00 2001
From: Shubham Sharma <shubh.luck@yahoo.in>
Date: Sat, 4 Apr 2026 10:45:43 -0400
Subject: [PATCH 1/3] HIVE-29516: Fix NPE in StatsUtils.updateStats when column
 statistics unavailable

TezCompiler.removeSemijoinOptimizationByBenefit() now checks whether
column statistics are available and passes useColStats=true to
StatsUtils.updateStats() only when they are, avoiding the NPE.
---
 .../hadoop/hive/ql/parse/TezCompiler.java     |   3 +-
 .../semijoin_stats_missing_colstats.q         |  19 +++
 .../semijoin_stats_missing_colstats.q.out     | 160 ++++++++++++++++++
 3 files changed, 181 insertions(+), 1 deletion(-)
 create mode 100644 ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q
 create mode 100644 ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
index 2a947c5e0eed..ede30bfb946d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
@@ -1977,8 +1977,9 @@ private void removeSemijoinOptimizationByBenefit(OptimizeTezProcContext procCtx)
           LOG.debug("Old stats for {}: {}", roi.filterOperator, roi.filterStats);
           LOG.debug("Number of rows reduction: {}/{}", newNumRows, roi.filterStats.getNumRows());
         }
+        boolean useColStats = roi.filterStats.getColumnStats() != null;
         StatsUtils.updateStats(roi.filterStats, newNumRows,
-            true, roi.filterOperator, roi.colNames);
+            useColStats, roi.filterOperator, roi.colNames);
         if (LOG.isDebugEnabled()) {
           LOG.debug("New stats for {}: {}", roi.filterOperator, roi.filterStats);
         }
diff --git a/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q b/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q
new file mode 100644
index 000000000000..dc4984960ec0
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q
@@ -0,0 +1,19 @@
+-- HIVE-29516: Verify that query compilation succeeds when column statistics
+-- are missing during semijoin optimization in removeSemijoinOptimizationByBenefit.
+
+set hive.tez.dynamic.partition.pruning=true;
+set hive.tez.dynamic.semijoin.reduction=true;
+set hive.tez.bigtable.minsize.semijoin.reduction=1;
+set hive.tez.min.bloom.filter.entries=1;
+set hive.tez.bloom.filter.factor=1.0f;
+set hive.auto.convert.join=false;
+
+create table t1_nocolstats (id int, val string);
+create table t2_nocolstats (id int, val string);
+
+alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000');
+alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000');
+
+explain
+select t1.id, t1.val, t2.val
+from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id;
diff --git a/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out b/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out
new file mode 100644
index 000000000000..4e8b0385fe19
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out
@@ -0,0 +1,160 @@
+PREHOOK: query: create table t1_nocolstats (id int, val string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1_nocolstats
+POSTHOOK: query: create table t1_nocolstats (id int, val string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1_nocolstats
+PREHOOK: query: create table t2_nocolstats (id int, val string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t2_nocolstats
+POSTHOOK: query: create table t2_nocolstats (id int, val string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t2_nocolstats
+PREHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t1_nocolstats
+PREHOOK: Output: default@t1_nocolstats
+POSTHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1_nocolstats
+POSTHOOK: Output: default@t1_nocolstats
+PREHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t2_nocolstats
+PREHOOK: Output: default@t2_nocolstats
+POSTHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t2_nocolstats
+POSTHOOK: Output: default@t2_nocolstats
+PREHOOK: query: explain
+select t1.id, t1.val, t2.val
+from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1_nocolstats
+PREHOOK: Input: default@t2_nocolstats
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select t1.id, t1.val, t2.val
+from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1_nocolstats
+POSTHOOK: Input: default@t2_nocolstats
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 1 <- Reducer 4 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
+        Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: t1
+                  filterExpr: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean)
+                  Statistics: Num rows: 100000000 Data size: 17860000188 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean)
+                    Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: id (type: int), val (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int)
+                        null sort order: z
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: int)
+                        Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col1 (type: string)
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Map 3
+            Map Operator Tree:
+                TableScan
+                  alias: t2
+                  filterExpr: id is not null (type: boolean)
+                  Statistics: Num rows: 1000 Data size: 178788 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: id is not null (type: boolean)
+                    Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: id (type: int), val (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int)
+                        null sort order: z
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: int)
+                        Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col1 (type: string)
+                      Select Operator
+                        expressions: _col0 (type: int)
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
+                        Group By Operator
+                          aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=950)
+                          minReductionHashAggr: 0.99
+                          mode: hash
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
+                          Reduce Output Operator
+                            null sort order:
+                            sort order:
+                            Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
+                            value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Reducer 2
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 _col0 (type: int)
+                  1 _col0 (type: int)
+                outputColumnNames: _col0, _col1, _col3
+                Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: _col0 (type: int), _col1 (type: string), _col3 (type: string)
+                  outputColumnNames: _col0, _col1, _col2
+                  Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 4
+            Execution mode: vectorized, llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, 1, expectedEntries=950)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  null sort order:
+                  sort order:
+                  Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink

From 5439f62f659547c46104de0a9007a0ce64e24f4c Mon Sep 17 00:00:00 2001
From: Shubham Sharma <shubh.luck@yahoo.in>
Date: Sun, 5 Apr 2026 13:25:40 -0400
Subject: [PATCH 2/3] HIVE-29516: Fix test failure caused by whitespace mismatch in q.out

---
 .../llap/semijoin_stats_missing_colstats.q.out  | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out b/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out
index 4e8b0385fe19..8a82a858f96d 100644
--- a/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out
+++ b/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out
@@ -58,7 +58,7 @@ STAGE PLANS:
         Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
       Vertices:
-        Map 1
+        Map 1 
             Map Operator Tree:
                 TableScan
                   alias: t1
@@ -80,7 +80,7 @@ STAGE PLANS:
                         value expressions: _col1 (type: string)
             Execution mode: vectorized, llap
             LLAP IO: all inputs
-        Map 3
+        Map 3 
             Map Operator Tree:
                 TableScan
                   alias: t2
@@ -111,13 +111,13 @@ STAGE PLANS:
                           outputColumnNames: _col0, _col1, _col2
                           Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
                           Reduce Output Operator
-                            null sort order:
-                            sort order:
+                            null sort order: 
+                            sort order: 
                             Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
                             value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
             Execution mode: vectorized, llap
             LLAP IO: all inputs
-        Reducer 2
+        Reducer 2 
             Execution mode: llap
             Reduce Operator Tree:
               Merge Join Operator
@@ -139,7 +139,7 @@ STAGE PLANS:
                         input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                         output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                         serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-        Reducer 4
+        Reducer 4 
             Execution mode: vectorized, llap
             Reduce Operator Tree:
               Group By Operator
@@ -148,8 +148,8 @@ STAGE PLANS:
                 outputColumnNames: _col0, _col1, _col2
                 Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
                 Reduce Output Operator
-                  null sort order:
-                  sort order:
+                  null sort order: 
+                  sort order: 
                   Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
                   value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
 
@@ -158,3 +158,4 @@ STAGE PLANS:
       limit: -1
       Processor Tree:
         ListSink
+

From 7328e0f5a0a5cc1157433ce5ca23956904ae5270 Mon Sep 17 00:00:00 2001
From: Stamatis Zampetakis <zabetak@gmail.com>
Date: Fri, 10 Apr 2026 18:00:57 +0200
Subject: [PATCH 3/3] HIVE-29516: Modify test to hit NPE and minimize it

In order to trigger the NPE there are two important requirements:
* `hive.stats.fetch.column.stats` must be set to false
* the big table must be partitioned

All configuration properties that are unnecessary or already use their default values are dropped.
---
 .../semijoin_removal_missing_colstats.q       |  11 ++
 .../semijoin_stats_missing_colstats.q         |  19 ---
 ...> semijoin_removal_missing_colstats.q.out} | 120 +++++++++---------
 3 files changed, 74 insertions(+), 76 deletions(-)
 create mode 100644 ql/src/test/queries/clientpositive/semijoin_removal_missing_colstats.q
 delete mode 100644 ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q
 rename ql/src/test/results/clientpositive/llap/{semijoin_stats_missing_colstats.q.out => semijoin_removal_missing_colstats.q.out} (50%)

diff --git a/ql/src/test/queries/clientpositive/semijoin_removal_missing_colstats.q b/ql/src/test/queries/clientpositive/semijoin_removal_missing_colstats.q
new file mode 100644
index 000000000000..d867451d4e6f
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/semijoin_removal_missing_colstats.q
@@ -0,0 +1,11 @@
+-- HIVE-29516: NPE in StatsUtils.updateStats when removing semijoin by benefit and column statistics are missing
+set hive.stats.fetch.column.stats=false;
+
+create table big (id int, val string) partitioned by (bday int);
+alter table big add partition (bday=20260410);
+alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000');
+
+create table small (id int, val string);
+alter table small update statistics set ('numRows' = '1000');
+
+explain select big.val, small.val from big join small on big.id = small.id;
diff --git a/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q b/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q
deleted file mode 100644
index dc4984960ec0..000000000000
--- a/ql/src/test/queries/clientpositive/semijoin_stats_missing_colstats.q
+++ /dev/null
@@ -1,19 +0,0 @@
--- HIVE-29516: Verify that query compilation succeeds when column statistics
--- are missing during semijoin optimization in removeSemijoinOptimizationByBenefit.
-
-set hive.tez.dynamic.partition.pruning=true;
-set hive.tez.dynamic.semijoin.reduction=true;
-set hive.tez.bigtable.minsize.semijoin.reduction=1;
-set hive.tez.min.bloom.filter.entries=1;
-set hive.tez.bloom.filter.factor=1.0f;
-set hive.auto.convert.join=false;
-
-create table t1_nocolstats (id int, val string);
-create table t2_nocolstats (id int, val string);
-
-alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000');
-alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000');
-
-explain
-select t1.id, t1.val, t2.val
-from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id;
diff --git a/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out b/ql/src/test/results/clientpositive/llap/semijoin_removal_missing_colstats.q.out
similarity index 50%
rename from ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out
rename to ql/src/test/results/clientpositive/llap/semijoin_removal_missing_colstats.q.out
index 8a82a858f96d..dc6e3bc3faf2 100644
--- a/ql/src/test/results/clientpositive/llap/semijoin_stats_missing_colstats.q.out
+++ b/ql/src/test/results/clientpositive/llap/semijoin_removal_missing_colstats.q.out
@@ -1,48 +1,54 @@
-PREHOOK: query: create table t1_nocolstats (id int, val string)
+PREHOOK: query: create table big (id int, val string) partitioned by (bday int)
 PREHOOK: type: CREATETABLE
 PREHOOK: Output: database:default
-PREHOOK: Output: default@t1_nocolstats
-POSTHOOK: query: create table t1_nocolstats (id int, val string)
+PREHOOK: Output: default@big
+POSTHOOK: query: create table big (id int, val string) partitioned by (bday int)
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
-POSTHOOK: Output: default@t1_nocolstats
-PREHOOK: query: create table t2_nocolstats (id int, val string)
+POSTHOOK: Output: default@big
+PREHOOK: query: alter table big add partition (bday=20260410)
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@big
+POSTHOOK: query: alter table big add partition (bday=20260410)
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@big
+POSTHOOK: Output: default@big@bday=20260410
+PREHOOK: query: alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000')
+PREHOOK: type: ALTERTABLE_UPDATEPARTSTATS
+PREHOOK: Input: default@big
+PREHOOK: Output: default@big@bday=20260410
+POSTHOOK: query: alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000')
+POSTHOOK: type: ALTERTABLE_UPDATEPARTSTATS
+POSTHOOK: Input: default@big
+POSTHOOK: Input: default@big@bday=20260410
+POSTHOOK: Output: default@big@bday=20260410
+PREHOOK: query: create table small (id int, val string)
 PREHOOK: type: CREATETABLE
 PREHOOK: Output: database:default
-PREHOOK: Output: default@t2_nocolstats
-POSTHOOK: query: create table t2_nocolstats (id int, val string)
+PREHOOK: Output: default@small
+POSTHOOK: query: create table small (id int, val string)
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
-POSTHOOK: Output: default@t2_nocolstats
-PREHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000')
+POSTHOOK: Output: default@small
+PREHOOK: query: alter table small update statistics set ('numRows' = '1000')
 PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
-PREHOOK: Input: default@t1_nocolstats
-PREHOOK: Output: default@t1_nocolstats
-POSTHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000')
+PREHOOK: Input: default@small
+PREHOOK: Output: default@small
+POSTHOOK: query: alter table small update statistics set ('numRows' = '1000')
 POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
-POSTHOOK: Input: default@t1_nocolstats
-POSTHOOK: Output: default@t1_nocolstats
-PREHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000')
-PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
-PREHOOK: Input: default@t2_nocolstats
-PREHOOK: Output: default@t2_nocolstats
-POSTHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000')
-POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
-POSTHOOK: Input: default@t2_nocolstats
-POSTHOOK: Output: default@t2_nocolstats
-PREHOOK: query: explain
-select t1.id, t1.val, t2.val
-from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id
+POSTHOOK: Input: default@small
+POSTHOOK: Output: default@small
+PREHOOK: query: explain select big.val, small.val from big join small on big.id = small.id
 PREHOOK: type: QUERY
-PREHOOK: Input: default@t1_nocolstats
-PREHOOK: Input: default@t2_nocolstats
+PREHOOK: Input: default@big
+PREHOOK: Input: default@big@bday=20260410
+PREHOOK: Input: default@small
 #### A masked pattern was here ####
-POSTHOOK: query: explain
-select t1.id, t1.val, t2.val
-from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id
+POSTHOOK: query: explain select big.val, small.val from big join small on big.id = small.id
 POSTHOOK: type: QUERY
-POSTHOOK: Input: default@t1_nocolstats
-POSTHOOK: Input: default@t2_nocolstats
+POSTHOOK: Input: default@big
+POSTHOOK: Input: default@big@bday=20260410
+POSTHOOK: Input: default@small
 #### A masked pattern was here ####
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
@@ -61,59 +67,59 @@ STAGE PLANS:
         Map 1 
             Map Operator Tree:
                 TableScan
-                  alias: t1
-                  filterExpr: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean)
-                  Statistics: Num rows: 100000000 Data size: 17860000188 Basic stats: COMPLETE Column stats: NONE
+                  alias: big
+                  filterExpr: (id is not null and id BETWEEN DynamicValue(RS_7_small_id_min) AND DynamicValue(RS_7_small_id_max) and in_bloom_filter(id, DynamicValue(RS_7_small_id_bloom_filter))) (type: boolean)
+                  Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE
                   Filter Operator
-                    predicate: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean)
-                    Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE
+                    predicate: (id is not null and id BETWEEN DynamicValue(RS_7_small_id_min) AND DynamicValue(RS_7_small_id_max) and in_bloom_filter(id, DynamicValue(RS_7_small_id_bloom_filter))) (type: boolean)
+                    Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE
                     Select Operator
                       expressions: id (type: int), val (type: string)
                       outputColumnNames: _col0, _col1
-                      Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE
                       Reduce Output Operator
                         key expressions: _col0 (type: int)
                         null sort order: z
                         sort order: +
                         Map-reduce partition columns: _col0 (type: int)
-                        Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE
                         value expressions: _col1 (type: string)
             Execution mode: vectorized, llap
             LLAP IO: all inputs
         Map 3 
             Map Operator Tree:
                 TableScan
-                  alias: t2
+                  alias: small
                   filterExpr: id is not null (type: boolean)
-                  Statistics: Num rows: 1000 Data size: 178788 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: NONE
                   Filter Operator
                     predicate: id is not null (type: boolean)
-                    Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
                     Select Operator
                       expressions: id (type: int), val (type: string)
                       outputColumnNames: _col0, _col1
-                      Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
                       Reduce Output Operator
                         key expressions: _col0 (type: int)
                         null sort order: z
                         sort order: +
                         Map-reduce partition columns: _col0 (type: int)
-                        Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
                         value expressions: _col1 (type: string)
                       Select Operator
                         expressions: _col0 (type: int)
                         outputColumnNames: _col0
-                        Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
                         Group By Operator
-                          aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=950)
+                          aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000000)
                           minReductionHashAggr: 0.99
                           mode: hash
                           outputColumnNames: _col0, _col1, _col2
-                          Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
+                          Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE
                           Reduce Output Operator
                             null sort order: 
                             sort order: 
-                            Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
+                            Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE
                             value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
             Execution mode: vectorized, llap
             LLAP IO: all inputs
@@ -126,15 +132,15 @@ STAGE PLANS:
                 keys:
                   0 _col0 (type: int)
                   1 _col0 (type: int)
-                outputColumnNames: _col0, _col1, _col3
-                Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE
+                outputColumnNames: _col1, _col3
+                Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE
                 Select Operator
-                  expressions: _col0 (type: int), _col1 (type: string), _col3 (type: string)
-                  outputColumnNames: _col0, _col1, _col2
-                  Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE
+                  expressions: _col1 (type: string), _col3 (type: string)
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE
                   File Output Operator
                     compressed: false
-                    Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE
                     table:
                         input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                         output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -143,14 +149,14 @@ STAGE PLANS:
             Execution mode: vectorized, llap
             Reduce Operator Tree:
               Group By Operator
-                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, 1, expectedEntries=950)
+                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, 1, expectedEntries=1000000)
                 mode: final
                 outputColumnNames: _col0, _col1, _col2
-                Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE
                 Reduce Output Operator
                   null sort order: 
                   sort order: 
-                  Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE
                   value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
 
   Stage: Stage-0