[SPARK-55848][SQL] Fix incorrect dedup results with SPJ partial clustering #54679
Changes from all commits: 5de557f, a308ad6, fd1c96e, 4f9752e
@@ -94,13 +94,13 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {

     checkQueryPlan(df, catalystDistribution,
       physical.KeyGroupedPartitioning(catalystDistribution.clustering, projectedPositions,
-        partitionValues, partitionValues))
+        partitionValues, partitionValues, isPartiallyClustered = false))

     // multiple group keys should work too as long as partition keys are subset of them
     df = sql(s"SELECT count(*) FROM testcat.ns.$table GROUP BY id, ts")
     checkQueryPlan(df, catalystDistribution,
       physical.KeyGroupedPartitioning(catalystDistribution.clustering, projectedPositions,
-        partitionValues, partitionValues))
+        partitionValues, partitionValues, isPartiallyClustered = false))
   }

   test("non-clustered distribution: no partition") {
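For readers skimming the hunk above: the extra `isPartiallyClustered = false` argument suggests the physical partitioning now carries the partial-clustering state explicitly. A minimal sketch of the implied shape; everything except the new flag is inferred from the test call sites, not copied from the actual class definition.

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression

// Sketch only: the constructor shape implied by the call sites in this hunk.
// Names and types other than `isPartiallyClustered` are assumptions.
case class KeyGroupedPartitioningSketch(
    expressions: Seq[Expression],               // clustering expressions of the distribution
    projectedPositions: Seq[Int],               // positions of the projected keys (assumed type)
    partitionValues: Seq[InternalRow],          // grouped partition values
    originalPartitionValues: Seq[InternalRow],  // values before any partial-clustering regrouping
    isPartiallyClustered: Boolean = false)      // the flag this PR threads through the partitioning
```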
@@ -1130,6 +1130,131 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
    }
  }

  test("[SPARK-54378] dropDuplicates after SPJ with partial clustering should give correct " +

Contributor: Can you please start the new test names with …

      "results") {
    val items_partitions = Array(identity("id"))
    createTable(items, itemsColumns, items_partitions)
    // Insert two copies of id=1 so that the left side has duplicate rows for id=1 after the join,
    // and three distinct id values in total.
    sql(s"INSERT INTO testcat.ns.$items VALUES " +
      "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
      "(1, 'aa', 41.0, cast('2020-01-15' as timestamp)), " +
      "(2, 'bb', 10.0, cast('2020-01-01' as timestamp)), " +
      "(3, 'cc', 15.5, cast('2020-02-01' as timestamp))")

    val purchases_partitions = Array(identity("item_id"))
    createTable(purchases, purchasesColumns, purchases_partitions)
    sql(s"INSERT INTO testcat.ns.$purchases VALUES " +
      "(1, 42.0, cast('2020-01-01' as timestamp)), " +
      "(2, 11.0, cast('2020-01-01' as timestamp)), " +
      "(3, 19.5, cast('2020-02-01' as timestamp))")

    Seq(true, false).foreach { partiallyClustered =>
      withSQLConf(
        SQLConf.REQUIRE_ALL_CLUSTER_KEYS_FOR_CO_PARTITION.key -> false.toString,

Contributor: Do we need to turn …

        SQLConf.V2_BUCKETING_PUSH_PART_VALUES_ENABLED.key -> true.toString,
        SQLConf.V2_BUCKETING_PARTIALLY_CLUSTERED_DISTRIBUTION_ENABLED.key ->
          partiallyClustered.toString) {
        // dropDuplicates on the join key: must produce exactly 3 distinct id values regardless
        // of whether partial clustering is active.
        val df = sql(
          s"""
             |SELECT DISTINCT i.id
             |FROM testcat.ns.$items i
             |JOIN testcat.ns.$purchases p ON i.id = p.item_id
             |""".stripMargin)
        checkAnswer(df, Seq(Row(1), Row(2), Row(3)))

Contributor: Can you please check that the expected shuffles are present and that the scans have the expected number of partitions?

      }
    }
  }
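Following up on the review comment about plan-level checks: a sketch of what such assertions might look like inside the `withSQLConf` block above. `collectAllShuffles` is the helper used later in this PR; the expected shuffle count per branch is an assumption, not taken from the change.

```scala
// Sketch: verify the physical plan, not only the result rows.
val shuffles = collectAllShuffles(df.queryExecution.executedPlan)
if (partiallyClustered) {
  // Partial clustering can split rows with the same key across tasks, so the
  // dedup should be preceded by a shuffle that regroups the keys.
  assert(shuffles.nonEmpty, "expected a shuffle before the dedup when partially clustered")
} else {
  // Without partial clustering the SPJ output is already key-grouped, so no
  // extra exchange should be required (assumed expectation).
  assert(shuffles.isEmpty, "did not expect an extra shuffle for the dedup")
}
```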

  test("[SPARK-54378] Window dedup after SPJ with partial clustering should give correct " +

Contributor: Ditto.

      "results") {
    val items_partitions = Array(identity("id"))
    createTable(items, itemsColumns, items_partitions)
    // Two rows with id=1 so that a naive per-task row_number() without a shuffle would
    // keep both when partial clustering splits them across tasks.
    sql(s"INSERT INTO testcat.ns.$items VALUES " +
      "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
      "(1, 'aa', 41.0, cast('2020-01-15' as timestamp)), " +
      "(2, 'bb', 10.0, cast('2020-01-01' as timestamp)), " +
      "(3, 'cc', 15.5, cast('2020-02-01' as timestamp))")

    val purchases_partitions = Array(identity("item_id"))
    createTable(purchases, purchasesColumns, purchases_partitions)
    sql(s"INSERT INTO testcat.ns.$purchases VALUES " +
      "(1, 42.0, cast('2020-01-01' as timestamp)), " +
      "(2, 11.0, cast('2020-01-01' as timestamp)), " +
      "(3, 19.5, cast('2020-02-01' as timestamp))")

    Seq(true, false).foreach { partiallyClustered =>
      withSQLConf(
        SQLConf.REQUIRE_ALL_CLUSTER_KEYS_FOR_CO_PARTITION.key -> false.toString,
        SQLConf.V2_BUCKETING_PUSH_PART_VALUES_ENABLED.key -> true.toString,
        SQLConf.V2_BUCKETING_PARTIALLY_CLUSTERED_DISTRIBUTION_ENABLED.key ->
          partiallyClustered.toString) {
        // row_number() OVER (PARTITION BY id) should produce rn=1 for exactly one row per id.
        val df = sql(
          s"""
             |SELECT id, price
             |FROM (
             |  SELECT i.id, i.price,
             |    ROW_NUMBER() OVER (PARTITION BY i.id ORDER BY i.price DESC) AS rn
             |  FROM testcat.ns.$items i
             |  JOIN testcat.ns.$purchases p ON i.id = p.item_id
             |) t
             |WHERE rn = 1
             |""".stripMargin)
        // For id=1 only the row with the highest price (41.0) should survive.
        checkAnswer(df, Seq(Row(1, 41.0f), Row(2, 10.0f), Row(3, 15.5f)))
      }
    }
  }
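As a side note, the same per-id dedup intent can be written with the Dataset API; a small sketch using the tables created above. It assumes the suite's `spark` session is in scope and only checks the row count, since `dropDuplicates` does not pin which row per id survives the way the window query does.

```scala
import org.apache.spark.sql.functions.col

// Sketch: Dataset-API form of the per-id dedup exercised by the SQL above.
val joined = spark.table(s"testcat.ns.$items").alias("i")
  .join(spark.table(s"testcat.ns.$purchases").alias("p"), col("i.id") === col("p.item_id"))
  .select(col("i.id"), col("i.price"))

// Exactly one row per id must remain, regardless of partial clustering.
assert(joined.dropDuplicates("id").count() == 3)
```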

  test("SPARK-55848: dropDuplicates after SPJ with partial clustering should produce " +

Contributor: Is this test different to the first one?

      "correct results") {
    val items_partitions = Array(identity("id"))
    createTable(items, itemsColumns, items_partitions)
    // Two rows for id=1 so partial clustering may split them across tasks
    sql(
      s"INSERT INTO testcat.ns.$items VALUES " +
        "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
        "(1, 'aa', 41.0, cast('2020-01-15' as timestamp)), " +
        "(2, 'bb', 10.0, cast('2020-01-01' as timestamp)), " +
        "(3, 'cc', 15.5, cast('2020-02-01' as timestamp))")

    val purchases_partitions = Array(identity("item_id"))
    createTable(purchases, purchasesColumns, purchases_partitions)
    sql(
      s"INSERT INTO testcat.ns.$purchases VALUES " +
        "(1, 42.0, cast('2020-01-01' as timestamp)), " +
        "(1, 50.0, cast('2020-01-02' as timestamp)), " +
        "(2, 11.0, cast('2020-01-01' as timestamp)), " +
        "(3, 19.5, cast('2020-02-01' as timestamp))")

    withSQLConf(
      SQLConf.REQUIRE_ALL_CLUSTER_KEYS_FOR_CO_PARTITION.key -> false.toString,
      SQLConf.V2_BUCKETING_PUSH_PART_VALUES_ENABLED.key -> true.toString,
      SQLConf.V2_BUCKETING_PARTIALLY_CLUSTERED_DISTRIBUTION_ENABLED.key -> true.toString) {
      // dropDuplicates on the join key after a partially-clustered SPJ must still
      // produce the correct number of distinct ids. Before SPARK-55848, the
      // isPartiallyClustered flag was missing, so EnsureRequirements did not insert
      // an Exchange before the dedup, leading to duplicate rows.
      val df = sql(s"""
        |SELECT DISTINCT i.id
        |FROM testcat.ns.$items i
        |JOIN testcat.ns.$purchases p ON i.id = p.item_id
        |""".stripMargin)
      checkAnswer(df, Seq(Row(1), Row(2), Row(3)))

      // Also verify the plan inserts a shuffle for the dedup when partial clustering is active.
      val allShuffles = collectAllShuffles(df.queryExecution.executedPlan)
      assert(
        allShuffles.nonEmpty,
        "should contain a shuffle for the post-join dedup with partial clustering")
    }
  }
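The comment in the test above summarizes the fix: a partially clustered output must not claim to satisfy the dedup's clustering requirement. A schematic of that idea (not the actual Spark code, just an illustration of why the flag forces an EnsureRequirements-style rule to add an exchange):

```scala
// Schematic only. With partial clustering, rows for one join key may be spread
// over several tasks, so a "clustered by key" requirement is no longer met.
def satisfiesClusteredByKey(keysMatch: Boolean, isPartiallyClustered: Boolean): Boolean =
  keysMatch && !isPartiallyClustered

// When this is false, the planner inserts the shuffle the test above asserts on.
assert(!satisfiesClusteredByKey(keysMatch = true, isPartiallyClustered = true))
assert(satisfiesClusteredByKey(keysMatch = true, isPartiallyClustered = false))
```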

  test("data source partitioning + dynamic partition filtering") {
    withSQLConf(
      SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1",

Is SPARK-54378 related to this issue?