From 12578e3e87858e522180671f82b975a65ddb0543 Mon Sep 17 00:00:00 2001
From: Alexander Gololobov <davenger@clickhouse.com>
Date: Tue, 9 Jun 2026 15:25:51 +0000
Subject: [PATCH 1/2] Merge pull request #106020 from
 ClickHouse/multistage_distributed_queries

Multi-stage distributed queries
---
 programs/server/Server.cpp                    |   79 +
 programs/server/config.xml                    |   19 +
 src/Analyzer/TableExpressionModifiers.cpp     |   16 +
 src/Analyzer/TableExpressionModifiers.h       |    6 +
 src/CMakeLists.txt                            |    2 +
 .../BuzzHouse/Generator/SessionSettings.cpp   |    2 +-
 src/Common/CurrentMetrics.cpp                 |    3 +
 src/Common/setThreadName.h                    |    1 +
 src/Core/ProtocolDefines.h                    |    4 +-
 src/Core/Settings.cpp                         |    2 +-
 .../QueryPlan/BroadcastExchangeStep.cpp       |   26 +
 .../QueryPlan/BroadcastExchangeStep.h         |   46 +
 .../QueryPlan/BroadcastReceiveStep.cpp        |   65 +
 .../QueryPlan/BroadcastReceiveStep.h          |   35 +
 .../QueryPlan/BroadcastSendStep.cpp           |   79 +
 src/Processors/QueryPlan/BroadcastSendStep.h  |   40 +
 .../QueryPlan/BuildQueryPipelineSettings.h    |    7 +
 src/Processors/QueryPlan/ExchangeLookup.h     |   54 +
 .../QueryPlan/GatherExchangeStep.cpp          |   17 +
 src/Processors/QueryPlan/GatherExchangeStep.h |   49 +
 .../QueryPlan/GatherReceiveStep.cpp           |   88 +
 src/Processors/QueryPlan/GatherReceiveStep.h  |   37 +
 src/Processors/QueryPlan/GatherSendStep.cpp   |   60 +
 src/Processors/QueryPlan/GatherSendStep.h     |   36 +
 src/Processors/QueryPlan/IParameterLookup.h   |   19 +
 src/Processors/QueryPlan/JoinStepLogical.h    |    1 +
 .../QueryPlan/LogicalExchangeStep.h           |   47 +
 src/Processors/QueryPlan/ObjectFilterStep.h   |    1 +
 .../QueryPlan/Optimizations/Optimizations.h   |    7 +
 .../QueryPlanOptimizationSettings.cpp         |   35 +-
 .../QueryPlanOptimizationSettings.h           |    3 +-
 .../Optimizations/makeDistributed.cpp         | 1053 ++++++++++++
 .../Optimizations/optimizeExtended.cpp        |   14 -
 .../QueryPlan/Optimizations/optimizeJoin.cpp  |   13 +-
 .../QueryPlan/Optimizations/optimizeTopK.cpp  |   10 +
 .../QueryPlan/Optimizations/optimizeTree.cpp  |   29 +-
 src/Processors/QueryPlan/QueryPlan.cpp        |  192 +++
 src/Processors/QueryPlan/QueryPlan.h          |   54 +
 .../QueryPlan/QueryPlanStepRegistry.cpp       |   22 +
 .../QueryPlan/ReadFromMergeTree.cpp           |  355 +++-
 src/Processors/QueryPlan/ReadFromMergeTree.h  |   12 +
 .../QueryPlan/ReadFromObjectStorageStep.h     |    1 +
 .../QueryPlan/ReadFromPreparedSource.cpp      |   53 +
 .../QueryPlan/ReadFromPreparedSource.h        |    6 +
 .../QueryPlan/ReadFromTableFunctionStep.cpp   |   14 -
 .../QueryPlan/ReadFromTableFunctionStep.h     |    1 +
 .../QueryPlan/ReadFromTableStep.cpp           |   14 -
 src/Processors/QueryPlan/ReadFromTableStep.h  |    1 +
 .../QueryPlan/ScatterExchangeStep.cpp         |   28 +
 .../QueryPlan/ScatterExchangeStep.h           |   64 +
 .../QueryPlan/ShuffleExchangeStep.cpp         |   18 +
 .../QueryPlan/ShuffleExchangeStep.h           |   62 +
 .../QueryPlan/ShuffleReceiveStep.cpp          |   64 +
 src/Processors/QueryPlan/ShuffleReceiveStep.h |   34 +
 src/Processors/QueryPlan/ShuffleSendStep.cpp  |  121 ++
 src/Processors/QueryPlan/ShuffleSendStep.h    |   48 +
 src/Processors/Sinks/NativeCompressedSink.cpp |   84 +
 src/Processors/Sinks/NativeCompressedSink.h   |   43 +
 .../Sources/NativeCompressedSource.cpp        |   63 +
 .../Sources/NativeCompressedSource.h          |   34 +
 .../Sources/ReadFromDistributedPlanSource.cpp |   75 +
 .../Sources/ReadFromDistributedPlanSource.h   |   57 +
 .../ScatterByPartitionTransform.cpp           |   35 +-
 .../Transforms/ScatterByPartitionTransform.h  |    7 +-
 src/QueryPipeline/DistributedPlanExecutor.cpp | 1523 +++++++++++++++++
 src/QueryPipeline/DistributedPlanExecutor.h   |  138 ++
 .../DistributedQuery/ExchangeConnections.cpp  |  188 ++
 .../DistributedQuery/ExchangeConnections.h    |   93 +
 .../DistributedQuery/ExchangeServer.cpp       |  263 +++
 src/Server/DistributedQuery/ExchangeServer.h  |   45 +
 .../DistributedQuery/FutureConnection.cpp     |   95 +
 .../DistributedQuery/FutureConnection.h       |   60 +
 .../StreamingExchangeLookup.cpp               |   69 +
 .../StreamingExchangeLookup.h                 |   20 +
 .../StreamingExchangeProtocol.cpp             |   72 +
 .../StreamingExchangeProtocol.h               |   85 +
 .../StreamingExchangeSink.cpp                 |  449 +++++
 .../DistributedQuery/StreamingExchangeSink.h  |   93 +
 .../StreamingExchangeSource.cpp               |  340 ++++
 .../StreamingExchangeSource.h                 |   83 +
 .../tests/gtest_distributed_query.cpp         |  573 +++++++
 .../tests/gtest_exchange_server_handshake.cpp |  203 +++
 .../StatelessWorker/StatelessTaskExecutor.cpp |  209 +++
 .../StatelessWorker/StatelessTaskExecutor.h   |   68 +
 .../StatelessWorker/StatelessWorkerClient.cpp |  211 +++
 .../StatelessWorker/StatelessWorkerClient.h   |   22 +
 .../StatelessWorkerEndpoint.cpp               |  335 ++++
 .../StatelessWorker/StatelessWorkerEndpoint.h |   28 +
 .../StatelessWorkerProtocol.cpp               |   24 +
 .../StatelessWorker/StatelessWorkerProtocol.h |   21 +
 src/Storages/SelectQueryInfo.cpp              |    3 +-
 tests/config/config.d/distributed_query.xml   |   39 +
 tests/config/install.sh                       |    6 +
 .../test_distributed_plan_cancel/__init__.py  |    0
 .../configs/config.d/stateless_worker.xml     |   36 +
 .../test_distributed_plan_cancel/test.py      |   87 +
 .../__init__.py                               |    0
 .../configs/config.d/stateless_worker.xml     |   46 +
 .../test.py                                   |  444 +++++
 ...03394_distributed_broadcast_join.reference |   57 +
 .../03394_distributed_broadcast_join.sql      |   57 +
 .../03394_distributed_shuffle_join.reference  |    3 +
 .../03394_distributed_shuffle_join.sql        |   27 +
 ...ed_shuffle_join_early_close_sink.reference |    1 +
 ...tributed_shuffle_join_early_close_sink.sql |   17 +
 ...ed_shuffle_join_with_aggregation.reference |   54 +
 ...tributed_shuffle_join_with_aggregation.sql |   68 +
 ...ributed_shuffle_join_with_filter.reference |   36 +
 ...4_distributed_shuffle_join_with_filter.sql |   40 +
 ...distributed_shuffle_join_with_in.reference |   24 +
 ...03394_distributed_shuffle_join_with_in.sql |   51 +
 ...buted_shuffle_join_with_prewhere.reference |   12 +
 ...distributed_shuffle_join_with_prewhere.sql |   40 +
 .../03394_distributed_sort.reference          |   19 +
 .../0_stateless/03394_distributed_sort.sh     |   36 +
 .../04097_distributed_join_kinds.reference    |  195 +++
 .../04097_distributed_join_kinds.sql          |  242 +++
 ...4105_distributed_final_replacing.reference |    4 +
 .../04105_distributed_final_replacing.sql     |   24 +
 ...tributed_shuffle_join_type_mixed.reference |   17 +
 ...05_distributed_shuffle_join_type_mixed.sql |   43 +
 ...d_aggregation_correctness_guards.reference |    2 +
 ...ributed_aggregation_correctness_guards.sql |   25 +
 ...istributed_read_error_terminates.reference |    0
 ...4307_distributed_read_error_terminates.sql |   20 +
 ...tals_rollup_cube_not_distributed.reference |    3 +
 ...ted_totals_rollup_cube_not_distributed.sql |   25 +
 ...d_aggregation_persisted_exchange.reference |    5 +
 ...ributed_aggregation_persisted_exchange.sql |   18 +
 ...ted_unserializable_step_rejected.reference |    0
 ...stributed_unserializable_step_rejected.sql |   17 +
 ...plan_set_operation_const_columns.reference |    1 +
 ...buted_plan_set_operation_const_columns.sql |   44 +
 ...20_distributed_plan_read_rejects.reference |    2 +
 .../04320_distributed_plan_read_rejects.sql   |   26 +
 ...d_plan_count_implicit_projection.reference |    3 +
 ...ributed_plan_count_implicit_projection.sql |   32 +
 137 files changed, 10772 insertions(+), 56 deletions(-)
 create mode 100644 src/Processors/QueryPlan/BroadcastExchangeStep.cpp
 create mode 100644 src/Processors/QueryPlan/BroadcastExchangeStep.h
 create mode 100644 src/Processors/QueryPlan/BroadcastReceiveStep.cpp
 create mode 100644 src/Processors/QueryPlan/BroadcastReceiveStep.h
 create mode 100644 src/Processors/QueryPlan/BroadcastSendStep.cpp
 create mode 100644 src/Processors/QueryPlan/BroadcastSendStep.h
 create mode 100644 src/Processors/QueryPlan/ExchangeLookup.h
 create mode 100644 src/Processors/QueryPlan/GatherExchangeStep.cpp
 create mode 100644 src/Processors/QueryPlan/GatherExchangeStep.h
 create mode 100644 src/Processors/QueryPlan/GatherReceiveStep.cpp
 create mode 100644 src/Processors/QueryPlan/GatherReceiveStep.h
 create mode 100644 src/Processors/QueryPlan/GatherSendStep.cpp
 create mode 100644 src/Processors/QueryPlan/GatherSendStep.h
 create mode 100644 src/Processors/QueryPlan/IParameterLookup.h
 create mode 100644 src/Processors/QueryPlan/LogicalExchangeStep.h
 create mode 100644 src/Processors/QueryPlan/Optimizations/makeDistributed.cpp
 delete mode 100644 src/Processors/QueryPlan/Optimizations/optimizeExtended.cpp
 create mode 100644 src/Processors/QueryPlan/ScatterExchangeStep.cpp
 create mode 100644 src/Processors/QueryPlan/ScatterExchangeStep.h
 create mode 100644 src/Processors/QueryPlan/ShuffleExchangeStep.cpp
 create mode 100644 src/Processors/QueryPlan/ShuffleExchangeStep.h
 create mode 100644 src/Processors/QueryPlan/ShuffleReceiveStep.cpp
 create mode 100644 src/Processors/QueryPlan/ShuffleReceiveStep.h
 create mode 100644 src/Processors/QueryPlan/ShuffleSendStep.cpp
 create mode 100644 src/Processors/QueryPlan/ShuffleSendStep.h
 create mode 100644 src/Processors/Sinks/NativeCompressedSink.cpp
 create mode 100644 src/Processors/Sinks/NativeCompressedSink.h
 create mode 100644 src/Processors/Sources/NativeCompressedSource.cpp
 create mode 100644 src/Processors/Sources/NativeCompressedSource.h
 create mode 100644 src/Processors/Sources/ReadFromDistributedPlanSource.cpp
 create mode 100644 src/Processors/Sources/ReadFromDistributedPlanSource.h
 create mode 100644 src/QueryPipeline/DistributedPlanExecutor.cpp
 create mode 100644 src/QueryPipeline/DistributedPlanExecutor.h
 create mode 100644 src/Server/DistributedQuery/ExchangeConnections.cpp
 create mode 100644 src/Server/DistributedQuery/ExchangeConnections.h
 create mode 100644 src/Server/DistributedQuery/ExchangeServer.cpp
 create mode 100644 src/Server/DistributedQuery/ExchangeServer.h
 create mode 100644 src/Server/DistributedQuery/FutureConnection.cpp
 create mode 100644 src/Server/DistributedQuery/FutureConnection.h
 create mode 100644 src/Server/DistributedQuery/StreamingExchangeLookup.cpp
 create mode 100644 src/Server/DistributedQuery/StreamingExchangeLookup.h
 create mode 100644 src/Server/DistributedQuery/StreamingExchangeProtocol.cpp
 create mode 100644 src/Server/DistributedQuery/StreamingExchangeProtocol.h
 create mode 100644 src/Server/DistributedQuery/StreamingExchangeSink.cpp
 create mode 100644 src/Server/DistributedQuery/StreamingExchangeSink.h
 create mode 100644 src/Server/DistributedQuery/StreamingExchangeSource.cpp
 create mode 100644 src/Server/DistributedQuery/StreamingExchangeSource.h
 create mode 100644 src/Server/DistributedQuery/tests/gtest_distributed_query.cpp
 create mode 100644 src/Server/DistributedQuery/tests/gtest_exchange_server_handshake.cpp
 create mode 100644 src/Server/StatelessWorker/StatelessTaskExecutor.cpp
 create mode 100644 src/Server/StatelessWorker/StatelessTaskExecutor.h
 create mode 100644 src/Server/StatelessWorker/StatelessWorkerClient.cpp
 create mode 100644 src/Server/StatelessWorker/StatelessWorkerClient.h
 create mode 100644 src/Server/StatelessWorker/StatelessWorkerEndpoint.cpp
 create mode 100644 src/Server/StatelessWorker/StatelessWorkerEndpoint.h
 create mode 100644 src/Server/StatelessWorker/StatelessWorkerProtocol.cpp
 create mode 100644 src/Server/StatelessWorker/StatelessWorkerProtocol.h
 create mode 100644 tests/config/config.d/distributed_query.xml
 create mode 100644 tests/integration/test_distributed_plan_cancel/__init__.py
 create mode 100644 tests/integration/test_distributed_plan_cancel/configs/config.d/stateless_worker.xml
 create mode 100644 tests/integration/test_distributed_plan_cancel/test.py
 create mode 100644 tests/integration/test_distributed_plan_replicated_merge_tree/__init__.py
 create mode 100644 tests/integration/test_distributed_plan_replicated_merge_tree/configs/config.d/stateless_worker.xml
 create mode 100644 tests/integration/test_distributed_plan_replicated_merge_tree/test.py
 create mode 100644 tests/queries/0_stateless/03394_distributed_broadcast_join.reference
 create mode 100644 tests/queries/0_stateless/03394_distributed_broadcast_join.sql
 create mode 100644 tests/queries/0_stateless/03394_distributed_shuffle_join.reference
 create mode 100644 tests/queries/0_stateless/03394_distributed_shuffle_join.sql
 create mode 100644 tests/queries/0_stateless/03394_distributed_shuffle_join_early_close_sink.reference
 create mode 100644 tests/queries/0_stateless/03394_distributed_shuffle_join_early_close_sink.sql
 create mode 100644 tests/queries/0_stateless/03394_distributed_shuffle_join_with_aggregation.reference
 create mode 100644 tests/queries/0_stateless/03394_distributed_shuffle_join_with_aggregation.sql
 create mode 100644 tests/queries/0_stateless/03394_distributed_shuffle_join_with_filter.reference
 create mode 100644 tests/queries/0_stateless/03394_distributed_shuffle_join_with_filter.sql
 create mode 100644 tests/queries/0_stateless/03394_distributed_shuffle_join_with_in.reference
 create mode 100644 tests/queries/0_stateless/03394_distributed_shuffle_join_with_in.sql
 create mode 100644 tests/queries/0_stateless/03394_distributed_shuffle_join_with_prewhere.reference
 create mode 100644 tests/queries/0_stateless/03394_distributed_shuffle_join_with_prewhere.sql
 create mode 100644 tests/queries/0_stateless/03394_distributed_sort.reference
 create mode 100755 tests/queries/0_stateless/03394_distributed_sort.sh
 create mode 100644 tests/queries/0_stateless/04097_distributed_join_kinds.reference
 create mode 100644 tests/queries/0_stateless/04097_distributed_join_kinds.sql
 create mode 100644 tests/queries/0_stateless/04105_distributed_final_replacing.reference
 create mode 100644 tests/queries/0_stateless/04105_distributed_final_replacing.sql
 create mode 100644 tests/queries/0_stateless/04305_distributed_shuffle_join_type_mixed.reference
 create mode 100644 tests/queries/0_stateless/04305_distributed_shuffle_join_type_mixed.sql
 create mode 100644 tests/queries/0_stateless/04306_distributed_aggregation_correctness_guards.reference
 create mode 100644 tests/queries/0_stateless/04306_distributed_aggregation_correctness_guards.sql
 create mode 100644 tests/queries/0_stateless/04307_distributed_read_error_terminates.reference
 create mode 100644 tests/queries/0_stateless/04307_distributed_read_error_terminates.sql
 create mode 100644 tests/queries/0_stateless/04308_distributed_totals_rollup_cube_not_distributed.reference
 create mode 100644 tests/queries/0_stateless/04308_distributed_totals_rollup_cube_not_distributed.sql
 create mode 100644 tests/queries/0_stateless/04309_distributed_aggregation_persisted_exchange.reference
 create mode 100644 tests/queries/0_stateless/04309_distributed_aggregation_persisted_exchange.sql
 create mode 100644 tests/queries/0_stateless/04310_distributed_unserializable_step_rejected.reference
 create mode 100644 tests/queries/0_stateless/04310_distributed_unserializable_step_rejected.sql
 create mode 100644 tests/queries/0_stateless/04319_distributed_plan_set_operation_const_columns.reference
 create mode 100644 tests/queries/0_stateless/04319_distributed_plan_set_operation_const_columns.sql
 create mode 100644 tests/queries/0_stateless/04320_distributed_plan_read_rejects.reference
 create mode 100644 tests/queries/0_stateless/04320_distributed_plan_read_rejects.sql
 create mode 100644 tests/queries/0_stateless/04321_distributed_plan_count_implicit_projection.reference
 create mode 100644 tests/queries/0_stateless/04321_distributed_plan_count_implicit_projection.sql

diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp
index 774c6d9f479b..35b816e78b4b 100644
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@@ -115,8 +115,11 @@
 #include <Compression/CompressionCodecEncrypted.h>
 #include <Parsers/ASTAlterQuery.h>
 #include <Server/CloudPlacementInfo.h>
+#include <Server/DistributedQuery/ExchangeConnections.h>
+#include <Server/DistributedQuery/ExchangeServer.h>
 #include <Server/HTTP/HTTPServer.h>
 #include <Server/HTTP/HTTPServerConnectionFactory.h>
+#include <Server/StatelessWorker/StatelessWorkerEndpoint.h>
 #include <Server/MySQLHandlerFactory.h>
 #include <Server/PostgreSQLHandlerFactory.h>
 #include <Server/ProtocolServerAdapter.h>
@@ -1938,6 +1941,82 @@ try
     LOG_DEBUG(log, "Initializing interserver credentials.");
     global_context->updateInterserverCredentials(config());
 
+    std::shared_ptr<StatelessWorkerEndpoint> stateless_worker_endpoint_ptr{nullptr};
+    String stateless_worker_endpoint_name;
+    if (config().getBool("stateless_worker_server.enabled", false))
+    {
+        String stateless_worker_endpoint = config().getString("stateless_worker_server.endpoint", "localhost");
+        stateless_worker_endpoint_ptr = std::make_shared<StatelessWorkerEndpoint>();
+        stateless_worker_endpoint_name = stateless_worker_endpoint_ptr->getId(stateless_worker_endpoint);
+        global_context->getInterserverIOHandler().addEndpoint(stateless_worker_endpoint_name, stateless_worker_endpoint_ptr);
+        LOG_DEBUG(log, "Added stateless worker endpoint '{}'.", stateless_worker_endpoint_name);
+    }
+
+    SCOPE_EXIT({
+        if (stateless_worker_endpoint_ptr)
+        {
+            /// Remove the same endpoint that was registered (the configured name may differ from "localhost").
+            LOG_DEBUG(log, "Shutting down stateless worker endpoint '{}'.", stateless_worker_endpoint_name);
+            global_context->getInterserverIOHandler().removeEndpointIfExists(stateless_worker_endpoint_name);
+
+            stateless_worker_endpoint_ptr->blocker.cancelForever();
+            stateless_worker_endpoint_ptr->shutdown();
+            /// Acquire the lock to wait for all in-flight requests to finish.
+            std::lock_guard lock(stateless_worker_endpoint_ptr->rwlock);
+        }
+        stateless_worker_endpoint_ptr.reset();
+    });
+
+    #ifdef OS_LINUX
+    ExchangeConnectionsPtr exchange_connections_ptr = ExchangeConnections::instance();
+    std::vector<std::shared_ptr<ExchangeServer>> exchange_servers;
+    if (auto streaming_exchange_port = config().getUInt("distributed_query.streaming_exchange_port", 0))
+    {
+        if (streaming_exchange_port > 65535)
+            throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER,
+                "`distributed_query.streaming_exchange_port` must be in range 1..65535, got {}", streaming_exchange_port);
+
+        /// The exchange handshake is unauthenticated, so the listener is never bound to all interfaces
+        /// implicitly: the streaming exchange is enabled only when explicit listen host(s) are given.
+        Strings exchange_listen_hosts = DB::getMultipleValuesFromConfig(config(), "distributed_query", "streaming_exchange_listen_host");
+        if (exchange_listen_hosts.empty())
+        {
+            LOG_ERROR(log, "`distributed_query.streaming_exchange_port` is set but no "
+                "`distributed_query.streaming_exchange_listen_host` is configured; the streaming exchange "
+                "server is not started. Specify a listen host to enable it.");
+        }
+        else
+        {
+            /// Try every configured listen host; a host that fails is logged and skipped, the check after the loop handles "none".
+            for (const auto & listen_host : exchange_listen_hosts)
+            {
+                try
+                {
+                    exchange_servers.emplace_back(std::make_shared<ExchangeServer>(listen_host, streaming_exchange_port, exchange_connections_ptr));
+                    exchange_servers.back()->start(); /// NOTE(review): if start() throws, the server stays in exchange_servers - confirm intended.
+                }
+                catch (const Poco::Exception & e)
+                {
+                    LOG_INFO(log, "Failed to start exchange server on {}:{}: {}", listen_host, streaming_exchange_port, e.displayText());
+                }
+            }
+            if (exchange_servers.empty())
+                throw Exception(ErrorCodes::NETWORK_ERROR, "Failed to start ExchangeServer on port {}", streaming_exchange_port);
+        }
+    }
+
+    SCOPE_EXIT({
+        for (auto & exchange_server_ptr : exchange_servers)
+        {
+            exchange_server_ptr->stop();
+            exchange_server_ptr.reset();
+        }
+    });
+    #else
+    if (config().getUInt("distributed_query.streaming_exchange_port", 0))
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ExchangeServer is not supported on non-linux platform");
+    #endif
+
     /// Set up caches.
 
     const size_t max_cache_size = static_cast<size_t>(static_cast<double>(physical_server_memory) * server_settings[ServerSetting::cache_size_to_ram_max_ratio]);
diff --git a/programs/server/config.xml b/programs/server/config.xml
index 8c16f317b320..319abd1bf44f 100644
--- a/programs/server/config.xml
+++ b/programs/server/config.xml
@@ -280,6 +280,25 @@
         <password></password>
     </interserver_http_credentials>-->
 
+    <!-- Multi-stage distributed query execution (experimental, enabled per query with the
+         `make_distributed_plan` setting). When a query uses streaming exchanges, workers connect
+         directly to each other on this port to pass intermediate results.
+
+         Security: the streaming exchange handshake is NOT authenticated. The port must be treated
+         like the interserver port: reachable only from trusted ClickHouse nodes in the same cluster,
+         never from untrusted networks. For that reason the listener is started only when an explicit
+         `streaming_exchange_listen_host` is configured; setting only the port logs an error and does
+         not start the listener. Bind it to a private interface and protect it with a firewall.
+         Exchanges are keyed by a random per-query identifier, which makes blind injection into a
+         running query impractical, but this is defense in depth, not an access-control mechanism.
+      -->
+    <!--
+    <distributed_query>
+        <streaming_exchange_port>9223</streaming_exchange_port>
+        <streaming_exchange_listen_host>127.0.0.1</streaming_exchange_listen_host>
+    </distributed_query>
+    -->
+
     <!-- Listen specified address.
          Use :: (wildcard IPv6 address), if you want to accept connections both with IPv4 and IPv6 from everywhere.
          Notes:
diff --git a/src/Analyzer/TableExpressionModifiers.cpp b/src/Analyzer/TableExpressionModifiers.cpp
index c8002f44c977..07595b104873 100644
--- a/src/Analyzer/TableExpressionModifiers.cpp
+++ b/src/Analyzer/TableExpressionModifiers.cpp
@@ -2,6 +2,8 @@
 
 #include <Common/SipHash.h>
 
+#include <IO/ReadBuffer.h>
+#include <IO/ReadHelpers.h>
 #include <IO/WriteBuffer.h>
 #include <IO/WriteHelpers.h>
 #include <IO/Operators.h>
@@ -40,6 +42,20 @@ void TableExpressionModifiers::updateTreeHash(SipHash & hash_state) const
     }
 }
 
+void serializeRational(TableExpressionModifiers::Rational val, WriteBuffer & out) /// Writes `val` to `out` in binary form.
+{
+    writeIntBinary(val.numerator, out);   /// The numerator is written first,
+    writeIntBinary(val.denominator, out); /// then the denominator; deserializeRational reads them back in the same order.
+}
+
+TableExpressionModifiers::Rational deserializeRational(ReadBuffer & in) /// Reads back a value written by serializeRational.
+{
+    TableExpressionModifiers::Rational val;
+    readIntBinary(val.numerator, in);   /// Fields are read in the order serializeRational wrote them:
+    readIntBinary(val.denominator, in); /// numerator first, then denominator.
+    return val;
+}
+
 String TableExpressionModifiers::formatForErrorMessage() const
 {
     WriteBufferFromOwnString buffer;
diff --git a/src/Analyzer/TableExpressionModifiers.h b/src/Analyzer/TableExpressionModifiers.h
index 4b3e2e768314..dff3e50d748a 100644
--- a/src/Analyzer/TableExpressionModifiers.h
+++ b/src/Analyzer/TableExpressionModifiers.h
@@ -5,6 +5,9 @@
 namespace DB
 {
 
+class ReadBuffer;
+class WriteBuffer;
+
 /** Modifiers that can be used for table, table function and subquery in JOIN TREE.
   *
   * Example: SELECT * FROM test_table SAMPLE 0.1 OFFSET 0.1 FINAL
@@ -74,6 +77,9 @@ class TableExpressionModifiers
     std::optional<Rational> sample_offset_ratio;
 };
 
+void serializeRational(TableExpressionModifiers::Rational val, WriteBuffer & out);
+TableExpressionModifiers::Rational deserializeRational(ReadBuffer & in);
+
 inline bool operator==(const TableExpressionModifiers & lhs, const TableExpressionModifiers & rhs)
 {
     return lhs.hasFinal() == rhs.hasFinal() && lhs.getSampleSizeRatio() == rhs.getSampleSizeRatio() && lhs.getSampleOffsetRatio() == rhs.getSampleOffsetRatio();
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 10e3d049ac52..a9d78ec5c5c4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -332,6 +332,8 @@ if (TARGET ch_contrib::ssh)
     add_object_library(clickhouse_server_ssh Server/SSH)
 endif()
 add_object_library(clickhouse_server_embedded_client Server/ClientEmbedded)
+add_object_library(clickhouse_server_statelessworker Server/StatelessWorker)
+add_object_library(clickhouse_server_distributedquery Server/DistributedQuery)
 add_object_library(clickhouse_formats Formats)
 add_object_library(clickhouse_processors Processors)
 add_object_library(clickhouse_processors_executors Processors/Executors)
diff --git a/src/Client/BuzzHouse/Generator/SessionSettings.cpp b/src/Client/BuzzHouse/Generator/SessionSettings.cpp
index 107566db278e..f9889fd2fc25 100644
--- a/src/Client/BuzzHouse/Generator/SessionSettings.cpp
+++ b/src/Client/BuzzHouse/Generator/SessionSettings.cpp
@@ -541,7 +541,7 @@ std::unordered_map<String, CHSetting> serverSettings = {
     {"distributed_insert_skip_read_only_replicas", trueOrFalseSettingNoOracle},
     {"distributed_plan_default_reader_bucket_count",
      CHSetting(
-         [](RandomGenerator & rg, FuzzConfig &) { return std::to_string(rg.thresholdGenerator<uint64_t>(0.2, 0.2, 0, 128)); }, {}, false)},
+         [](RandomGenerator & rg, FuzzConfig &) { return std::to_string(rg.thresholdGenerator<uint64_t>(0.2, 0.2, 1, 128)); }, {}, false)},
     {"distributed_plan_default_shuffle_join_bucket_count",
      CHSetting(
          [](RandomGenerator & rg, FuzzConfig &) { return std::to_string(rg.thresholdGenerator<uint64_t>(0.2, 0.2, 0, 128)); }, {}, false)},
diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp
index c46cc25687aa..4537dcc8a79d 100644
--- a/src/Common/CurrentMetrics.cpp
+++ b/src/Common/CurrentMetrics.cpp
@@ -489,6 +489,9 @@
     M(StatelessWorkerThreads, "Number of threads in the stateless worker thread pool.") \
     M(StatelessWorkerThreadsActive, "Number of threads in the stateless worker thread pool running a task.") \
     M(StatelessWorkerThreadsScheduled, "Number of queued or active jobs in the stateless worker thread pool.") \
+    M(ExchangeServerThreads, "Number of threads in the distributed exchange server handshake thread pool.") \
+    M(ExchangeServerThreadsActive, "Number of threads in the distributed exchange server handshake thread pool running a task.") \
+    M(ExchangeServerThreadsScheduled, "Number of queued or active jobs in the distributed exchange server handshake thread pool.") \
     M(ReadonlyDisks, "Number of disks that were marked as readonly during disk check.") \
     M(BrokenDisks, "Number of disks disks that were marked as broken during disk check.") \
     M(TaskTrackerThreads, "Number of threads used by the distributed query remote task tracker.") \
diff --git a/src/Common/setThreadName.h b/src/Common/setThreadName.h
index daeb07a719cf..ff18d4da38bc 100644
--- a/src/Common/setThreadName.h
+++ b/src/Common/setThreadName.h
@@ -56,6 +56,7 @@ namespace DB
     M(DISTRIBUTED_SCHEDULE_POOL, "BgDistSchPool") \
     M(DISTRIBUTED_SINK, "DistrOutStrProc") \
     M(DISTRIBUTED_INDEX_ANALYSIS, "DistIdxAnalysis") \
+    M(DISTRIBUTED_QUERY_TASK, "DistQueryTask") \
     M(DROP_TABLES, "DropTables") \
     M(DWARF_DECODER, "DWARFDecoder") \
     M(ERROR_LOG, "ErrorLog") \
diff --git a/src/Core/ProtocolDefines.h b/src/Core/ProtocolDefines.h
index 305aa2e7d80e..ea31888c9158 100644
--- a/src/Core/ProtocolDefines.h
+++ b/src/Core/ProtocolDefines.h
@@ -53,7 +53,9 @@ static constexpr auto DBMS_MIN_REVISION_WITH_QUERY_AND_LINE_NUMBERS = 54475;
 
 static constexpr auto DBMS_MERGE_TREE_PART_INFO_VERSION = 1;
 
-static constexpr auto DBMS_QUERY_PLAN_SERIALIZATION_VERSION = 0;
+static constexpr auto DBMS_QUERY_PLAN_SERIALIZATION_VERSION = 1;
+/// Version 1 added the initiator's settings changes to the task.
+static constexpr auto DBMS_DISTRIBUTED_TASK_SERIALIZATION_VERSION = 1;
 
 static constexpr auto DBMS_MIN_REVISION_WITH_INTERSERVER_SECRET = 54441;
 
diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index 7a9089880d96..b70462b0f40c 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -7803,7 +7803,7 @@ Run all tasks of a distributed query plan locally. Useful for testing and debugg
     DECLARE(NonZeroUInt64, distributed_plan_default_shuffle_join_bucket_count, 8, R"(
 Default number of buckets for distributed shuffle-hash-join.
 )", EXPERIMENTAL) \
-    DECLARE(UInt64, distributed_plan_default_reader_bucket_count, 8, R"(
+    DECLARE(NonZeroUInt64, distributed_plan_default_reader_bucket_count, 8, R"(
 Default number of tasks for parallel reading in distributed query. Tasks are spread across between replicas.
 )", EXPERIMENTAL) \
     DECLARE(Bool, distributed_plan_optimize_exchanges, true, R"(
diff --git a/src/Processors/QueryPlan/BroadcastExchangeStep.cpp b/src/Processors/QueryPlan/BroadcastExchangeStep.cpp
new file mode 100644
index 000000000000..324096a8205c
--- /dev/null
+++ b/src/Processors/QueryPlan/BroadcastExchangeStep.cpp
@@ -0,0 +1,26 @@
+#include <Processors/QueryPlan/BroadcastExchangeStep.h>
+#include <Processors/QueryPlan/BroadcastSendStep.h>
+#include <Processors/QueryPlan/BroadcastReceiveStep.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+std::pair<QueryPlanStepPtr, QueryPlanStepPtr> BroadcastExchangeStep::createSinkAndSourcePair(const String & exchange_id, const Strings & source_shards) const
+{
+    /// Broadcast has a single source bucket, so exactly one source shard is expected here.
+    if (source_shards.size() != 1)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "BroadcastExchangeStep should have one source shard, got {}", source_shards.size());
+
+    /// The send side takes the input header and the result bucket count; the receive side takes the output header and the source shard.
+    auto send_step = std::make_unique<BroadcastSendStep>(input_headers.front(), exchange_id, getResultBucketCount());
+    auto receive_step = std::make_unique<BroadcastReceiveStep>(output_header, exchange_id, source_shards);
+
+    return {std::move(send_step), std::move(receive_step)};
+}
+
+}
diff --git a/src/Processors/QueryPlan/BroadcastExchangeStep.h b/src/Processors/QueryPlan/BroadcastExchangeStep.h
new file mode 100644
index 000000000000..ec0b03cb619d
--- /dev/null
+++ b/src/Processors/QueryPlan/BroadcastExchangeStep.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+
+namespace DB
+{
+
+/// Copies the data from 1 logical stream into N logical streams.
+class BroadcastExchangeStep final : public LogicalExchangeStep
+{
+public:
+    BroadcastExchangeStep(SharedHeader input_header_, size_t result_bucket_count_)
+        : LogicalExchangeStep(input_header_)
+        , result_bucket_count(result_bucket_count_)
+    {
+    }
+
+    String getName() const override { return "BroadcastExchange"; }
+
+    void transformPipeline(QueryPipelineBuilder & /*pipeline*/, const BuildQueryPipelineSettings &) override
+    {
+        /// Doesn't change the pipeline if executed directly
+    }
+
+    size_t getSourceBucketCount() const override
+    {
+        return 1;  /// A broadcast always starts from a single logical stream.
+    }
+
+    size_t getResultBucketCount() const override
+    {
+        return result_bucket_count;  /// N: the number of logical streams after the exchange.
+    }
+
+    std::pair<QueryPlanStepPtr, QueryPlanStepPtr> createSinkAndSourcePair(const String & exchange_id, const Strings & source_shards) const override;
+
+private:
+    void updateOutputHeader() override
+    {
+        output_header = input_headers.front();  /// The output header is the input header, unchanged.
+    }
+
+    const size_t result_bucket_count;
+};
+
+}
diff --git a/src/Processors/QueryPlan/BroadcastReceiveStep.cpp b/src/Processors/QueryPlan/BroadcastReceiveStep.cpp
new file mode 100644
index 000000000000..633ad85be66a
--- /dev/null
+++ b/src/Processors/QueryPlan/BroadcastReceiveStep.cpp
@@ -0,0 +1,65 @@
+#include <Processors/QueryPlan/BroadcastReceiveStep.h>
+#include <Processors/QueryPlan/QueryPlanStepRegistry.h>
+#include <Processors/QueryPlan/Serialization.h>
+#include <Processors/QueryPlan/IParameterLookup.h>
+#include <Processors/QueryPlan/ExchangeLookup.h>
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+#include <Processors/ISource.h>
+#include <QueryPipeline/QueryPipelineBuilder.h>
+#include <QueryPipeline/Pipe.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ReadHelpers.h>
+
+
+namespace DB
+{
+
+void BroadcastReceiveStep::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings & settings)
+{
+    /// Read all shards: from each one, the stream addressed to the bucket of the current task.
+    const String destination_bucket = settings.parameter_lookup->getParameter("bucket_id").safeGet<String>();
+
+    VectorWithMemoryTracking<std::unique_ptr<QueryPipelineBuilder>> shard_pipelines;
+    for (const String & source_shard : source_shards)
+    {
+        const ExchangeStreamId stream_id(exchange_id, source_shard, destination_bucket);
+        auto shard_pipeline = std::make_unique<QueryPipelineBuilder>();
+        shard_pipeline->init(Pipe(settings.exchange_lookup->createSource(output_header, stream_id)));
+        shard_pipelines.emplace_back(std::move(shard_pipeline));
+    }
+
+    pipeline = QueryPipelineBuilder::unitePipelines(std::move(shard_pipelines), 0, &processors);
+}
+
+void BroadcastReceiveStep::serialize(Serialization & ctx) const
+{
+    writeStringBinary(exchange_id, ctx.out);
+    writeVarUInt(source_shards.size(), ctx.out);  /// Shard count goes first, so `deserialize` knows how many ids follow.
+    for (const String & shard_id : source_shards)
+        writeStringBinary(shard_id, ctx.out);
+}
+
+std::unique_ptr<IQueryPlanStep> BroadcastReceiveStep::deserialize(Deserialization & ctx)
+{
+    /// Fields come in the order `serialize` wrote them: exchange id, shard count, shard ids.
+    String id;
+    readStringBinary(id, ctx.in);
+
+    size_t shard_count = 0;
+    readVarUInt(shard_count, ctx.in);
+
+    Strings shard_ids;
+    shard_ids.reserve(shard_count);
+    for (size_t remaining = shard_count; remaining != 0; --remaining)
+        readStringBinary(shard_ids.emplace_back(), ctx.in);
+
+    return std::make_unique<BroadcastReceiveStep>(ctx.output_header, id, shard_ids);
+}
+
+void registerBroadcastReceiveStep(QueryPlanStepRegistry & registry);  /// Declared right before the definition; presumably to satisfy missing-prototype warnings - TODO confirm.
+void registerBroadcastReceiveStep(QueryPlanStepRegistry & registry)
+{
+    registry.registerStep("BroadcastReceive", BroadcastReceiveStep::deserialize);  /// Same name as returned by `getName()`.
+}
+
+}
diff --git a/src/Processors/QueryPlan/BroadcastReceiveStep.h b/src/Processors/QueryPlan/BroadcastReceiveStep.h
new file mode 100644
index 000000000000..23001792ef2c
--- /dev/null
+++ b/src/Processors/QueryPlan/BroadcastReceiveStep.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <Processors/QueryPlan/ISourceStep.h>
+
+namespace DB
+{
+
+/// Receive part of BroadcastExchangeStep: a source step that reads exchange streams of the source shards.
+class BroadcastReceiveStep : public ISourceStep
+{
+public:
+    BroadcastReceiveStep(SharedHeader header_, const String & exchange_id_, const Strings & source_shards_)
+        : ISourceStep(std::move(header_))
+        , exchange_id(exchange_id_)
+        , source_shards(source_shards_)
+    {
+        /// TODO: is there a scenario where we broadcast partitioned source and thus have multiple source shards?
+        chassert(source_shards.size() == 1);
+    }
+
+    String getName() const override { return "BroadcastReceive"; }
+
+    void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings & settings) override;
+
+    void serialize(Serialization & ctx) const override;
+    bool isSerializable() const override { return true; }
+
+    static std::unique_ptr<IQueryPlanStep> deserialize(Deserialization & ctx);
+
+private:
+    const String exchange_id;  /// Logical id of the exchange; the paired BroadcastSendStep is created with the same id.
+    const Strings source_shards;  /// Source shards to read from; asserted in the constructor to be exactly one.
+};
+
+}
diff --git a/src/Processors/QueryPlan/BroadcastSendStep.cpp b/src/Processors/QueryPlan/BroadcastSendStep.cpp
new file mode 100644
index 000000000000..c4cddd5b5c44
--- /dev/null
+++ b/src/Processors/QueryPlan/BroadcastSendStep.cpp
@@ -0,0 +1,79 @@
+#include <Processors/QueryPlan/BroadcastSendStep.h>
+#include <Processors/QueryPlan/QueryPlanStepRegistry.h>
+#include <Processors/QueryPlan/Serialization.h>
+#include <Processors/QueryPlan/IParameterLookup.h>
+#include <Processors/QueryPlan/ExchangeLookup.h>
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+#include <Processors/Transforms/CopyTransform.h>
+#include <Processors/ISink.h>
+#include <Processors/Port.h>
+#include <QueryPipeline/QueryPipelineBuilder.h>
+#include <QueryPipeline/Pipe.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ReadHelpers.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+QueryPipelineBuilderPtr BroadcastSendStep::updatePipeline(QueryPipelineBuilders pipelines, const BuildQueryPipelineSettings & settings)
+{
+    /// Exactly one input pipeline is expected (same guard as in GatherSendStep); `front()` below relies on it.
+    if (pipelines.size() != 1)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "BroadcastSendStep expects single input step");
+    /// Send copies of data to num_buckets outputs: squash to one stream, then copy the input block to each output.
+    auto & pipeline = *pipelines.front();
+    auto stream_header = pipeline.getSharedHeader();
+    pipeline.resize(1);
+    if (num_buckets > 1)
+    {
+        auto copy = std::make_shared<CopyTransform>(stream_header, num_buckets);
+        pipeline.addTransform(copy);
+    }
+
+    const String shard_id = settings.parameter_lookup->getParameter("bucket_id").safeGet<String>();
+
+    /// Add sink for each bucket: destination ids are 0, 1, ... in the order sinks are requested; the source id is this task's bucket.
+    size_t bucket = 0;
+    pipeline.setSinks([&](const SharedHeader & header, Pipe::StreamType stream_type)
+    {
+        chassert(stream_type == Pipe::StreamType::Main);
+        String destination_bucket_id = toString(bucket);
+        ++bucket;
+        return settings.exchange_lookup->createSink(header, ExchangeStreamId(exchange_id, shard_id, destination_bucket_id));
+    });
+
+    if (bucket != num_buckets)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "BroadcastSendStep: expected {} buckets, but created only {}", num_buckets, bucket);
+
+    return std::move(pipelines.front());
+}
+
+void BroadcastSendStep::serialize(Serialization & ctx) const
+{
+    writeStringBinary(exchange_id, ctx.out);  /// Read back first by `deserialize`.
+    writeVarUInt(num_buckets, ctx.out);  /// Read back second; the input header is not written here.
+}
+
+std::unique_ptr<IQueryPlanStep> BroadcastSendStep::deserialize(Deserialization & ctx)
+{
+    /// Fields come in the order `serialize` wrote them: exchange id, then the bucket count.
+    String id;
+    readStringBinary(id, ctx.in);
+    size_t bucket_count = 0;
+    readVarUInt(bucket_count, ctx.in);
+
+    return std::make_unique<BroadcastSendStep>(ctx.input_headers.front(), id, bucket_count);
+}
+
+void registerBroadcastSendStep(QueryPlanStepRegistry & registry);  /// Declared right before the definition; presumably to satisfy missing-prototype warnings - TODO confirm.
+void registerBroadcastSendStep(QueryPlanStepRegistry & registry)
+{
+    registry.registerStep("BroadcastSend", BroadcastSendStep::deserialize);  /// Same name as returned by `getName()`.
+}
+
+}
diff --git a/src/Processors/QueryPlan/BroadcastSendStep.h b/src/Processors/QueryPlan/BroadcastSendStep.h
new file mode 100644
index 000000000000..2c03d065be9a
--- /dev/null
+++ b/src/Processors/QueryPlan/BroadcastSendStep.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <Processors/QueryPlan/IQueryPlanStep.h>
+
+
+namespace DB
+{
+
+/// Send part of BroadcastExchangeStep.
+/// Copies all data to each of the destination buckets and writes every copy into its own exchange sink.
+class BroadcastSendStep final : public IQueryPlanStep
+{
+public:
+    BroadcastSendStep(SharedHeader input_header_, const String & exchange_id_, size_t num_buckets_)
+        : exchange_id(exchange_id_)
+        , num_buckets(num_buckets_)
+    {
+        chassert(num_buckets > 0);
+        updateInputHeaders({std::move(input_header_)});
+    }
+
+    String getName() const override { return "BroadcastSend"; }
+
+    bool hasOutputStream() const { return false; }  /// NOTE(review): not marked `override`; confirm what the base class declares.
+
+    QueryPipelineBuilderPtr updatePipeline(QueryPipelineBuilders pipelines, const BuildQueryPipelineSettings & settings) override;
+
+    void serialize(Serialization & ctx) const override;
+    bool isSerializable() const override { return true; }
+
+    static std::unique_ptr<IQueryPlanStep> deserialize(Deserialization & ctx);
+
+private:
+    void updateOutputHeader() override {}  /// Intentionally empty: the step ends in sinks.
+
+    const String exchange_id;  /// Logical id of the exchange; the paired BroadcastReceiveStep is created with the same id.
+    const size_t num_buckets;  /// Number of destination buckets, i.e. of copies to send.
+};
+
+}
diff --git a/src/Processors/QueryPlan/BuildQueryPipelineSettings.h b/src/Processors/QueryPlan/BuildQueryPipelineSettings.h
index f599fda72ea8..6857219d9585 100644
--- a/src/Processors/QueryPlan/BuildQueryPipelineSettings.h
+++ b/src/Processors/QueryPlan/BuildQueryPipelineSettings.h
@@ -16,6 +16,10 @@ using QueryStatusPtr = std::shared_ptr<QueryStatus>;
 struct ITemporaryFileLookup;
 using TemporaryFileLookupPtr = std::shared_ptr<ITemporaryFileLookup>;
 using BlockMarshallingCallback = std::function<Block(const Block & block)>;
+struct IParameterLookup;
+using ParameterLookupPtr = std::shared_ptr<IParameterLookup>;
+struct IExchangeLookup;
+using ExchangeLookupPtr = std::shared_ptr<IExchangeLookup>;
 
 struct BuildQueryPipelineSettings
 {
@@ -28,6 +32,9 @@ struct BuildQueryPipelineSettings
     ProgressCallback progress_callback;
     TemporaryFileLookupPtr temporary_file_lookup;
     BlockMarshallingCallback block_marshalling_callback;
+    ParameterLookupPtr parameter_lookup;
+    ExchangeLookupPtr exchange_lookup;
+
 
     size_t max_threads;
     size_t aggregation_memory_efficient_merge_threads;
diff --git a/src/Processors/QueryPlan/ExchangeLookup.h b/src/Processors/QueryPlan/ExchangeLookup.h
new file mode 100644
index 000000000000..eb8a354c0e9c
--- /dev/null
+++ b/src/Processors/QueryPlan/ExchangeLookup.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <base/types.h>
+#include <boost/noncopyable.hpp>
+#include <memory>
+
+namespace DB
+{
+
+class ISink;
+class ISource;
+class Block;
+
+/// Describes an individual stream of an Exchange, e.g. ShuffleExchange from M buckets to N buckets has M*N streams
+struct ExchangeStreamId
+{
+    String exchange_id;
+    String source_bucket;
+    String destination_bucket;
+
+    ExchangeStreamId() = default;
+
+    ExchangeStreamId(const String & exchange_id_, const String & source_bucket_, const String & destination_bucket_)
+        : exchange_id(exchange_id_)
+        , source_bucket(source_bucket_)
+        , destination_bucket(destination_bucket_)
+    {}
+
+    /// Convenience overload for numeric buckets: they are stored in their decimal string form.
+    ExchangeStreamId(const String & exchange_id_, size_t source_bucket_, size_t destination_bucket_)
+        : ExchangeStreamId(exchange_id_, std::to_string(source_bucket_), std::to_string(destination_bucket_))
+    {}
+
+    /// Renders the id as `<exchange_id>__<source_bucket>_<destination_bucket>`.
+    String toString() const
+    {
+        return exchange_id + "__" + source_bucket + "_" + destination_bucket;
+    }
+};
+
+/// Interface for creating Sink and Source processors by exchange logical id when building the query pipeline for
+/// a distributed query plan fragment. The idea is to store only logical names of exchange streams in the query plan
+/// and create actual Sink and Source processors only when the query pipeline is built.
+struct IExchangeLookup : boost::noncopyable
+{
+    virtual ~IExchangeLookup() = default;
+
+    virtual std::shared_ptr<ISink> createSink(SharedHeader input_header, const ExchangeStreamId & exchange_stream_id) = 0;  /// Writing end of the stream.
+    virtual std::shared_ptr<ISource> createSource(SharedHeader output_header, const ExchangeStreamId & exchange_stream_id) = 0;  /// Reading end of the stream.
+};
+
+using ExchangeLookupPtr = std::shared_ptr<IExchangeLookup>;
+
+}
diff --git a/src/Processors/QueryPlan/GatherExchangeStep.cpp b/src/Processors/QueryPlan/GatherExchangeStep.cpp
new file mode 100644
index 000000000000..fafe1706b863
--- /dev/null
+++ b/src/Processors/QueryPlan/GatherExchangeStep.cpp
@@ -0,0 +1,17 @@
+#include <Processors/QueryPlan/GatherExchangeStep.h>
+#include <Processors/QueryPlan/GatherSendStep.h>
+#include <Processors/QueryPlan/GatherReceiveStep.h>
+
+namespace DB
+{
+
+std::pair<QueryPlanStepPtr, QueryPlanStepPtr> GatherExchangeStep::createSinkAndSourcePair(const String & exchange_id, const Strings & source_shards) const
+{
+    /// Send side: gets the input header; both sides share `exchange_id`.
+    auto send_step = std::make_unique<GatherSendStep>(input_headers.front(), exchange_id);
+    /// NOTE(review): only the shard count is passed on; GatherReceiveStep addresses sources as 0..count-1 - confirm shard ids match.
+    auto receive_step = std::make_unique<GatherReceiveStep>(output_header, exchange_id, source_shards.size(), maintain_sort_description);
+    return {std::move(send_step), std::move(receive_step)};
+}
+
+}
diff --git a/src/Processors/QueryPlan/GatherExchangeStep.h b/src/Processors/QueryPlan/GatherExchangeStep.h
new file mode 100644
index 000000000000..9e26abbeea10
--- /dev/null
+++ b/src/Processors/QueryPlan/GatherExchangeStep.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+#include <Core/SortDescription.h>
+
+#include <optional>
+
+namespace DB
+{
+
+/// Collects multiple logical streams of data with the same schema into one stream.
+class GatherExchangeStep final : public LogicalExchangeStep
+{
+public:
+    explicit GatherExchangeStep(SharedHeader input_header_, size_t source_bucket_count_, std::optional<SortDescription> maintain_sort_description_ = std::nullopt)
+        : LogicalExchangeStep(input_header_, std::move(maintain_sort_description_))
+        , source_bucket_count(source_bucket_count_)
+    {
+    }
+
+    String getName() const override { return "GatherExchange"; }
+
+    void transformPipeline(QueryPipelineBuilder & /*pipeline*/, const BuildQueryPipelineSettings &) override
+    {
+        /// Doesn't change the pipeline if executed directly
+    }
+
+    size_t getSourceBucketCount() const override
+    {
+        return source_bucket_count;  /// Number of logical streams that are gathered.
+    }
+
+    size_t getResultBucketCount() const override
+    {
+        return 1;  /// Gathering always ends in a single logical stream.
+    }
+
+    std::pair<QueryPlanStepPtr, QueryPlanStepPtr> createSinkAndSourcePair(const String & exchange_id, const Strings & source_shards) const override;
+
+private:
+    void updateOutputHeader() override
+    {
+        output_header = input_headers.front();  /// The output header is the input header, unchanged.
+    }
+
+    const size_t source_bucket_count;
+};
+
+}
diff --git a/src/Processors/QueryPlan/GatherReceiveStep.cpp b/src/Processors/QueryPlan/GatherReceiveStep.cpp
new file mode 100644
index 000000000000..595035113846
--- /dev/null
+++ b/src/Processors/QueryPlan/GatherReceiveStep.cpp
@@ -0,0 +1,88 @@
+#include <Processors/QueryPlan/GatherReceiveStep.h>
+#include <Processors/QueryPlan/QueryPlanStepRegistry.h>
+#include <Processors/Sources/NativeCompressedSource.h>
+#include <Processors/QueryPlan/Serialization.h>
+#include <Processors/QueryPlan/IParameterLookup.h>
+#include <Processors/QueryPlan/ExchangeLookup.h>
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+#include <Processors/Merges/MergingSortedTransform.h>
+#include <QueryPipeline/QueryPipelineBuilder.h>
+#include <QueryPipeline/Pipe.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ReadHelpers.h>
+#include <Core/SortDescription.h>
+#include <Core/Defines.h>
+
+#include <optional>
+
+
+namespace DB
+{
+
+void GatherReceiveStep::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings & settings)
+{
+    /// Read from all buckets: every source bucket of this exchange, destination bucket 0.
+    Pipes pipes;
+    for (size_t source_bucket = 0; source_bucket != num_buckets; ++source_bucket)
+        pipes.push_back(Pipe(settings.exchange_lookup->createSource(output_header, ExchangeStreamId(exchange_id, source_bucket, 0))));
+
+    pipeline.init(Pipe::unitePipes(std::move(pipes)));
+
+    /// Merging is needed only when an order has to be kept and there is more than one stream.
+    if (!maintain_sort_description || pipeline.getNumStreams() <= 1)
+        return;
+
+    /// Merge the per-bucket streams into one stream according to the sort description.
+    auto merging_transform = std::make_shared<MergingSortedTransform>(
+        output_header,
+        num_buckets,
+        *maintain_sort_description,
+        /* merge_block_size_rows */ DEFAULT_BLOCK_SIZE,
+        /* merge_block_size_bytes */ 0,
+        /* max_dynamic_subcolumns */ std::nullopt,
+        SortingQueueStrategy::Batch,
+        /* limit */ 0,
+        /* always_read_till_end */ false,
+        /* rows_sources_write_buf */ nullptr,
+        /* filter_column_name */ std::nullopt,
+        /* blocks_are_granules_size */ false);
+
+    pipeline.addTransform(std::move(merging_transform));
+}
+
+void GatherReceiveStep::serialize(Serialization & ctx) const
+{
+    writeStringBinary(exchange_id, ctx.out);
+    writeVarUInt(num_buckets, ctx.out);
+    writeVarUInt(maintain_sort_description.has_value(), ctx.out);  /// Presence flag for the optional sort description.
+    if (maintain_sort_description.has_value())
+        serializeSortDescription(*maintain_sort_description, ctx.out);  /// Written only when the flag is set.
+}
+
+std::unique_ptr<IQueryPlanStep> GatherReceiveStep::deserialize(Deserialization & ctx)
+{
+    /// Fields come in the order `serialize` wrote them.
+    String id;
+    readStringBinary(id, ctx.in);
+
+    size_t bucket_count = 0;
+    readVarUInt(bucket_count, ctx.in);
+
+    /// A flag tells whether a sort description follows.
+    bool has_sort_description = false;
+    readVarUInt(has_sort_description, ctx.in);
+
+    std::optional<SortDescription> sort_description;
+    if (has_sort_description)
+        deserializeSortDescription(sort_description.emplace(), ctx.in);
+
+    return std::make_unique<GatherReceiveStep>(ctx.output_header, id, bucket_count, std::move(sort_description));
+}
+
+void registerGatherReceiveStep(QueryPlanStepRegistry & registry);  /// Declared right before the definition; presumably to satisfy missing-prototype warnings - TODO confirm.
+void registerGatherReceiveStep(QueryPlanStepRegistry & registry)
+{
+    registry.registerStep("GatherReceive", GatherReceiveStep::deserialize);  /// Same name as returned by `getName()`.
+}
+
+}
diff --git a/src/Processors/QueryPlan/GatherReceiveStep.h b/src/Processors/QueryPlan/GatherReceiveStep.h
new file mode 100644
index 000000000000..0314a35b6f30
--- /dev/null
+++ b/src/Processors/QueryPlan/GatherReceiveStep.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <Processors/QueryPlan/ISourceStep.h>
+
+namespace DB
+{
+
+/// Receive part of GatherExchangeStep: a source step that reads the exchange streams of all source buckets.
+class GatherReceiveStep : public ISourceStep
+{
+public:
+    GatherReceiveStep(SharedHeader header_, const String & exchange_id_, size_t num_buckets_,
+                      std::optional<SortDescription> maintain_sort_description_ = std::nullopt)
+        : ISourceStep(std::move(header_))
+        , exchange_id(exchange_id_)
+        , num_buckets(num_buckets_)
+        , maintain_sort_description(std::move(maintain_sort_description_))
+    {
+    }
+
+    String getName() const override { return "GatherReceive"; }
+
+    void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings & settings) override;
+
+    void serialize(Serialization & ctx) const override;
+    bool isSerializable() const override { return true; }
+
+    static std::unique_ptr<IQueryPlanStep> deserialize(Deserialization & ctx);
+
+private:
+    const String exchange_id;  /// Logical id of the exchange; the paired GatherSendStep is created with the same id.
+    const size_t num_buckets;  /// Number of source buckets; they are read as buckets 0 .. num_buckets - 1.
+    const std::optional<SortDescription> maintain_sort_description;  /// When set, incoming streams are merged in this order.
+};
+
+
+}
diff --git a/src/Processors/QueryPlan/GatherSendStep.cpp b/src/Processors/QueryPlan/GatherSendStep.cpp
new file mode 100644
index 000000000000..dbeaac25c70c
--- /dev/null
+++ b/src/Processors/QueryPlan/GatherSendStep.cpp
@@ -0,0 +1,60 @@
+#include <Processors/QueryPlan/GatherSendStep.h>
+#include <Processors/QueryPlan/QueryPlanStepRegistry.h>
+#include <Processors/Sinks/NativeCompressedSink.h>
+#include <Processors/QueryPlan/Serialization.h>
+#include <Processors/QueryPlan/IParameterLookup.h>
+#include <Processors/QueryPlan/ExchangeLookup.h>
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+#include <QueryPipeline/QueryPipelineBuilder.h>
+#include <QueryPipeline/Pipe.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ReadHelpers.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+QueryPipelineBuilderPtr GatherSendStep::updatePipeline(QueryPipelineBuilders pipelines, const BuildQueryPipelineSettings & settings)
+{
+    if (pipelines.size() != 1)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "GatherSendStep expects single input step");
+
+    /// The stream goes from the bucket of the current task to destination bucket "0".
+    const String source_bucket = settings.parameter_lookup->getParameter("bucket_id").safeGet<String>();
+    const ExchangeStreamId stream_id(exchange_id, source_bucket, "0");
+
+    /// Cannot have multiple sinks writing to the same file concurrently.
+    auto & builder = *pipelines.front();
+    builder.resize(1);
+    builder.setSinks([&](const SharedHeader & header, Pipe::StreamType stream_type) -> ProcessorPtr
+    {
+        chassert(stream_type == Pipe::StreamType::Main);
+        return settings.exchange_lookup->createSink(header, stream_id);
+    });
+
+    return std::move(pipelines.front());
+}
+
+void GatherSendStep::serialize(Serialization & ctx) const
+{
+    writeStringBinary(exchange_id, ctx.out);  /// The only field written; the input header is not part of the payload.
+}
+
+std::unique_ptr<IQueryPlanStep> GatherSendStep::deserialize(Deserialization & ctx)
+{
+    String exchange_id;
+    readStringBinary(exchange_id, ctx.in);  /// The only field `serialize` writes.
+    return std::make_unique<GatherSendStep>(ctx.input_headers.front(), exchange_id);  /// The input header is taken from the context.
+}
+
+void registerGatherSendStep(QueryPlanStepRegistry & registry);  /// Declared right before the definition; presumably to satisfy missing-prototype warnings - TODO confirm.
+void registerGatherSendStep(QueryPlanStepRegistry & registry)
+{
+    registry.registerStep("GatherSend", GatherSendStep::deserialize);  /// Same name as returned by `getName()`.
+}
+
+}
diff --git a/src/Processors/QueryPlan/GatherSendStep.h b/src/Processors/QueryPlan/GatherSendStep.h
new file mode 100644
index 000000000000..0d6c93825390
--- /dev/null
+++ b/src/Processors/QueryPlan/GatherSendStep.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <Processors/QueryPlan/IQueryPlanStep.h>
+
+
+namespace DB
+{
+
+/// Send part of GatherExchangeStep: writes the input of the current task into a single exchange sink.
+class GatherSendStep final : public IQueryPlanStep
+{
+public:
+    GatherSendStep(SharedHeader input_header_, const String & exchange_id_)
+        : exchange_id(exchange_id_)
+    {
+        updateInputHeaders({std::move(input_header_)});
+    }
+
+    String getName() const override { return "GatherSend"; }
+
+    bool hasOutputStream() const { return false; }  /// NOTE(review): not marked `override`; confirm what the base class declares.
+
+    QueryPipelineBuilderPtr updatePipeline(QueryPipelineBuilders pipelines, const BuildQueryPipelineSettings & settings) override;
+
+    void serialize(Serialization & ctx) const override;
+    bool isSerializable() const override { return true; }
+
+    static std::unique_ptr<IQueryPlanStep> deserialize(Deserialization & ctx);
+
+private:
+    void updateOutputHeader() override {}  /// Intentionally empty: the step ends in a sink.
+
+    const String exchange_id;  /// Logical id of the exchange; the paired GatherReceiveStep is created with the same id.
+};
+
+}
diff --git a/src/Processors/QueryPlan/IParameterLookup.h b/src/Processors/QueryPlan/IParameterLookup.h
new file mode 100644
index 000000000000..772e864f442b
--- /dev/null
+++ b/src/Processors/QueryPlan/IParameterLookup.h
@@ -0,0 +1,19 @@
+#pragma once
+#include <Core/Field.h>
+#include <boost/noncopyable.hpp>
+
+namespace DB
+{
+
+/// Interface for looking up query plan parameters by name when building query pipeline.
+/// Example use case: when executing distributed query plan we run many tasks with the same query plan on different partitions
+/// of data. Each of these tasks will have exactly the same query plan, but a different value of a parameter like `bucket_id` that
+/// will determine which partition of data the particular task will read from.
+struct IParameterLookup : boost::noncopyable
+{
+    virtual ~IParameterLookup() = default;
+
+    virtual Field getParameter(const String & name) const = 0;  /// E.g. the send/receive steps read "bucket_id" as a String.
+};
+
+}
diff --git a/src/Processors/QueryPlan/JoinStepLogical.h b/src/Processors/QueryPlan/JoinStepLogical.h
index c6ded27bdd2c..1a719926e7a1 100644
--- a/src/Processors/QueryPlan/JoinStepLogical.h
+++ b/src/Processors/QueryPlan/JoinStepLogical.h
@@ -104,6 +104,7 @@ class JoinStepLogical final : public IQueryPlanStep
 
     void serializeSettings(QueryPlanSerializationSettings & settings) const override;
     void serialize(Serialization & ctx) const override;
+    bool isSerializable() const override { return true; }
 
     static QueryPlanStepPtr deserialize(Deserialization & ctx);
 
diff --git a/src/Processors/QueryPlan/LogicalExchangeStep.h b/src/Processors/QueryPlan/LogicalExchangeStep.h
new file mode 100644
index 000000000000..3e81d95c1e92
--- /dev/null
+++ b/src/Processors/QueryPlan/LogicalExchangeStep.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <Processors/QueryPlan/IQueryPlanStep.h>
+#include <Processors/QueryPlan/ITransformingStep.h>
+#include <Core/SortDescription.h>
+
+#include <optional>
+
+namespace DB
+{
+
+/// Base class for logical exchange steps.
+/// Derived classes implement the createSinkAndSourcePair method that is used to create a pair of send-receive steps when
+/// converting a logical plan to a distributed plan.
+/// By default the data that is sent via the exchange might be reordered, but in cases like distributed sorting it is required to
+/// merge incoming sorted streams according to the sort description.
+class LogicalExchangeStep : public ITransformingStep
+{
+protected:
+    explicit LogicalExchangeStep(SharedHeader input_header_, std::optional<SortDescription> maintain_sort_description_ = std::nullopt)
+        : ITransformingStep(input_header_, input_header_, {})
+        , maintain_sort_description(std::move(maintain_sort_description_))
+    {
+    }
+
+public:
+
+    /// Number of buckets before the exchange. E.g. 1 for ScatterExchange
+    virtual size_t getSourceBucketCount() const = 0;
+    /// Number of buckets after the exchange. E.g. 1 for GatherExchange, num_buckets for ShuffleExchange.
+    virtual size_t getResultBucketCount() const = 0;
+
+    const std::optional<SortDescription> & getMaintainSortDescription() const
+    {
+        return maintain_sort_description;
+    }
+
+    /// Create a pair of sink and source steps for the exchange.
+    /// They are "connected" to each other via exchange_id
+    virtual std::pair<QueryPlanStepPtr, QueryPlanStepPtr> createSinkAndSourcePair(const String & exchange_id, const Strings & source_shards) const = 0;
+
+protected:
+    /// Describes required sort order of the output. Input(s) must also be sorted according to this description.
+    std::optional<SortDescription> maintain_sort_description;
+};
+
+}
diff --git a/src/Processors/QueryPlan/ObjectFilterStep.h b/src/Processors/QueryPlan/ObjectFilterStep.h
index ef35d20068ba..c77b56d8d171 100644
--- a/src/Processors/QueryPlan/ObjectFilterStep.h
+++ b/src/Processors/QueryPlan/ObjectFilterStep.h
@@ -22,6 +22,7 @@ class ObjectFilterStep : public IQueryPlanStep
     const String & getFilterColumnName() const { return filter_column_name; }
 
     void serialize(Serialization & ctx) const override;
+    bool isSerializable() const override { return true; }
 
     static std::unique_ptr<IQueryPlanStep> deserialize(Deserialization & ctx);
 
diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h
index bce3693b4bf0..c79df6956bee 100644
--- a/src/Processors/QueryPlan/Optimizations/Optimizations.h
+++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h
@@ -49,6 +49,13 @@ struct Optimization
 
         // parallel replicas
         bool parallel_replicas_filter_pushdown = false;
+
+        /// Top-K optimizations rely on a runtime `TopKThresholdTracker` shared between
+        /// `SortingStep` and `ReadFromMergeTree`, and the dynamic-filtering path adds
+        /// an internal `__topKFilter` function that is not registered in `FunctionFactory`.
+        /// Neither can survive serialization to remote workers, so we suppress the
+        /// optimization when the plan is going to be distributed.
+        bool make_distributed_plan = false;
     };
 
     using Function = size_t (*)(QueryPlan::Node *, QueryPlan::Nodes &, const ExtraSettings &);
diff --git a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp
index a7154d062854..e07e0e7dcc88 100644
--- a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp
+++ b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp
@@ -1,5 +1,8 @@
 #include <Core/ServerSettings.h>
 #include <Core/Settings.h>
+#include <Common/getMultipleKeysFromConfig.h>
+#include <Poco/Util/AbstractConfiguration.h>
+
 #include <Interpreters/Cluster.h>
 #include <Interpreters/Context.h>
 #include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h>
@@ -19,6 +22,7 @@ namespace Setting
     extern const SettingsBool enable_join_runtime_filters;
     extern const SettingsBool force_optimize_projection;
     extern const SettingsBool make_distributed_plan;
+    extern const SettingsBool distributed_plan_execute_locally;
     extern const SettingsBool optimize_aggregation_in_order;
     extern const SettingsBool optimize_distinct_in_order;
     extern const SettingsBool optimize_read_in_order;
@@ -76,7 +80,7 @@ namespace Setting
     extern const SettingsUInt64 automatic_parallel_replicas_mode;
     extern const SettingsUInt64 merge_tree_min_bytes_per_task_for_remote_reading;
     extern const SettingsString cluster_for_parallel_replicas;
-    extern const SettingsUInt64 distributed_plan_default_reader_bucket_count;
+    extern const SettingsNonZeroUInt64 distributed_plan_default_reader_bucket_count;
     extern const SettingsUInt64 distributed_plan_max_rows_to_broadcast;
     extern const SettingsUInt64 join_runtime_bloom_filter_bytes;
     extern const SettingsUInt64 join_runtime_bloom_filter_hash_functions;
@@ -169,11 +173,25 @@ QueryPlanOptimizationSettings::QueryPlanOptimizationSettings(
     is_parallel_replicas_initiator_with_projection_support = is_parallel_replicas_initiator_with_projection_support_;
 
     make_distributed_plan = from[Setting::make_distributed_plan];
+
+    /// The implicit count/minmax projection counts a whole part from metadata; a distributed read
+    /// buckets the part, so the projection would be counted once per bucket and multiply the result.
+    /// Disable it for distributed plans (also forced off when a worker re-optimizes a fragment).
+    if (make_distributed_plan)
+        optimize_use_implicit_projections = false;
+
+    distributed_plan_execute_locally = from[Setting::distributed_plan_execute_locally];
     distributed_plan_default_shuffle_join_bucket_count = from[Setting::distributed_plan_default_shuffle_join_bucket_count];
     distributed_plan_default_reader_bucket_count = from[Setting::distributed_plan_default_reader_bucket_count];
     distributed_plan_optimize_exchanges = from[Setting::distributed_plan_optimize_exchanges];
 #ifdef OS_LINUX
     distributed_plan_force_exchange_kind = from[Setting::distributed_plan_force_exchange_kind].value;
+    if (!distributed_plan_force_exchange_kind.empty()
+        && distributed_plan_force_exchange_kind != "Persisted"
+        && distributed_plan_force_exchange_kind != "Streaming")
+        throw Exception(ErrorCodes::INVALID_SETTING_VALUE,
+            "Setting `distributed_plan_force_exchange_kind` must be empty, 'Persisted', or 'Streaming', got '{}'",
+            distributed_plan_force_exchange_kind);
 #else
     if (from[Setting::distributed_plan_force_exchange_kind].changed && from[Setting::distributed_plan_force_exchange_kind].value != "Persisted")
         throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Only Persisted exchange is supported");
@@ -252,5 +270,20 @@ QueryPlanOptimizationSettings::QueryPlanOptimizationSettings(ContextPtr from)
             if (auto nodes = cluster->getAnyShardInfo().getAllNodeCount())
                 max_parallel_replicas = std::min<size_t>(nodes, max_parallel_replicas);
     }
+
+#ifdef OS_LINUX
+    /// Auto-select the exchange kind when it is not forced: use Streaming only when its listener will
+    /// run (both the port and a listen host are configured), otherwise Persisted. This avoids planning
+    /// Streaming exchanges that would connect to a listener that was never started. A forced kind and
+    /// local execution (which routes exchanges through in-memory queues) are left untouched.
+    if (distributed_plan_force_exchange_kind.empty() && !distributed_plan_execute_locally)
+    {
+        const auto & config = from->getConfigRef();
+        const bool streaming_listener_configured =
+            config.getUInt("distributed_query.streaming_exchange_port", 0) != 0
+            && !getMultipleValuesFromConfig(config, "distributed_query", "streaming_exchange_listen_host").empty();
+        distributed_plan_force_exchange_kind = streaming_listener_configured ? "Streaming" : "Persisted";
+    }
+#endif
 }
 }
diff --git a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h
index 7459845687e8..a1246cc5013b 100644
--- a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h
+++ b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h
@@ -89,7 +89,8 @@ struct QueryPlanOptimizationSettings
     bool query_plan_join_shard_by_pk_ranges;
 
     bool make_distributed_plan = false;
-    bool distributed_plan_singe_stage = false;  /// For debugging purposes: force distributed plan to be single-stage
+    bool distributed_plan_execute_locally = false;  /// Run all distributed plan tasks locally (debugging)
+    bool distributed_plan_single_stage = false;  /// For debugging purposes: force distributed plan to be single-stage
     UInt64 distributed_plan_default_shuffle_join_bucket_count = 8;
     UInt64 distributed_plan_default_reader_bucket_count = 8; /// Default bucket count for read steps in distributed query plan
     bool distributed_plan_optimize_exchanges = true; /// Removes unnecessary exchanges in distributed query plan
diff --git a/src/Processors/QueryPlan/Optimizations/makeDistributed.cpp b/src/Processors/QueryPlan/Optimizations/makeDistributed.cpp
new file mode 100644
index 000000000000..4c436448749c
--- /dev/null
+++ b/src/Processors/QueryPlan/Optimizations/makeDistributed.cpp
@@ -0,0 +1,1053 @@
+#include <Processors/QueryPlan/JoinStep.h>
+#include <Processors/QueryPlan/ReadFromMergeTree.h>
+#if CLICKHOUSE_CLOUD
+#include <Processors/QueryPlan/ReadFromMergeTreeAtWorker.h>
+#endif
+#include <Processors/QueryPlan/ReadFromObjectStorageStep.h>
+#include <Processors/QueryPlan/BuildRuntimeFilterStep.h>
+#include <Processors/QueryPlan/ExpressionStep.h>
+#include <Processors/QueryPlan/FilterStep.h>
+#include <Processors/QueryPlan/AggregatingStep.h>
+#include <Processors/QueryPlan/MergingAggregatedStep.h>
+#include <Processors/QueryPlan/TotalsHavingStep.h>
+#include <Processors/QueryPlan/RollupStep.h>
+#include <Processors/QueryPlan/CubeStep.h>
+#include <Processors/QueryPlan/ExtremesStep.h>
+#include <Processors/QueryPlan/UnionStep.h>
+#include <Processors/QueryPlan/IntersectOrExceptStep.h>
+#include <Processors/QueryPlan/Optimizations/Optimizations.h>
+#include <Processors/QueryPlan/Optimizations/Utils.h>
+#include <Processors/QueryPlan/JoinStepLogical.h>
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+#include <Processors/QueryPlan/ScatterExchangeStep.h>
+#include <Processors/QueryPlan/ShuffleExchangeStep.h>
+#include <Processors/QueryPlan/BroadcastExchangeStep.h>
+#include <Processors/QueryPlan/GatherExchangeStep.h>
+#include <Processors/QueryPlan/Optimizations/joinOrder.h>
+#include <DataTypes/getLeastSupertype.h>
+#include <Columns/ColumnConst.h>
+#include <Core/Block.h>
+#include <Core/Settings.h>
+#include <Common/logger_useful.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+    extern const int SUPPORT_IS_DISABLED;
+}
+
+namespace QueryPlanOptimizations
+{
+
+RelationStats estimateReadRowsCount(QueryPlan::Node & node, const ActionsDAG::Node * filter = nullptr);
+
+void tryMakeDistributedJoin(QueryPlan::Node & node, QueryPlan::Nodes & nodes, const QueryPlanOptimizationSettings & optimization_settings);
+void tryMakeDistributedAggregation(QueryPlan::Node & node, QueryPlan::Nodes & nodes, const QueryPlanOptimizationSettings & optimization_settings);
+void tryMakeDistributedSorting(QueryPlan::Node & node, QueryPlan::Nodes & nodes, const QueryPlanOptimizationSettings & optimization_settings);
+void tryMakeDistributedRead(QueryPlan::Node & node, QueryPlan::Nodes & nodes, const QueryPlanOptimizationSettings & optimization_settings);
+void tryReplaceScatterGatherWithShuffle(QueryPlan::Node * node);
+void optimizeExchanges(QueryPlan::Node & root);
+void materializeConstantsForSetOperationBranches(QueryPlan::Node & root, QueryPlan::Nodes & nodes);
+bool planHasUnsupportedDistributedStep(const QueryPlan::Node & root);
+void checkDistributedReadSupported(const QueryPlan::Node & root);
+Strings makeListOfShardsForReadStep(const IQueryPlanStep * read_step);
+String dumpQueryPlanShort(const QueryPlan & query_plan);
+DistributedQueryPlan makeDistributedPlan(QueryPlan::Nodes nodes, QueryPlan::Node * root, const QueryPlanOptimizationSettings & optimization_settings);
+
+/// Returns true if the plan contains a step the distributed pipeline cannot handle yet: WITH TOTALS
+/// (TotalsHaving) and extremes (ExtremesStep) need a separate stream that the exchange protocol does not
+/// carry, and ROLLUP/CUBE feed subtotals from a step the exchanges do not support. Such plans stay single-node.
+bool planHasUnsupportedDistributedStep(const QueryPlan::Node & root)
+{
+    std::vector<const QueryPlan::Node *> stack = {&root};   /// Explicit stack: iterative walk over every node reachable from root
+    while (!stack.empty())
+    {
+        const auto * node = stack.back();
+        stack.pop_back();
+        const auto * step = node->step.get();
+        /// These steps produce non-Main pipe streams (totals/extremes) or rely on a single-node
+        /// aggregation shape; exchanges only carry the Main stream, so keep such plans local.
+        if (typeid_cast<const TotalsHavingStep *>(step)
+            || typeid_cast<const RollupStep *>(step)
+            || typeid_cast<const CubeStep *>(step)
+            || typeid_cast<const ExtremesStep *>(step))
+            return true;
+        for (const auto * child : node->children)
+            stack.push_back(child);
+    }
+    return false;
+}
+
+/// Walks the whole plan and throws SUPPORT_IS_DISABLED for any ReadFromMergeTree a worker cannot reproduce:
+/// a pinned snapshot boundary (select_sequential_consistency) or the part-order virtual columns `_part_index` /
+/// `_part_starting_offset`. Done at planning time so it fails cleanly before the pipeline is built.
+void checkDistributedReadSupported(const QueryPlan::Node & root)
+{
+    std::vector<const QueryPlan::Node *> stack = {&root};   /// Explicit stack: iterative walk over every node reachable from root
+    while (!stack.empty())
+    {
+        const auto * node = stack.back();
+        stack.pop_back();
+
+        if (const auto * read = typeid_cast<const ReadFromMergeTree *>(node->step.get()))
+        {
+            if (read->hasPinnedBlockNumbers())
+                throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+                    "make_distributed_plan does not support a distributed read with a pinned block-number "
+                    "boundary (for example select_sequential_consistency)");
+
+            for (const auto & column : read->getAllColumnNames())
+                if (column == "_part_index" || column == "_part_starting_offset")
+                    throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+                        "make_distributed_plan does not support a distributed read exposing the {} virtual column", column);
+        }
+
+        for (const auto * child : node->children)
+            stack.push_back(child);
+    }
+}
+
+/// Replaces LogicalJoin step with a subtree like this (shuffle strategy shown; for a broadcast join the left
+/// side gets an "any" ScatterExchange and the right side a BroadcastExchange, without key expressions):
+///   GatherExchange
+///     LogicalJoin
+///       ScatterExchange by hash(join_key)
+///         Expression: compute join key for left source
+///         ...
+///       ScatterExchange by hash(join_key)
+///         Expression: compute join key for right source
+///         ...
+void tryMakeDistributedJoin(QueryPlan::Node & node, QueryPlan::Nodes & nodes, const QueryPlanOptimizationSettings & optimization_settings)
+{
+    /// Is this a join step?
+    auto * join_step = typeid_cast<JoinStepLogical *>(node.step.get());
+    if (!join_step)
+        return;
+
+    /// Joining two sources?
+    if (node.children.size() != 2)
+        return;
+
+    /// Check if join is possible to be distributed.
+    const auto & join_info = join_step->getJoinOperator();
+
+    /// Only joins with Unspecified or Global locality are distributed; any other locality is left as is.
+    if (join_info.locality != JoinLocality::Unspecified && join_info.locality != JoinLocality::Global)
+        return;
+
+    /// Must have at least one equi-join predicate (required for shuffle partitioning).
+    if (std::ranges::all_of(join_info.expression, [](const auto & expr) { return !expr.isFunction(JoinConditionOperator::Equals); }))
+        return;
+
+    QueryPlan::Node * source_a = node.children[0];
+    QueryPlan::Node * source_b = node.children[1];
+
+    auto row_count_b = estimateReadRowsCount(*source_b).estimated_rows;
+
+    enum DistributedJoinStrategy
+    {
+        Shuffle,
+        Broadcast
+    } strategy = Shuffle;
+
+    /// In a broadcast join, the right side is replicated to all workers and the
+    /// left side is scattered.  For RIGHT and FULL joins the right side can
+    /// produce unmatched output rows.  When the right side is replicated, every
+    /// worker independently decides which right rows are unmatched based only on
+    /// its local slice of the left side, producing duplicate unmatched rows.
+    const bool broadcast_unsafe
+        = join_info.kind == JoinKind::Right
+        || join_info.kind == JoinKind::Full;
+
+    /// Check if right table is small enough for broadcast; an unknown row estimate keeps the Shuffle strategy
+    if (!broadcast_unsafe && row_count_b && row_count_b <= optimization_settings.distributed_plan_max_rows_to_broadcast)
+        strategy = Broadcast;
+
+    QueryPlan::Node * exchange_scatter_a_node = nullptr;
+    QueryPlan::Node * exchange_scatter_b_node = nullptr;
+
+    size_t bucket_count = optimization_settings.distributed_plan_default_shuffle_join_bucket_count;
+
+    if (strategy == Broadcast)
+    {
+        LOG_DEBUG(getLogger("tryMakeDistributedJoin"),
+            "Estimated number of rows in right source: {}. Using broadcast join",
+            row_count_b.transform(toString<UInt64>).value_or("unknown"));
+
+        exchange_scatter_a_node = &nodes.emplace_back();
+        exchange_scatter_b_node = &nodes.emplace_back();
+
+        /// Add scatter exchange step above read from left source (no keys: "any" partitioning)
+        exchange_scatter_a_node->step = std::make_unique<ScatterExchangeStep>(source_a->step->getOutputHeader(), Names{}, bucket_count);
+        exchange_scatter_a_node->step->setStepDescription("any scatter");
+
+        /// Add broadcast exchange step above read from right source
+        exchange_scatter_b_node->step = std::make_unique<BroadcastExchangeStep>(source_b->step->getOutputHeader(), bucket_count);
+        exchange_scatter_b_node->step->setStepDescription("");
+    }
+    else
+    {
+        LOG_DEBUG(getLogger("tryMakeDistributedJoin"),
+            "Estimated number of rows in right source: {}. Using {} buckets for shuffle join",
+            row_count_b.transform(toString<UInt64>).value_or("unknown"),
+            bucket_count);
+
+        /// Extract expressions for calculating join on keys; give up (plan untouched) if they cannot be pre-calculated
+        auto key_dags = join_step->preCalculateKeys(source_a->step->getOutputHeader(), source_b->step->getOutputHeader());
+        if (!key_dags)
+            return;
+
+        auto get_node_name = [](const auto * e) { return e->result_name; };
+        auto join_keys_a = std::ranges::to<Names>(key_dags->first.keys | std::views::transform(get_node_name));
+        auto join_keys_b = std::ranges::to<Names>(key_dags->second.keys | std::views::transform(get_node_name));
+
+        /// For mismatched key types, hash both sides at their common supertype so matching
+        /// rows colocate. The cast is internal to scatter; output rows keep their types.
+        chassert(key_dags->first.keys.size() == key_dags->second.keys.size());
+        DataTypes hash_cast_types_a;
+        DataTypes hash_cast_types_b;
+        bool any_cast_needed = false;
+        for (size_t i = 0; i < key_dags->first.keys.size(); ++i)
+        {
+            const auto & left_type = key_dags->first.keys[i]->result_type;
+            const auto & right_type = key_dags->second.keys[i]->result_type;
+            if (left_type->equals(*right_type))
+            {
+                hash_cast_types_a.push_back(nullptr);   /// nullptr entry: this key needs no cast
+                hash_cast_types_b.push_back(nullptr);
+                continue;
+            }
+
+            DataTypePtr common_type;
+            try
+            {
+                common_type = getLeastSupertype(DataTypes{left_type, right_type});
+            }
+            catch (const Exception &)
+            {
+                return;     /// No common supertype: leave the join undistributed (nothing was modified yet)
+            }
+            hash_cast_types_a.push_back(common_type);
+            hash_cast_types_b.push_back(common_type);
+            any_cast_needed = true;
+        }
+        if (!any_cast_needed)
+        {
+            hash_cast_types_a.clear();      /// Pass empty lists when all key types already match
+            hash_cast_types_b.clear();
+        }
+
+        if (!isPassthroughActions(key_dags->first.actions_dag))
+            makeExpressionNodeOnTopOf(*source_a, std::move(key_dags->first.actions_dag), nodes, makeDescription("Calculate left join keys"));
+        if (!isPassthroughActions(key_dags->second.actions_dag))
+            makeExpressionNodeOnTopOf(*source_b, std::move(key_dags->second.actions_dag), nodes, makeDescription("Calculate right join keys"));
+
+        /// Add scatter exchange step above read from left source
+        exchange_scatter_a_node = &nodes.emplace_back();
+        exchange_scatter_a_node->step = std::make_unique<ScatterExchangeStep>(source_a->step->getOutputHeader(), join_keys_a, bucket_count, hash_cast_types_a);
+        exchange_scatter_a_node->step->setStepDescription(fmt::format("by hash([{}])", fmt::join(join_keys_a, ", ")), optimization_settings.max_step_description_length);
+
+        /// Add scatter exchange step above read from right source
+        exchange_scatter_b_node = &nodes.emplace_back();
+        exchange_scatter_b_node->step = std::make_unique<ScatterExchangeStep>(source_b->step->getOutputHeader(), join_keys_b, bucket_count, hash_cast_types_b);
+        exchange_scatter_b_node->step->setStepDescription(fmt::format("by hash([{}])", fmt::join(join_keys_b, ", ")), optimization_settings.max_step_description_length);
+    }
+
+    exchange_scatter_a_node->children = {source_a};
+    exchange_scatter_b_node->children = {source_b};
+
+    /// Move join step to a new node
+    auto & new_join_node = nodes.emplace_back();
+    new_join_node.step = std::move(node.step);
+    new_join_node.children = {exchange_scatter_a_node, exchange_scatter_b_node};
+
+    /// Add gather exchange step above join
+    QueryPlan::Node gather_node;
+    QueryPlanStepPtr exchange_gather_step = std::make_unique<GatherExchangeStep>(new_join_node.step->getOutputHeader(), bucket_count);
+    gather_node.step = std::move(exchange_gather_step);
+    gather_node.children = {&new_join_node};
+
+    /// Replace join node with gather node (in place, so the parent's pointer to `node` stays valid)
+    node = std::move(gather_node);
+}
+
+
+/// One way to parallelize aggregation is to split data into buckets by hash of aggregation keys.
+/// Then results of aggregation of all buckets can just be united.
+/// The other approach is to do partial aggregation on data into aggregation states regardless of how it is split and
+/// then gather partial results and merge them finalizing aggregation states.
+///
+/// In the first approach the AggregatingStep is replaced with a subtree like this:
+///
+///   GatherExchange
+///     AggregatingStep
+///       ScatterExchange by hash(aggregation_keys)
+///
+/// In the second approach the AggregatingStep is replaced with a subtree like this:
+///
+///   MergingAggregated (merge)
+///     GatherExchange
+///       Aggregating (partial)
+///         ScatterExchange (any)
+void tryMakeDistributedAggregation(QueryPlan::Node & node, QueryPlan::Nodes & nodes, const QueryPlanOptimizationSettings & optimization_settings)
+{
+    /// Is this an aggregating step?
+    auto * aggregating_step = typeid_cast<AggregatingStep *>(node.step.get());
+    if (!aggregating_step)
+        return;
+
+    /// Only one source is expected for aggregation step
+    if (node.children.size() != 1)
+        return;
+    QueryPlan::Node * source = node.children[0];
+
+    Names aggregation_keys = aggregating_step->getParams().keys;
+
+    /// A global GROUP BY limit can't be enforced once aggregation is split per bucket.
+    if (aggregating_step->getParams().max_rows_to_group_by != 0)
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+            "make_distributed_plan does not support aggregation with max_rows_to_group_by");
+
+    enum AggregationStrategy
+    {
+        PartialAggregation, /// Do partial aggregation and then merge aggregation states
+        Shuffle,            /// Partition data by aggregation keys and do aggregation in disjoint buckets, then just unite the results
+    } strategy = PartialAggregation;
+
+    /// Choose Shuffle when the estimated number of groups is high.
+    if (!aggregation_keys.empty())
+    {
+        auto input_stats = estimateReadRowsCount(*source);
+
+        /// Use max NDV among GROUP BY keys as a lower-bound estimate for groups.
+        std::optional<UInt64> estimated_groups;
+        for (const auto & key : aggregation_keys)
+        {
+            auto it = input_stats.column_stats.find(key);
+            if (it != input_stats.column_stats.end() && it->second.num_distinct_values > 0)
+                estimated_groups = std::max(estimated_groups.value_or(0), it->second.num_distinct_values);
+        }
+
+        /// Fall back to input row count as an upper bound when NDV is unavailable.
+        if (!estimated_groups && input_stats.estimated_rows)
+            estimated_groups = input_stats.estimated_rows;
+
+        if (estimated_groups && *estimated_groups > optimization_settings.distributed_plan_max_rows_to_broadcast)   /// Threshold reuses the broadcast row limit
+            strategy = Shuffle;
+        else if (!estimated_groups)
+            /// No stats at all - default to Shuffle to be safe.
+            strategy = Shuffle;
+    }
+
+    /// Fall back to the Shuffle strategy when partial aggregation is not supported (in-order aggregation or explicit sorting required)
+    const bool can_use_partial_aggregation = !aggregating_step->inOrder() && !aggregating_step->explicitSortingRequired();
+    if (!can_use_partial_aggregation)
+        strategy = Shuffle;
+
+    if (optimization_settings.distributed_plan_force_shuffle_aggregation && !aggregation_keys.empty())   /// The setting forces Shuffle only when there are keys to hash by
+        strategy = Shuffle;
+
+    /// Shuffle scatters by the full key set, so GROUPING SETS subtotals (over key subsets) would be
+    /// produced in several buckets and duplicated.
+    if (aggregating_step->isGroupingSets())
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+            "make_distributed_plan does not support GROUPING SETS aggregation");
+
+    if (strategy == PartialAggregation)
+    {
+        const size_t bucket_count = optimization_settings.distributed_plan_default_shuffle_join_bucket_count;    /// TODO: estimate number of buckets based on statistics and available nodes and memory
+
+        /// Add any-scatter
+        auto & exchange_scatter_node = nodes.emplace_back();
+        exchange_scatter_node.step = std::make_unique<ScatterExchangeStep>(source->step->getOutputHeader(), Names{}, bucket_count);
+        exchange_scatter_node.step->setStepDescription("any");
+        exchange_scatter_node.children = {source};
+
+        /// Params will be used by merge step
+        Aggregator::Params aggregator_params = aggregating_step->getParams();
+        GroupingSetsParamsList grouping_sets_params = aggregating_step->getGroupingSetsParamsList();
+        const bool has_grouping_sets = !grouping_sets_params.empty();
+
+        const bool should_produce_results_in_order_of_bucket_number = aggregating_step->shouldProduceResultsInBucketOrder();
+        const bool memory_bound_merging_of_aggregation_results_enabled = aggregating_step->usingMemoryBoundMerging();
+        const bool original_step_was_final = aggregating_step->getFinal();   /// Save whether the original AggregatingStep was final or partial
+
+        /// Convert Aggregation step to partial aggregation (a clone; the original step stays in `node` until it is replaced below)
+        auto & partial_aggregation_node = nodes.emplace_back();
+        partial_aggregation_node.step = aggregating_step->clone();
+        typeid_cast<AggregatingStep *>(partial_aggregation_node.step.get())->setFinal(false);
+        partial_aggregation_node.step->setStepDescription("partial");
+        partial_aggregation_node.children = {&exchange_scatter_node};
+
+        /// Add gather
+        auto & gather_node = nodes.emplace_back();
+        gather_node.step = std::make_unique<GatherExchangeStep>(partial_aggregation_node.step->getOutputHeader(), bucket_count);
+        gather_node.children = {&partial_aggregation_node};
+
+        /// Replace original aggregation step with MergingAggregated step
+        aggregator_params.only_merge = true;    /// Merge partial aggregation results
+        QueryPlanStepPtr final_aggregation_step = std::make_unique<MergingAggregatedStep>(
+            gather_node.step->getOutputHeader(),
+            aggregator_params,
+            grouping_sets_params,
+            /* final */ original_step_was_final,
+            /// Grouping sets don't work with distributed_aggregation_memory_efficient enabled (#43989)
+            optimization_settings.distributed_aggregation_memory_efficient && !has_grouping_sets,
+            aggregating_step->getTemporaryDataMergeThreads(),
+            should_produce_results_in_order_of_bucket_number,
+            aggregating_step->getMaxBlockSize(),
+            aggregating_step->getMaxBlockSizeForAggregationInOrder(),
+            memory_bound_merging_of_aggregation_results_enabled);
+
+        final_aggregation_step->setStepDescription("merge");
+        node.step = std::move(final_aggregation_step);
+        node.children = {&gather_node};
+    }
+    else if (strategy == Shuffle)
+    {
+        const size_t bucket_count = optimization_settings.distributed_plan_default_shuffle_join_bucket_count;    /// TODO: estimate number of buckets based on statistics and available nodes and memory
+
+        /// Add scatter exchange step above source
+        auto & exchange_scatter_node = nodes.emplace_back();
+        exchange_scatter_node.step = std::make_unique<ScatterExchangeStep>(source->step->getOutputHeader(), aggregation_keys, bucket_count);
+        exchange_scatter_node.step->setStepDescription(fmt::format("by hash([{}])", fmt::join(aggregation_keys, ", ")), optimization_settings.max_step_description_length);
+        exchange_scatter_node.children = {source};
+
+        /// Move aggregation step to a new node
+        auto & new_aggregation_node = nodes.emplace_back();
+        new_aggregation_node.step = std::move(node.step);
+        new_aggregation_node.children = {&exchange_scatter_node};
+
+        /// Add gather exchange step above aggregation
+        QueryPlan::Node gather_node;
+        QueryPlanStepPtr exchange_gather_step = std::make_unique<GatherExchangeStep>(new_aggregation_node.step->getOutputHeader(), bucket_count);
+        gather_node.step = std::move(exchange_gather_step);
+        gather_node.children = {&new_aggregation_node};
+
+        /// Replace aggregation node with gather node
+        node = std::move(gather_node);
+    }
+}
+
+/// Replaces SortingStep step with a subtree like this:
+///
+///   GatherExchange (merge sorted streams)
+///     SortingStep
+///       ScatterExchange (any partitioning)
+///
+/// NOTE: GatherExchange step is aware of the sort description and merges multiple sorted streams into one sorted stream.
+void tryMakeDistributedSorting(QueryPlan::Node & node, QueryPlan::Nodes & nodes, const QueryPlanOptimizationSettings & optimization_settings)
+{
+    /// Is this a sorting step?
+    auto * sorting_step = typeid_cast<SortingStep *>(node.step.get());
+    if (!sorting_step)
+        return;
+
+    /// Only one source is expected for sorting step
+    if (node.children.size() != 1)
+        return;
+    QueryPlan::Node * source = node.children[0];
+
+    const size_t bucket_count = optimization_settings.distributed_plan_default_shuffle_join_bucket_count;    /// TODO: estimate number of buckets based on statistics and available nodes and memory
+    auto sort_description = sorting_step->getSortDescription();   /// Taken before the step is moved out of `node` below
+
+    /// Add "any" scatter exchange step above source. It will allow to optimize out unnecessary shuffle if the input is already parallelized in any way.
+    /// TODO: need a special step with "any" partitioning?
+    auto & exchange_scatter_node = nodes.emplace_back();
+    exchange_scatter_node.step = std::make_unique<ScatterExchangeStep>(source->step->getOutputHeader(), Names{}, bucket_count);
+    exchange_scatter_node.step->setStepDescription("any scatter");
+    exchange_scatter_node.children = {source};
+
+    /// Move sorting step to a new node
+    auto & new_sorting_node = nodes.emplace_back();
+    new_sorting_node.step = std::move(node.step);
+    new_sorting_node.children = {&exchange_scatter_node};
+
+    /// Add merge sorted gather exchange step above sorting
+    QueryPlan::Node gather_node;
+    QueryPlanStepPtr exchange_gather_step = std::make_unique<GatherExchangeStep>(new_sorting_node.step->getOutputHeader(), bucket_count, sort_description);
+    exchange_gather_step->setStepDescription(fmt::format("sorted by ({})", dumpSortDescription(sort_description)), optimization_settings.max_step_description_length);
+    gather_node.step = std::move(exchange_gather_step);
+    gather_node.children = {&new_sorting_node};
+
+    /// Replace sorting node with gather node
+    node = std::move(gather_node);
+}
+
+/// Replaces ReadFromMergeTree step (or, in CLICKHOUSE_CLOUD builds, ReadFromObjectStorageStep) with a subtree like this:
+///
+///   GatherExchange
+///     (Distributed)ReadFromMergeTree
+void tryMakeDistributedRead(QueryPlan::Node & node, QueryPlan::Nodes & nodes, const QueryPlanOptimizationSettings & optimization_settings)
+{
+    /// Is this a read from MergeTree or from object storage?
+    auto * read_from_merge_tree_step = typeid_cast<ReadFromMergeTree *>(node.step.get());
+    auto * read_from_object_storage_step = typeid_cast<ReadFromObjectStorageStep *>(node.step.get());
+
+    if (!read_from_merge_tree_step && !read_from_object_storage_step)
+        return;
+
+    /// Should not have children
+    if (!node.children.empty())
+        return;
+
+    /// TODO: estimate number of buckets based on statistics and available nodes and memory
+    const size_t bucket_count = optimization_settings.distributed_plan_default_reader_bucket_count;
+
+    if (read_from_merge_tree_step)
+    {
+        /// Round-robin mark-range bucketing would split rows with the same sort key across buckets and
+        /// break FINAL dedup on engines with specialized merging (Replacing, Collapsing, ...). Fall back
+        /// to serial read until a correctness-preserving bucketing strategy exists.
+        if (read_from_merge_tree_step->isQueryWithFinal() &&
+            read_from_merge_tree_step->getMergeTreeData().merging_params.mode != MergeTreeData::MergingParams::Ordinary)
+            return;
+
+        /// Check if table is big enough for distributed read (when no analysis result is available the read is still distributed)
+        /// TODO: implement better logic for choosing number of parallel readers
+        auto analysis_result = read_from_merge_tree_step->selectRangesToRead();
+        if (analysis_result && analysis_result->selected_rows <= optimization_settings.distributed_plan_max_rows_to_broadcast)
+            return;
+
+        /// Switch the read step to distributed read (it is moved to a new node below)
+        read_from_merge_tree_step->setDistributedRead(bucket_count);
+    }
+    else if (read_from_object_storage_step)
+    {
+#if CLICKHOUSE_CLOUD
+        /// Check if table is big enough for distributed read
+        /// TODO: implement better logic for choosing number of parallel readers
+        if (read_from_object_storage_step->totalRows() <= optimization_settings.distributed_plan_max_rows_to_broadcast)
+            return;
+
+        read_from_object_storage_step->setDistributedRead(bucket_count);
+#else
+        return;
+#endif
+    }
+
+    auto & new_read_node = nodes.emplace_back();
+    new_read_node.step = std::move(node.step);
+
+    /// Add gather exchange step above read
+    QueryPlan::Node gather_node;
+    QueryPlanStepPtr exchange_gather_step = std::make_unique<GatherExchangeStep>(new_read_node.step->getOutputHeader(), bucket_count);
+    gather_node.step = std::move(exchange_gather_step);
+    gather_node.children = {&new_read_node};
+
+    /// Replace read node with gather node
+    node = std::move(gather_node);
+}
+
+
+/// If there is a Scatter step directly on top of a Gather step then they can be replaced with a Shuffle step that just
+/// repartitions data from the source set of buckets to the destination set of buckets.
+void tryReplaceScatterGatherWithShuffle(QueryPlan::Node * node)
+{
+    if (node->children.size() != 1)
+        return;
+
+    auto * scatter_step = typeid_cast<ScatterExchangeStep *>(node->step.get());
+    if (!scatter_step)
+        return;
+
+    auto * gather_step = typeid_cast<GatherExchangeStep *>(node->children[0]->step.get());
+    if (!gather_step)
+        return;
+
+    auto shuffle_step = std::make_unique<ShuffleExchangeStep>(node->children[0]->step->getOutputHeader(), scatter_step->getKeys(),
+        gather_step->getSourceBucketCount(), scatter_step->getResultBucketCount(), scatter_step->getHashCastTypes());
+    shuffle_step->setStepDescription(*scatter_step);
+    node->step = std::move(shuffle_step);   /// The scatter step is replaced by the shuffle step in the same node
+    node->children = std::move(node->children[0]->children);   /// Bypass the gather node: its children become this node's children
+}
+
+/// True if `column_name` is produced by `dag` as the unchanged input column of the same name (only
+/// possibly aliased). Such a column keeps its value, so a merge by it stays sort-preserving.
+static bool isSortColumnPreserved(const ActionsDAG & dag, const String & column_name)
+{
+    const auto * node = &dag.findInOutputs(column_name);
+    while (node->type == ActionsDAG::ActionType::ALIAS)   /// Look through a chain of aliases to the node being aliased
+        node = node->children.front();
+    return node->type == ActionsDAG::ActionType::INPUT && node->result_name == column_name;   /// Must end at an input with the very same name
+}
+
+/// 1. Moves exchanges where possible to parallelize more work. Example: if there is a Filter step on top of a GatherExchange step
+/// then filter step can be moved below the exchange step to allow parallel processing.
+/// 2. Removes unnecessary exchanges. Example: if there is a ShuffleExchange step on top of another exchange step then child
+/// exchange step can be removed.
+void optimizeExchanges(QueryPlan::Node & root)
+{
+    Stack stack;
+
+    stack.push_back({.node = &root});
+    while (!stack.empty())
+    {
+        auto & frame = stack.back();
+
+        /// Traverse all children first.
+        if (frame.next_child < frame.node->children.size())
+        {
+            auto next_frame = Frame{.node = frame.node->children[frame.next_child]};
+            ++frame.next_child;
+            stack.push_back(next_frame);
+            continue;
+        }
+        else /// After all children were processed
+        {
+            /// Try to push up GatherExchange above Expression or Filter step
+            if (frame.node->children.size() == 1 &&
+                (typeid_cast<ExpressionStep *>(frame.node->step.get()) ||
+                typeid_cast<FilterStep *>(frame.node->step.get()) ||
+                typeid_cast<BuildRuntimeFilterStep *>(frame.node->step.get())))
+            {
+                auto & child_node = *frame.node->children[0];
+                auto * gather_step = typeid_cast<GatherExchangeStep *>(child_node.step.get());
+                if (gather_step)
+                {
+                    SharedHeader expression_header = frame.node->step->getOutputHeader();
+
+                    const ActionsDAG * dag = nullptr;
+                    if (auto * expr = typeid_cast<ExpressionStep *>(frame.node->step.get()))
+                        dag = &expr->getExpression();
+                    else if (auto * filter = typeid_cast<FilterStep *>(frame.node->step.get()))
+                        dag = &filter->getExpression();
+
+                    bool can_move_gather_up = true;
+
+                    /// Per-block functions (`rowNumberInAllBlocks`, `blockNumber`, `nowInBlock`, ...)
+                    /// depend on the whole block stream; below a gather they would run per shard and
+                    /// produce different values. Keep such a step above the gather.
+                    if (dag && dagContainsNonDeterministicFunction(*dag))
+                        can_move_gather_up = false;
+
+                    /// Moving the sorted GatherExchange above the step is only valid if every sort column
+                    /// survives the step unchanged - otherwise GatherReceive would merge by a sort
+                    /// description that no longer matches the data. Expression/Filter may recompute or
+                    /// rename columns, so check their DAG; BuildRuntimeFilter leaves column values intact.
+                    if (can_move_gather_up && gather_step->getMaintainSortDescription())
+                    {
+                        const auto & sort_description = gather_step->getMaintainSortDescription().value();
+                        for (const auto & column : sort_description)
+                        {
+                            if (!expression_header->has(column.column_name)
+                                || (dag && !isSortColumnPreserved(*dag, column.column_name)))
+                            {
+                                can_move_gather_up = false;
+                                break;
+                            }
+                        }
+                    }
+
+                    if (can_move_gather_up)
+                    {
+                        std::swap(frame.node->step, child_node.step);
+                        frame.node->step->updateInputHeader(expression_header);
+                    }
+                }
+            }
+
+            tryReplaceScatterGatherWithShuffle(frame.node);
+
+            if (const auto * shuffle = dynamic_cast<const ShuffleExchangeStep *>(frame.node->step.get()))
+            {
+                /// Remove shuffle with empty keys as redundant
+                if (shuffle->getKeys().empty() && shuffle->getResultBucketCount() == shuffle->getSourceBucketCount())
+                {
+                    frame.node->step = std::move(frame.node->children[0]->step);
+                    frame.node->children = std::move(frame.node->children[0]->children);
+                }
+            }
+        }
+
+        stack.pop_back();
+    }
+}
+
+
+/// For a `UNION` branch whose output header has constant columns, puts an expression step on top
+/// that passes them through `materialize`, so the branch yields full columns only.
+static void materializeBranchConstants(QueryPlan::Node & branch, QueryPlan::Nodes & nodes)
+{
+    const auto & branch_header = branch.step->getOutputHeader();
+
+    /// A column counts as constant when it carries a const column object.
+    const auto is_constant = [](const auto & column_ptr) { return column_ptr && isColumnConst(*column_ptr); };
+
+    bool found_constant = false;
+    for (auto it = branch_header->begin(); !found_constant && it != branch_header->end(); ++it)
+        found_constant = is_constant(it->column);
+
+    /// Nothing to materialize: leave the branch untouched.
+    if (!found_constant)
+        return;
+
+    /// Start from a DAG over the header columns and rewrite only its constant outputs.
+    ActionsDAG dag(branch_header->getColumnsWithTypeAndName());
+    for (auto *& dag_output : dag.getOutputs())
+    {
+        if (!is_constant(dag_output->column))
+            continue;
+        dag_output = &dag.materializeNode(*dag_output);
+    }
+
+    makeExpressionNodeOnTopOf(branch, std::move(dag), nodes, makeDescription("Materialize constants for set operation"));
+}
+
+/// A serialized plan keeps only column names and types, so each step re-derives constness when the
+/// plan is deserialized. One set-operation branch may then carry a full column (aliased from an
+/// exchange, which produces full columns) while a sibling still carries a freshly computed constant,
+/// and the strict header check of `UnionStep` and `IntersectOrExceptStep` rejects the mismatch.
+/// Materializing constants on every branch makes all branches full, so they agree.
+///
+/// `IntersectOrExceptStep` is not itself distributed, but a materialized `UnionStep` beneath one of its
+/// inputs changes that input's constness, so it gets the same treatment to keep its header check
+/// satisfied when its input headers are refreshed.
+void materializeConstantsForSetOperationBranches(QueryPlan::Node & root, QueryPlan::Nodes & nodes)
+{
+    struct Visit
+    {
+        QueryPlan::Node * plan_node = nullptr;
+        size_t children_visited = 0;
+    };
+
+    std::vector<Visit> pending;
+    pending.push_back({.plan_node = &root});
+
+    while (!pending.empty())
+    {
+        auto & top = pending.back();
+
+        if (top.children_visited < top.plan_node->children.size())
+        {
+            /// Descend first (post-order). Fetch the child and bump the counter before pushing:
+            /// push_back may reallocate the stack and leave `top` dangling.
+            auto * next_child = top.plan_node->children[top.children_visited++];
+            pending.push_back({.plan_node = next_child});
+            continue;
+        }
+
+        auto & current = *top.plan_node;
+        auto * current_step = current.step.get();
+
+        if (typeid_cast<UnionStep *>(current_step) || typeid_cast<IntersectOrExceptStep *>(current_step))
+        {
+            for (auto * branch : current.children)
+                materializeBranchConstants(*branch, nodes);
+        }
+
+        /// A materialized branch has a new output header, so this step's input headers must be
+        /// refreshed to keep the plan consistent up to the root. All inputs are passed in one call:
+        /// a set-operation step rechecks its header on every update, so updating inputs one at a
+        /// time would run that check on a half-refreshed set.
+        const size_t child_count = current.children.size();
+        SharedHeaders refreshed_inputs;
+        refreshed_inputs.reserve(child_count);
+        bool any_input_changed = false;
+        for (size_t idx = 0; idx < child_count; ++idx)
+        {
+            auto output_header = current.children[idx]->step->getOutputHeader();
+            any_input_changed |= !blocksHaveEqualStructure(*current.step->getInputHeaders()[idx], *output_header);
+            refreshed_inputs.push_back(std::move(output_header));
+        }
+        if (any_input_changed)
+            current.step->updateInputHeaders(std::move(refreshed_inputs));
+
+        pending.pop_back();
+    }
+}
+
+
+/// Returns the shards a read step can be split into for parallel processing, or a single shard "0".
+Strings makeListOfShardsForReadStep(const IQueryPlanStep * read_step)
+{
+    if (const auto * merge_tree_read = dynamic_cast<const ReadFromMergeTree *>(read_step))
+        return merge_tree_read->getShardsForDistributedRead();
+
+#if CLICKHOUSE_CLOUD
+    if (const auto * object_storage_read = dynamic_cast<const ReadFromObjectStorageStep *>(read_step))
+        return object_storage_read->getShardsForDistributedRead();
+#endif
+
+    /// Any other read step is not distributed and is treated as exactly one shard.
+    Strings single_shard{"0"};
+    return single_shard;
+}
+
+String dumpQueryPlanShort(const QueryPlan & query_plan)
+{
+    /// Explain the plan with default options into an in-memory buffer and return its text.
+    WriteBufferFromOwnString explain_output;
+    const ExplainPlanOptions default_options{};
+    query_plan.explainPlan(explain_output, default_options);
+    return explain_output.str();
+}
+
+
+/// Builds a distributed plan by splitting the query plan into multiple stages connected by exchanges.
+/// Each exchange step is split into a sink (ends one fragment) and a source (starts the next one), so
+/// every stage gets its own plan fragment; the fragment left at the root becomes the "main" stage.
+DistributedQueryPlan makeDistributedPlan(QueryPlan::Nodes /*nodes*/, QueryPlan::Node * root, const QueryPlanOptimizationSettings & optimization_settings)
+{
+    auto logger = getLogger("makeDistributedPlan");
+
+    size_t exchange_id = 0;
+
+    DistributedQueryPlan distributed_plan;
+    DistributedQueryTask main_task;
+
+    QueryPlan plan_fragment;
+    std::unordered_map<String, String> main_stage_depends_on;
+
+    {
+        struct Frame
+        {
+            QueryPlan::Node * node = nullptr;
+            size_t next_child = 0;
+            std::vector<std::unique_ptr<QueryPlan>> child_plans{};
+            std::unordered_map<String, DistributedQueryTask> list_of_shards{};
+            std::unordered_map<String, String> depends_on_stages{};
+        };
+
+        std::vector<Frame> stack;
+        stack.push_back({.node = root});
+
+        std::unique_ptr<QueryPlan> current_plan = std::make_unique<QueryPlan>();
+        std::unordered_map<String, DistributedQueryTask> current_list_of_shards;     /// Tasks for shards that can be processed in parallel by the current_plan
+        std::unordered_map<String, String> current_stage_depends_on;
+
+        while (!stack.empty())
+        {
+            /// NOTE: frame cannot be safely used after stack was modified.
+            auto & frame = stack.back();
+
+            /// On entering the node.
+            if (frame.next_child == 0)
+            {
+                /// Nothing to do
+            }
+
+            /// Returned from child: its fragment is in current_plan, its tasks in current_list_of_shards
+            if (frame.next_child > 0)
+            {
+                if (frame.next_child == 1)
+                {
+                    /// First child, take its list of shards
+                    frame.list_of_shards = std::move(current_list_of_shards);
+                    current_list_of_shards = {};
+                }
+                else
+                {
+                    /// Check that this child plan has as many shards as the previous ones; report the plan just returned
+                    if (frame.list_of_shards.size() != current_list_of_shards.size())
+                        throw Exception(ErrorCodes::LOGICAL_ERROR, "Different list of shards in child plans {} and {}, last child plan: \n{}",
+                            frame.list_of_shards.size(), current_list_of_shards.size(),
+                            dumpQueryPlanShort(*current_plan));
+
+                    /// Merge parameters and input exchange streams of the child plan into the matching shard task
+                    for (auto & [shard, task] : current_list_of_shards)
+                    {
+                        auto it = frame.list_of_shards.find(shard);
+                        if (it == frame.list_of_shards.end())
+                            throw Exception(ErrorCodes::LOGICAL_ERROR, "Shard {} is missing in the list of shards", shard);
+
+                        it->second.parameters.parameters.insert(task.parameters.parameters.begin(), task.parameters.parameters.end());
+                        it->second.input_exchange_streams.insert(it->second.input_exchange_streams.end(),
+                            task.input_exchange_streams.begin(), task.input_exchange_streams.end());
+                    }
+                }
+
+                frame.child_plans.emplace_back(std::move(current_plan));
+                frame.depends_on_stages.insert(current_stage_depends_on.begin(), current_stage_depends_on.end());
+                current_stage_depends_on.clear();
+            }
+
+            /// Traverse next child
+            if (frame.next_child < frame.node->children.size())
+            {
+                auto next_frame = Frame{.node = frame.node->children[frame.next_child]};
+                ++frame.next_child;
+                stack.push_back(std::move(next_frame));
+                continue;
+            }
+
+            /// All children were traversed.
+            chassert(frame.next_child == frame.node->children.size());
+
+            if (frame.child_plans.size() > 1)
+            {
+                /// Step has multiple inputs
+                current_plan = std::make_unique<QueryPlan>();
+                current_plan->unitePlans(std::move(frame.node->step), std::move(frame.child_plans));
+            }
+            else if (frame.child_plans.size() == 1)
+            {
+                /// Step has only one input
+                current_plan = std::move(frame.child_plans.front());
+
+                const auto * exchange_step = dynamic_cast<const LogicalExchangeStep *>(frame.node->step.get());
+
+                if (exchange_step && !optimization_settings.distributed_plan_single_stage)
+                {
+                    /// Make unique name for the exchange
+                    const String stage_name = "stage_" + std::to_string(exchange_id);
+                    ExchangeDescription exchange_description;
+                    exchange_description.name = "exchange_" + std::to_string(exchange_id);
+                    ++exchange_id;
+                    exchange_description.kind = optimization_settings.distributed_plan_force_exchange_kind == "Persisted" ?
+                        ExchangeDescription::Kind::Persisted : ExchangeDescription::Kind::Streaming;
+                    exchange_description.source_bucket_count = frame.list_of_shards.size();
+                    exchange_description.destination_bucket_count = exchange_step->getResultBucketCount();
+
+                    distributed_plan.exchange_descriptions[exchange_description.name] = exchange_description;
+
+                    Strings source_shards;
+                    for (auto & [source_shard, _] : frame.list_of_shards)
+                        source_shards.push_back(source_shard);
+
+                    auto send_and_receive_steps = exchange_step->createSinkAndSourcePair(exchange_description.name, source_shards);
+                    send_and_receive_steps.first->setStepDescription(exchange_description.name, optimization_settings.max_step_description_length);
+                    send_and_receive_steps.second->setStepDescription(exchange_description.name, optimization_settings.max_step_description_length);
+
+                    Strings list_of_exchange_shards;
+                    list_of_exchange_shards.reserve(exchange_description.destination_bucket_count);
+                    for (size_t bucket = 0; bucket < exchange_description.destination_bucket_count; ++bucket)
+                        list_of_exchange_shards.push_back(toString(bucket));
+
+                    /// Finish current plan fragment with exchange sink
+                    current_plan->addStep(std::move(send_and_receive_steps.first));
+                    /// Create stage with the current plan fragment
+                    {
+                        DistributedQueryStage stage;
+
+                        /// Create a task for each of the current shards
+                        for (auto & [source_shard, source_task] : frame.list_of_shards)
+                        {
+                            source_task.task_id = stage_name + "_" + source_shard;
+
+                            /// List of output streams for the exchange sink
+                            for (const auto & destination_shard : list_of_exchange_shards)
+                                source_task.output_exchange_streams.emplace_back(ExchangeStreamId(exchange_description.name, source_shard, destination_shard));
+
+                            /// Move source tasks to the source stage
+                            stage.tasks.emplace_back(std::move(source_task));
+                        }
+                        stage.query_plan_fragment = std::move(*current_plan);
+                        distributed_plan.stages[stage_name] = std::move(stage);
+                        /// Add dependency from previous stages if any
+                        distributed_plan.stage_depends_on[stage_name] = std::move(frame.depends_on_stages);
+                        frame.depends_on_stages = {};
+                    }
+
+                    /// Prepare tasks for the next stage
+                    std::unordered_map<String, DistributedQueryTask> destination_stage_tasks;
+                    for (const auto & destination_shard : list_of_exchange_shards)
+                    {
+                        DistributedQueryTask destination_task;
+                        destination_task.parameters.parameters["bucket_id"] = Field(destination_shard);
+                        destination_task.parameters.parameters["total_buckets"] = Field(list_of_exchange_shards.size());
+
+                        /// List of input streams for the exchange source
+                        for (auto & [source_shard, source_task] : frame.list_of_shards)
+                            destination_task.input_exchange_streams.emplace_back(ExchangeStreamId(exchange_description.name, source_shard, destination_shard));
+
+                        destination_stage_tasks[destination_shard] = std::move(destination_task);
+                    }
+
+                    /// Add previous stage to the current list of dependencies
+                    frame.depends_on_stages.insert({stage_name, exchange_description.name});
+
+                    /// And start a new plan fragment with exchange source
+                    current_plan = std::make_unique<QueryPlan>();
+                    current_plan->addStep(std::move(send_and_receive_steps.second));
+                    frame.list_of_shards = std::move(destination_stage_tasks);
+                }
+                else
+                {
+                    /// Add current step on top of the current plan
+                    current_plan->addStep(std::move(frame.node->step));
+                }
+            }
+            else
+            {
+                /// No children, this means that this is a leaf step.
+
+                auto populate_shards = [&](std::vector<String> shards_for_read)
+                {
+                    for (size_t bucket = 0; bucket < shards_for_read.size(); ++bucket)
+                    {
+                        String shard_id = toString(bucket);
+                        DistributedQueryTask task;
+                        task.parameters.parameters["bucket_id"] = Field(shard_id);
+                        task.parameters.parameters["bucket_description"] = Field(shards_for_read[bucket]);
+                        task.parameters.parameters["total_buckets"] = Field(shards_for_read.size());
+                        frame.list_of_shards[shard_id] = std::move(task);
+                    }
+                };
+
+#if CLICKHOUSE_CLOUD
+                ReadFromMergeTree * read_merge_tree = typeid_cast<ReadFromMergeTree *>(frame.node->step.get());
+                if (read_merge_tree && !optimization_settings.distributed_plan_prefer_replicas_over_workers)
+                {
+                    auto worker_step = ReadFromMergeTreeAtWorker::createFrom(*read_merge_tree);
+                    auto shards_for_read = worker_step->getShardsForDistributedRead();
+
+                    current_plan = std::make_unique<QueryPlan>();
+                    current_plan->addStep(std::move(worker_step));
+
+                    populate_shards(std::move(shards_for_read));
+                }
+                else
+#endif
+                {
+                    auto shards_for_read = makeListOfShardsForReadStep(frame.node->step.get());
+
+                    current_plan = std::make_unique<QueryPlan>();
+                    current_plan->addStep(std::move(frame.node->step));
+
+                    populate_shards(std::move(shards_for_read));
+                }
+            }
+
+            current_stage_depends_on = std::move(frame.depends_on_stages);
+            current_list_of_shards = std::move(frame.list_of_shards);
+
+            LOG_TEST(logger, "Current plan:\n{}\nshard count: {}\n",
+                dumpQueryPlanShort(*current_plan), current_list_of_shards.size());
+
+            /// On leaving the last node.
+            if (stack.size() == 1)
+            {
+                plan_fragment = std::move(*current_plan);
+                main_stage_depends_on = std::move(current_stage_depends_on);
+                current_stage_depends_on = {};
+            }
+
+            stack.pop_back();
+        }
+
+        chassert(current_list_of_shards.size() == 1);
+        main_task = std::move(current_list_of_shards.begin()->second);
+    }
+
+    /// Add last plan fragment as the main stage
+    {
+        DistributedQueryStage stage;
+        stage.query_plan_fragment = std::move(plan_fragment);
+
+        main_task.task_id = "main";
+        stage.tasks.emplace_back(std::move(main_task));
+
+        distributed_plan.stages["main"] = std::move(stage);
+        distributed_plan.stage_depends_on["main"] = std::move(main_stage_depends_on);
+    }
+
+    return distributed_plan;
+}
+
+}
+
+}
diff --git a/src/Processors/QueryPlan/Optimizations/optimizeExtended.cpp b/src/Processors/QueryPlan/Optimizations/optimizeExtended.cpp
deleted file mode 100644
index 84816d06bddc..000000000000
--- a/src/Processors/QueryPlan/Optimizations/optimizeExtended.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <Processors/QueryPlan/QueryPlan.h>
-#include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h>
-
-namespace DB::QueryPlanOptimizations
-{
-
-/// public repo has dummy functions here for distributed processing, we use real implementations in private
-void tryMakeDistributedJoin(QueryPlan::Node &, QueryPlan::Nodes &, const QueryPlanOptimizationSettings &) {}
-void tryMakeDistributedAggregation(QueryPlan::Node &, QueryPlan::Nodes &, const QueryPlanOptimizationSettings &) {}
-void tryMakeDistributedSorting(QueryPlan::Node &, QueryPlan::Nodes &, const QueryPlanOptimizationSettings &) {}
-void tryMakeDistributedRead(QueryPlan::Node &, QueryPlan::Nodes &, const QueryPlanOptimizationSettings &) {}
-void optimizeExchanges(QueryPlan::Node &) {}
-
-}
diff --git a/src/Processors/QueryPlan/Optimizations/optimizeJoin.cpp b/src/Processors/QueryPlan/Optimizations/optimizeJoin.cpp
index eec6001aec3d..cd1e791f1b03 100644
--- a/src/Processors/QueryPlan/Optimizations/optimizeJoin.cpp
+++ b/src/Processors/QueryPlan/Optimizations/optimizeJoin.cpp
@@ -36,6 +36,11 @@
 #include <Processors/QueryPlan/Optimizations/joinOrder.h>
 
 #include <Storages/StorageMemory.h>
+#include <Processors/QueryPlan/ReadFromObjectStorageStep.h>
+
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+#include <Processors/QueryPlan/ShuffleExchangeStep.h>
+#include <Processors/QueryPlan/GatherExchangeStep.h>
 
 #include <algorithm>
 #include <limits>
@@ -232,7 +237,8 @@ RelationStats estimateAggregatingStepStats(const AggregatingStep & aggregating_s
     return aggregation_stats;
 }
 
-RelationStats estimateReadRowsCount(QueryPlan::Node & node, const ActionsDAG::Node * filter = nullptr)
+RelationStats estimateReadRowsCount(QueryPlan::Node & node, const ActionsDAG::Node * filter = nullptr);
+RelationStats estimateReadRowsCount(QueryPlan::Node & node, const ActionsDAG::Node * filter)
 {
     IQueryPlanStep * step = node.step.get();
     if (const auto * reading = typeid_cast<const ReadFromMergeTree *>(step))
@@ -293,6 +299,9 @@ RelationStats estimateReadRowsCount(QueryPlan::Node & node, const ActionsDAG::No
         return RelationStats{.estimated_rows = analyzed_result->selected_rows, .table_name = table_display_name};
     }
 
+    if (typeid_cast<const ReadFromObjectStorageStep *>(step))
+        return RelationStats{};
+
     if (const auto * reading = typeid_cast<const ReadFromMemoryStorageStep *>(step))
     {
         UInt64 estimated_rows = reading->getStorage()->totalRows({}).value_or(0);
@@ -441,6 +450,8 @@ bool convertLogicalJoinToPhysical(
     const QueryPlanOptimizationSettings & optimization_settings)
 {
     bool keep_logical = optimization_settings.keep_logical_steps;
+    /// Distributed plan keeps logical join steps. They are converted to physical steps afterwards, when a plan fragment is executed by a worker.
+    keep_logical |= optimization_settings.make_distributed_plan;
     if (keep_logical)
         return false;
     if (!typeid_cast<JoinStepLogical *>(node.step.get()))
diff --git a/src/Processors/QueryPlan/Optimizations/optimizeTopK.cpp b/src/Processors/QueryPlan/Optimizations/optimizeTopK.cpp
index bdd835ce4af9..13f9efd7bd21 100644
--- a/src/Processors/QueryPlan/Optimizations/optimizeTopK.cpp
+++ b/src/Processors/QueryPlan/Optimizations/optimizeTopK.cpp
@@ -22,6 +22,16 @@ namespace DB::QueryPlanOptimizations
 
 size_t tryOptimizeTopK(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes, const Optimization::ExtraSettings & settings)
 {
+    /// The dynamic-filtering path injects an internal `__topKFilter` function that
+    /// is created on demand with a runtime threshold tracker and is not registered
+    /// in `FunctionFactory`. The skip-index-on-data-read path likewise relies on a
+    /// `TopKThresholdTracker` shared between `SortingStep` and `ReadFromMergeTree`.
+    /// None of this can be transmitted to remote workers, so when the plan is
+    /// going to be distributed, the remote node would fail to deserialize the
+    /// plan with `Unknown function __topKFilter` (or run with stale state).
+    if (settings.make_distributed_plan)
+        return 0;
+
     QueryPlan::Node * node = parent_node;
 
     auto * limit_step = typeid_cast<LimitStep *>(node->step.get());
diff --git a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp
index 88004e01107f..ef333177ee38 100644
--- a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp
+++ b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp
@@ -10,10 +10,14 @@
 #include <Processors/QueryPlan/ReadFromLocalReplica.h>
 #include <Processors/QueryPlan/ReadFromMergeTree.h>
 #include <Processors/QueryPlan/SourceStepWithFilter.h>
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
 #include <Common/Exception.h>
 
 #include <memory>
 #include <stack>
+#include <unordered_map>
+#include <utility>
+#include <vector>
 
 namespace DB
 {
@@ -42,6 +46,7 @@ namespace ErrorCodes
 extern const int INCORRECT_DATA;
 extern const int TOO_MANY_QUERY_PLAN_OPTIMIZATIONS;
 extern const int PROJECTION_NOT_USED;
+extern const int SUPPORT_IS_DISABLED;
 }
 
 namespace QueryPlanOptimizations
@@ -83,6 +88,7 @@ void optimizeTreeFirstPass(const QueryPlanOptimizationSettings & optimization_se
         optimization_settings.max_limit_for_top_k_optimization,
         optimization_settings.use_skip_indexes_on_data_read,
         optimization_settings.parallel_replicas_filter_pushdown,
+        optimization_settings.make_distributed_plan,
     };
 
     while (!stack.empty())
@@ -172,6 +178,9 @@ void tryMakeDistributedAggregation(QueryPlan::Node & node, QueryPlan::Nodes & no
 void tryMakeDistributedSorting(QueryPlan::Node & node, QueryPlan::Nodes & nodes, const QueryPlanOptimizationSettings & optimization_settings);
 void tryMakeDistributedRead(QueryPlan::Node & node, QueryPlan::Nodes & nodes, const QueryPlanOptimizationSettings & optimization_settings);
 void optimizeExchanges(QueryPlan::Node & root);
+void materializeConstantsForSetOperationBranches(QueryPlan::Node & root, QueryPlan::Nodes & nodes);
+bool planHasUnsupportedDistributedStep(const QueryPlan::Node & root);
+void checkDistributedReadSupported(const QueryPlan::Node & root);
 
 void optimizeTreeSecondPass(
     const QueryPlanOptimizationSettings & optimization_settings, QueryPlan::Node & root, QueryPlan::Nodes & nodes, QueryPlan & query_plan)
@@ -192,6 +201,7 @@ void optimizeTreeSecondPass(
         optimization_settings.max_limit_for_top_k_optimization,
         optimization_settings.use_skip_indexes_on_data_read,
         optimization_settings.parallel_replicas_filter_pushdown,
+        optimization_settings.make_distributed_plan,
     };
 
     Stack stack;
@@ -282,6 +292,17 @@ void optimizeTreeSecondPass(
             });
     }
 
+    /// WITH TOTALS / ROLLUP / CUBE / extremes produce extra streams the exchange protocol does not
+    /// carry, so such plans cannot be distributed. make_distributed_plan is explicit, so fail rather
+    /// than silently running single-node.
+    if (optimization_settings.make_distributed_plan && planHasUnsupportedDistributedStep(root))
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+            "make_distributed_plan does not support WITH TOTALS, ROLLUP, CUBE or extremes");
+    /// Reject reads whose coordinator snapshot/part-order state a worker cannot reproduce.
+    if (optimization_settings.make_distributed_plan)
+        checkDistributedReadSupported(root);
+    const bool make_distributed_plan = optimization_settings.make_distributed_plan;
+
     traverseQueryPlan(stack, root,
         [&](auto & frame_node)
         {
@@ -294,7 +315,7 @@ void optimizeTreeSecondPass(
         [&](auto & frame_node)
         {
             /// After all children were processed, try to apply distributed read, join and aggregation optimizations.
-            if (optimization_settings.make_distributed_plan)
+            if (make_distributed_plan)
             {
                 tryMakeDistributedJoin(frame_node, nodes, optimization_settings);
                 tryMakeDistributedAggregation(frame_node, nodes, optimization_settings);
@@ -415,6 +436,11 @@ void optimizeTreeSecondPass(
     if (optimization_settings.make_distributed_plan && optimization_settings.distributed_plan_optimize_exchanges)
         optimizeExchanges(root);
 
+    /// Force set-operation branches to expose full columns so they agree after a fragment is serialized
+    /// and constness is re-derived per step.
+    if (optimization_settings.make_distributed_plan)
+        materializeConstantsForSetOperationBranches(root, nodes);
+
     /// Vector search first pass optimization sets up everything for vector index usage.
     /// In the 2nd pass, we optimize further by attempting to do an "index-only scan".
     if (optimization_settings.try_use_vector_search && !extra_settings.vector_search_with_rescoring)
@@ -524,5 +550,6 @@ void addStepsToBuildSets(
     }
 }
 
+
 }
 }
diff --git a/src/Processors/QueryPlan/QueryPlan.cpp b/src/Processors/QueryPlan/QueryPlan.cpp
index 35aad85cb4c0..bddd66e6bffa 100644
--- a/src/Processors/QueryPlan/QueryPlan.cpp
+++ b/src/Processors/QueryPlan/QueryPlan.cpp
@@ -2,19 +2,30 @@
 #include <memory>
 #include <stack>
 
+#include <Common/CurrentThread.h>
 #include <Common/JSONBuilder.h>
+#include <Common/logger_useful.h>
 
 #include <IO/Operators.h>
 #include <IO/WriteBuffer.h>
+#include <IO/WriteBufferFromString.h>
+#include <Interpreters/Context.h>
 
+#include <Processors/ConcatProcessor.h>
 #include <Processors/QueryPlan/BuildQueryPipelineSettings.h>
+#include <Processors/QueryPlan/ExchangeLookup.h>
 #include <Processors/QueryPlan/ExpressionStep.h>
+#include <Processors/QueryPlan/GatherSendStep.h>
 #include <Processors/QueryPlan/Optimizations/Optimizations.h>
 #include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h>
 #include <Processors/QueryPlan/QueryPlan.h>
 #include <Processors/QueryPlan/ReadFromMergeTree.h>
+#include <Processors/QueryPlan/ReadFromPreparedSource.h>
 #include <Processors/QueryPlan/QueryPlanVisitor.h>
+#include <Processors/Sources/DelayedSource.h>
+#include <Processors/Sources/ReadFromDistributedPlanSource.h>
 
+#include <QueryPipeline/DistributedPlanExecutor.h>
 #include <QueryPipeline/QueryPipelineBuilder.h>
 #include <Planner/Utils.h>
 
@@ -29,6 +40,33 @@ namespace DB
 namespace ErrorCodes
 {
     extern const int LOGICAL_ERROR;
+    extern const int SUPPORT_IS_DISABLED;
+}
+
+namespace
+{
+
+/// Every stage fragment reaches the workers as a serialized query plan, so each of its steps has to
+/// support serialization. Verify that up front, without serializing anything, so that an unsupported
+/// plan is rejected early with a precise message rather than late, mid-execution, with a generic one.
+void assertFragmentSerializable(const QueryPlan & fragment, const String & stage_name)
+{
+    const auto * fragment_root = fragment.getRootNode();
+    if (!fragment_root)
+        return;
+    std::vector<const QueryPlan::Node *> pending{fragment_root};
+    while (!pending.empty())
+    {
+        const auto * current = pending.back();
+        pending.pop_back();
+        if (current->step && !current->step->isSerializable())
+            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+                "make_distributed_plan cannot distribute this query: step '{}' in stage '{}' is not "
+                "serializable for remote execution", current->step->getName(), stage_name);
+        pending.insert(pending.end(), current->children.begin(), current->children.end());
+    }
+}
+
 }
 
 SettingsChanges ExplainPlanOptions::toSettingsChanges() const
@@ -183,6 +221,9 @@ QueryPipelineBuilderPtr QueryPlan::buildQueryPipeline(
     if (do_optimize)
         optimize(optimization_settings);
 
+    if (optimization_settings.make_distributed_plan)
+        convertToDistributed(optimization_settings);
+
     struct Frame
     {
         Node * node = {};
@@ -655,6 +696,157 @@ void QueryPlan::optimize(const QueryPlanOptimizationSettings & optimization_sett
         QueryPlanOptimizations::addStepsToBuildSets(optimization_settings, *this, *root, nodes);
 }
 
+namespace QueryPlanOptimizations
+{
+
+DistributedQueryPlan makeDistributedPlan(QueryPlan::Nodes nodes, QueryPlan::Node * root, const QueryPlanOptimizationSettings & optimization_settings);
+
+}
+
+/// Splits the plan into stages with makeDistributedPlan. A single stage simply replaces this plan; several
+/// stages are replaced by a stub plan that runs them and reads their output from the 'final_result' exchange.
+void QueryPlan::convertToDistributed(const QueryPlanOptimizationSettings & optimization_settings)
+{
+    SharedHeader result_header = root->step->getOutputHeader();
+
+    QueryPlan::Nodes old_nodes = std::move(nodes);
+    QueryPlan::Node * old_root = root;
+    root = nullptr;
+    auto distributed_plan = QueryPlanOptimizations::makeDistributedPlan(std::move(old_nodes), old_root, optimization_settings);
+
+    for (const auto & stage : distributed_plan.stages)
+    {
+        auto it = distributed_plan.stage_depends_on.find(stage.first);
+        const auto & dependencies = it != distributed_plan.stage_depends_on.end() ? it->second : std::unordered_map<String, String>{};
+        LOG_TRACE(getLogger("optimize"), "Distributed stage: '{}' depends on: [{}] plan:\n{}",
+            stage.first, fmt::join(dependencies, ", "), dumpQueryPlan(stage.second.query_plan_fragment));
+    }
+
+    if (distributed_plan.stages.size() == 1)
+    {
+        /// For now just replace the plan with the first and only fragment, but preserve
+        /// table locks and storage holders accumulated during planning.
+        QueryPlanResourceHolder preserved_resources = std::move(resources);
+        *this = std::move(distributed_plan.stages.begin()->second.query_plan_fragment);
+        /// QueryPlanResourceHolder's move-assignment appends rhs into lhs without dropping existing entries.
+        resources = std::move(preserved_resources);
+
+        QueryPlanOptimizationSettings local_settings = optimization_settings;
+        local_settings.make_distributed_plan = false;
+        QueryPlanOptimizations::optimizeTreeSecondPass(local_settings, *root, nodes, *this);
+    }
+    else
+    {
+        ExchangeDescription final_result_exchange
+        {
+            .name = "final_result",
+            .kind = optimization_settings.distributed_plan_force_exchange_kind == "Persisted" ? ExchangeDescription::Kind::Persisted : ExchangeDescription::Kind::Streaming,
+            .source_bucket_count = 1,
+            .destination_bucket_count = 1
+        };
+        auto result_stream_id = ExchangeStreamId(final_result_exchange.name, 0, 0);
+
+        auto main_stage_it = distributed_plan.stages.find("main"); /// Not operator[]: it would insert an empty stage
+        if (main_stage_it == distributed_plan.stages.end())
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Distributed query plan has no 'main' stage");
+        auto & main_stage = main_stage_it->second;
+        /// Add a step that sends the result of the main stage to the 'final_result' exchange
+        if (!main_stage.query_plan_fragment.isCompleted())
+        {
+            if (main_stage.tasks.empty())
+                throw Exception(ErrorCodes::LOGICAL_ERROR, "The 'main' stage of a distributed query plan has no tasks");
+            main_stage.query_plan_fragment.addStep(std::make_unique<GatherSendStep>(result_header, final_result_exchange.name));
+            main_stage.tasks.front().output_exchange_streams.emplace_back(result_stream_id);
+            distributed_plan.exchange_descriptions[final_result_exchange.name] = final_result_exchange;
+            distributed_plan.final_result_stream_name = result_stream_id.toString();
+        }
+
+        /// Fail early, before execution, if any fragment contains a step that cannot be serialized for remote execution.
+        for (const auto & [stage_name, stage] : distributed_plan.stages)
+            assertFragmentSerializable(stage.query_plan_fragment, stage_name);
+
+        /// Collect the list of all temporary files
+        Strings all_temporary_files_for_cleanup;
+        for (const auto & stage : distributed_plan.stages)
+        {
+            for (const auto & task : stage.second.tasks)
+            {
+                for (const auto & stream_id : task.output_exchange_streams)
+                    if (distributed_plan.exchange_descriptions.at(stream_id.exchange_id).kind == ExchangeDescription::Kind::Persisted)
+                        all_temporary_files_for_cleanup.push_back(stream_id.toString());
+            }
+        }
+
+        auto context = CurrentThread::tryGetQueryContext();
+        if (!context)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Query context is required to build a distributed query plan");
+        /// Local execution runs every task in-process and needs no worker hosts; constructing
+        /// TaskToHostMap would require a configured worker cluster and fail on a plain single server.
+        TaskToHostMapPtr task_to_host_map = optimization_settings.distributed_plan_execute_locally
+            ? nullptr
+            : std::make_shared<TaskToHostMap>(distributed_plan, context);
+
+        /// Generate a random unique id for the query: query_id from the context is user-provided and might be not unique
+        UUID unique_query_id = UUIDHelpers::generateV4();
+
+        /// Make plan stub that reads from the executor that executes the distributed plan
+        Pipe run_distributed_plan(std::make_shared<ReadFromDistributedPlanSource>(result_header, unique_query_id, std::move(distributed_plan), task_to_host_map));
+        Pipes pipes;
+        pipes.emplace_back(std::move(run_distributed_plan));
+
+        auto [object_storage, object_storage_path] = getObjectStorageForTemporaryFiles(toString(unique_query_id), context);
+
+        /// TODO: do this only if final_result_exchange is persisted
+        auto temporary_files = createTemporaryFilesLookup(
+            object_storage, object_storage_path, {result_stream_id.toString()}, {});
+
+        ExchangeDescriptions exchange_descriptions;
+        exchange_descriptions[final_result_exchange.name] = final_result_exchange;
+        auto exchange_lookup = createExchangeLookup(
+            toString(unique_query_id),
+            exchange_descriptions,
+            task_to_host_map ? ExchangeStreamSources{task_to_host_map->getExchangeStreamSourceHosts()} : ExchangeStreamSources{},
+            temporary_files,
+            context);
+
+        auto lazily_create_result_reader = [result_header, exchange_lookup, result_stream_id]() -> QueryPipelineBuilder
+        {
+            Pipe read_result_from(exchange_lookup->createSource(result_header, result_stream_id));
+            QueryPipelineBuilder builder;
+            builder.init(std::move(read_result_from));
+            return builder;
+        };
+        pipes.emplace_back(createDelayedPipe(result_header, lazily_create_result_reader, false, false));
+
+        Pipe inputs = Pipe::unitePipes(std::move(pipes));
+        /// For streaming exchange we start both inputs in parallel to let the main task send back the result to the initiator.
+        /// In case of persisted exchange use ConcatProcessor to first execute the whole distributed plan and after that read the result from the file.
+        if (final_result_exchange.kind == ExchangeDescription::Kind::Persisted)
+            inputs.addTransform(std::make_shared<ConcatProcessor>(inputs.getSharedHeader(), inputs.numOutputPorts()));
+
+        /// Plan stub that will be used if distributed plan is enabled
+        QueryPlan read_from_distributed;
+        read_from_distributed.addStep(std::make_unique<ReadFromPreparedSource>(std::move(inputs)));
+
+        /// Preserve original table locks and storage holders across the move-assign
+        /// so the final pipeline keeps the tables referenced by serialized fragments alive.
+        QueryPlanResourceHolder preserved_resources = std::move(resources);
+        *this = std::move(read_from_distributed);
+        resources = std::move(preserved_resources);
+
+        /// In-memory exchanges (execute_locally) must outlive the executor: the result reader drains
+        /// final_result after the driver has finished. Remove them when the pipeline resources go away.
+        resources.custom_resources.emplace_back(makeInMemoryExchangesCleaner(toString(unique_query_id)));
+
+        /// Add temporary files cleaner to the resources so that all temporary files are removed after the pipeline is executed
+        if (final_result_exchange.kind == ExchangeDescription::Kind::Persisted)
+            all_temporary_files_for_cleanup.push_back(result_stream_id.toString());
+
+        if (object_storage)
+            resources.custom_resources.emplace_back(makeTemporaryFilesCleaner(object_storage, object_storage_path, all_temporary_files_for_cleanup));
+    }
+}
+
 void QueryPlan::explainEstimate(MutableColumns & columns) const
 {
     checkInitialized();
diff --git a/src/Processors/QueryPlan/QueryPlan.h b/src/Processors/QueryPlan/QueryPlan.h
index 8397e9e3aeda..7aece7baa08b 100644
--- a/src/Processors/QueryPlan/QueryPlan.h
+++ b/src/Processors/QueryPlan/QueryPlan.h
@@ -220,4 +220,58 @@ struct QueryPlanAndSets
 std::string debugExplainStep(IQueryPlanStep & step);
 std::string debugExplainPlan(const QueryPlan & plan);
 
+
+struct ExchangeDescription
+{
+    enum class Kind
+    {
+        Persisted = 1,  /// Exchange data between tasks using temporary files
+        Streaming = 2,  /// Exchange data between tasks using network
+    };
+
+    String name;  /// Identifies the exchange; used as the key in ExchangeDescriptions
+    Kind kind = Kind::Persisted;
+    size_t source_bucket_count = 0;  /// NOTE(review): presumably the number of buckets on the sending side - confirm
+    size_t destination_bucket_count = 0;  /// NOTE(review): presumably the number of buckets on the receiving side - confirm
+};
+
+using ExchangeDescriptions = std::unordered_map<String, ExchangeDescription>;
+
+
+/// Stores named parameters for query plan.
+/// This lets tasks share one plan and differ only in parameter values, e.g. the bucket id of a shuffle.
+struct QueryPlanParameters
+{
+    std::unordered_map<String, Field> parameters;  /// Parameter name -> value
+};
+
+/// Represents a single local task in a distributed query plan
+struct DistributedQueryTask
+{
+    String task_id;
+    QueryPlanParameters parameters;  /// Parameter values of this particular task
+    std::vector<ExchangeStreamId> input_exchange_streams;  /// NOTE(review): presumably the exchange streams the task reads - confirm
+    std::vector<ExchangeStreamId> output_exchange_streams;  /// Exchange streams the task writes, e.g. the final result stream of the main task
+};
+
+/// A group of tasks with the same plan fragment and different parameters.
+/// Tasks can be executed in parallel on different partitions of data
+struct DistributedQueryStage
+{
+    QueryPlan query_plan_fragment;   /// Common for all tasks
+    std::vector<DistributedQueryTask> tasks;   /// Individual set of parameter values for each task
+};
+
+/// Represents a graph of stages.
+/// A stage typically contains a fragment of the query plan that can be executed by multiple workers in parallel on different partitions of data
+struct DistributedQueryPlan
+{
+    std::unordered_map<String, DistributedQueryStage> stages;  /// Stage name -> stage
+    /// Maps stage name to stages it depends on and the corresponding exchange_id
+    std::unordered_map<String, std::unordered_map<String, String>> stage_depends_on;
+    /// Maps exchange_id to exchange description
+    ExchangeDescriptions exchange_descriptions;
+    String final_result_stream_name;  /// String form of the stream id that carries the final result; empty if it was not set
+};
+
 }
diff --git a/src/Processors/QueryPlan/QueryPlanStepRegistry.cpp b/src/Processors/QueryPlan/QueryPlanStepRegistry.cpp
index b140edef0ca4..0dc63e2e12d1 100644
--- a/src/Processors/QueryPlan/QueryPlanStepRegistry.cpp
+++ b/src/Processors/QueryPlan/QueryPlanStepRegistry.cpp
@@ -56,11 +56,22 @@ void registerTotalsHavingStep(QueryPlanStepRegistry & registry);
 void registerExtremesStep(QueryPlanStepRegistry & registry);
 void registerJoinStep(QueryPlanStepRegistry & registry);
 void registerObjectFilterStep(QueryPlanStepRegistry & registry);
+void registerShuffleSendStep(QueryPlanStepRegistry & registry);
+void registerShuffleReceiveStep(QueryPlanStepRegistry & registry);
+void registerGatherSendStep(QueryPlanStepRegistry & registry);
+void registerGatherReceiveStep(QueryPlanStepRegistry & registry);
+void registerBroadcastSendStep(QueryPlanStepRegistry & registry);
+void registerBroadcastReceiveStep(QueryPlanStepRegistry & registry);
+void registerReadFromMergeTreeStep(QueryPlanStepRegistry & registry);
 
 void registerReadFromTableStep(QueryPlanStepRegistry & registry);
 void registerReadFromTableFunctionStep(QueryPlanStepRegistry & registry);
 void registerBuildRuntimeFilterStep(QueryPlanStepRegistry & registry);
 
+
+void registerReadFromStorageStep(QueryPlanStepRegistry & registry);
+
+
 void QueryPlanStepRegistry::registerPlanSteps()
 {
     QueryPlanStepRegistry & registry = QueryPlanStepRegistry::instance();
@@ -84,10 +95,21 @@ void QueryPlanStepRegistry::registerPlanSteps()
     registerExtremesStep(registry);
     registerJoinStep(registry);
 
+    registerShuffleSendStep(registry);
+    registerShuffleReceiveStep(registry);
+    registerGatherSendStep(registry);
+    registerGatherReceiveStep(registry);
+    registerBroadcastSendStep(registry);
+    registerBroadcastReceiveStep(registry);
+    registerReadFromMergeTreeStep(registry);
+
     registerReadFromTableStep(registry);
     registerReadFromTableFunctionStep(registry);
     registerBuildRuntimeFilterStep(registry);
     registerObjectFilterStep(registry);
+
+
+    registerReadFromStorageStep(registry);
 }
 
 }
diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
index ce73387cf692..ecbc502532ed 100644
--- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@@ -6,6 +6,7 @@
 #include <IO/Operators.h>
 #include <Interpreters/Cluster.h>
 #include <Interpreters/Context.h>
+#include <Interpreters/DatabaseCatalog.h>
 #include <Interpreters/ExpressionAnalyzer.h>
 #include <Interpreters/ExpressionActions.h>
 #include <Interpreters/InterpreterSelectQuery.h>
@@ -26,10 +27,12 @@
 #include <Processors/Merges/ReplacingSortedTransform.h>
 #include <Processors/Merges/SummingSortedTransform.h>
 #include <Processors/Merges/VersionedCollapsingTransform.h>
+#include <Processors/QueryPlan/IParameterLookup.h>
 #include <Processors/QueryPlan/IQueryPlanStep.h>
 #include <Processors/QueryPlan/PartsSplitter.h>
 #include <Processors/QueryPlan/LazilyReadFromMergeTree.h>
 #include <Processors/QueryPlan/QueryIdHolder.h>
+#include <Processors/QueryPlan/QueryPlanStepRegistry.h>
 #include <Processors/Sources/NullSource.h>
 #include <Processors/Transforms/ExpressionTransform.h>
 #include <Processors/Transforms/FilterTransform.h>
@@ -249,6 +252,8 @@ namespace ErrorCodes
     extern const int INDEX_NOT_USED;
     extern const int LOGICAL_ERROR;
     extern const int TOO_MANY_PARTITIONS;
+    extern const int NO_SUCH_DATA_PART;
+    extern const int SUPPORT_IS_DISABLED;
 }
 
 static bool checkAllPartsOnRemoteFS(const RangesInDataParts & parts)
@@ -3275,10 +3280,135 @@ bool ReadFromMergeTree::supportsSkipIndexesOnDataRead() const
     return true;
 }
 
-void ReadFromMergeTree::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &)
+
+static const char * indexTypeToString(ReadFromMergeTree::IndexType type);
+
+void ReadFromMergeTree::logPredicateStatistics(const AnalysisResult & result) const
+{
+    /// Sampling: rate 0 turns the logging off, rate N > 1 keeps the queries whose id hash is divisible by N.
+    const UInt64 sample_rate = context->getSettingsRef()[Setting::predicate_statistics_sample_rate];
+    if (sample_rate == 0)
+        return;
+    if (sample_rate > 1)
+    {
+        const auto sampled_query_id = CurrentThread::getQueryId();
+        const auto query_id_hash = CityHash_v1_0_2::CityHash64(sampled_query_id.data(), sampled_query_id.size());
+        if (query_id_hash % sample_rate != 0)
+            return;
+    }
+
+    auto predicate_stats_log = context->getPredicateStatisticsLog();
+    if (!predicate_stats_log || result.index_stats.empty())
+        return;
+
+    const auto storage_id = data.getStorageID();
+    if (storage_id.database_name.empty())
+        return;
+
+    PredicateStatisticsLogElement elem;
+    const auto now = time(nullptr);
+    elem.event_date = static_cast<UInt16>(DateLUT::instance().toDayNum(now));
+    elem.event_time = now;
+    elem.database = storage_id.database_name;
+    elem.table = storage_id.table_name;
+    elem.query_id = String(CurrentThread::getQueryId());
+
+    /// Granules left after the previously processed entry; when non-zero it is the total for the next one.
+    UInt64 granules_before = 0;
+    for (const auto & stat : result.index_stats)
+    {
+        const UInt64 granules_after = stat.num_granules_after;
+        if (stat.type == IndexType::None)
+        {
+            granules_before = granules_after;
+            continue;
+        }
+        if (!stat.part_name.empty()) /// Entries that carry a part name are not logged.
+            continue;
+
+        const UInt64 granules_total = granules_before > 0 ? granules_before : granules_after;
+        const Float64 selectivity = granules_total > 0
+            ? static_cast<Float64>(granules_after) / static_cast<Float64>(granules_total)
+            : 1.0;
+
+        elem.index_names.push_back(stat.name.empty() ? indexTypeToString(stat.type) : stat.name);
+        elem.index_types.push_back(indexTypeToString(stat.type));
+        elem.total_granules.push_back(granules_total);
+        elem.granules_after.push_back(granules_after);
+        elem.index_selectivities.push_back(selectivity);
+        granules_before = granules_after;
+    }
+
+    if (!elem.index_names.empty())
+        predicate_stats_log->add(std::move(elem));
+}
+
+MarkRanges filterMarkRangesForBucket(const MarkRanges & ranges, size_t & effective_bucket_index, size_t total_buckets)
+{
+    if (total_buckets == 0) /// Otherwise the division and the modulo below would be by zero.
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot split mark ranges between zero buckets");
+    MarkRanges result; /// Of every input range: the slice owned by the current bucket (empty slices are dropped).
+    for (const auto & range : ranges)
+    {
+        const size_t length_per_bucket = std::max<size_t>(1, (range.end - range.begin + total_buckets - 1) / total_buckets); /// ceil, at least 1
+        const size_t updated_begin = range.begin + effective_bucket_index * length_per_bucket;
+        if (updated_begin < range.end)
+            result.emplace_back(updated_begin, std::min(range.end, updated_begin + length_per_bucket));
+        effective_bucket_index = (effective_bucket_index + 1) % total_buckets; /// The next range is sliced for the next bucket.
+    }
+    return result;
+}
+
+void ReadFromMergeTree::initializePipeline(QueryPipelineBuilder & pipeline, [[maybe_unused]] const BuildQueryPipelineSettings & settings)
 {
     auto & result = getAnalysisResult();
 
+    logPredicateStatistics(result);
+
+    /// Filter ranges by 'bucket_id' parameter so that each distributed worker reads only its slice of the parts.
+    if (distributed_read_bucket_count > 0 && settings.parameter_lookup)
+    {
+        /// Bucket over the coordinator-selected parts in a fixed order, so every worker partitions the
+        /// same ordered list (replicas can have different local part layouts). A missing part is a
+        /// retryable error rather than a silently divergent read.
+        if (!distributed_read_part_names.empty())
+        {
+            std::unordered_map<String, RangesInDataPart> parts_by_name;
+            for (auto & part : result.parts_with_ranges)
+                parts_by_name.emplace(part.data_part->name, std::move(part));
+
+            RangesInDataParts coordinator_parts;
+            coordinator_parts.reserve(distributed_read_part_names.size());
+            for (const auto & part_name : distributed_read_part_names)
+            {
+                auto it = parts_by_name.find(part_name);
+                if (it == parts_by_name.end())
+                    throw Exception(ErrorCodes::NO_SUCH_DATA_PART,
+                        "Distributed read: part {} selected by the coordinator is not available on this replica "
+                        "(diverged by merge or replication lag); retry the query", part_name);
+                coordinator_parts.push_back(std::move(it->second));
+            }
+            result.parts_with_ranges = std::move(coordinator_parts);
+        }
+
+        const size_t bucket_id = parse<UInt64>(settings.parameter_lookup->getParameter("bucket_id").safeGet<String>());
+        const size_t total_buckets = settings.parameter_lookup->getParameter("total_buckets").safeGet<UInt64>();
+
+        size_t effective_bucket_index = bucket_id;
+        RangesInDataParts filtered_parts;
+        for (const auto & part : result.parts_with_ranges)
+        {
+            auto filtered_part = part;
+            filtered_part.ranges = filterMarkRangesForBucket(part.ranges, effective_bucket_index, total_buckets);
+            if (!filtered_part.ranges.empty())
+                filtered_parts.push_back(std::move(filtered_part));
+        }
+        result.parts_with_ranges = std::move(filtered_parts);
+
+        /// Cannot cache PREWHERE results when ranges are filtered by bucket_id.
+        reader_settings.use_query_condition_cache = false;
+    }
+
     if (enable_remove_parts_from_snapshot_optimization)
     {
         /// Do not keep data parts in snapshot.
@@ -4294,4 +4424,227 @@ bool ReadFromMergeTree::canRemoveColumnsFromOutput() const
 
     return canRemoveUnusedColumns() && output_header->columns() > 0;
 }
+
+void ReadFromMergeTree::setDistributedRead(size_t bucket_count)
+{
+    distributed_read_bucket_count = bucket_count;  /// Non-zero: initializePipeline keeps only one bucket's ranges (if a parameter lookup is set)
+}
+
+void ReadFromMergeTree::setDistributedReadParts(Names part_names)
+{
+    distributed_read_part_names = std::move(part_names);  /// If non-empty, a bucketed read uses exactly these parts, in this order, and throws if one is missing
+}
+
+Strings ReadFromMergeTree::getShardsForDistributedRead() const
+{
+    /// A read that is not bucketed is a single shard "0" ...
+    if (distributed_read_bucket_count == 0)
+        return {"0"};
+
+    /// ... and so is a read for which no analysis result is available.
+    if (!selectRangesToRead())
+        return {"0"};
+
+    /// TODO: take into account selected ranges?
+
+    /// One shard per bucket, each named by the decimal index of the bucket.
+    Strings bucket_shards;
+    for (size_t bucket = 0; bucket != distributed_read_bucket_count; ++bucket)
+        bucket_shards.emplace_back(std::to_string(bucket));
+
+    return bucket_shards;
+}
+
+
+void ReadFromMergeTree::serialize(Serialization & ctx) const
+{
+    /// The STREAM modifier has no serialized form yet: fail instead of silently turning the read into
+    /// a plain snapshot read. (Pinned block boundaries and part-order virtual columns are rejected
+    /// earlier, in checkDistributedReadSupported.)
+    if (query_info.isStream())
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+            "make_distributed_plan does not support a distributed read with the STREAM modifier");
+
+    /// A bucketed read is pinned to the coordinator's part list, so the worker cannot re-derive
+    /// read-in-order, deferred FINAL filters, a projection or text index tasks: reject those. A read
+    /// without buckets is rebuilt and re-optimized on the worker, which derives them again.
+    const bool is_bucketed_read = distributed_read_bucket_count > 0;
+    if (is_bucketed_read && query_info.input_order_info)
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+            "make_distributed_plan does not support a read-in-order distributed read");
+    if (is_bucketed_read && (deferred_row_level_filter || deferred_prewhere_info))
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+            "make_distributed_plan does not support a distributed read with deferred FINAL filters");
+    if (is_bucketed_read && analyzed_result_ptr && analyzed_result_ptr->readFromProjection())
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+            "make_distributed_plan does not support a distributed read from a projection");
+    if (is_bucketed_read && !index_read_tasks.empty())
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+            "make_distributed_plan does not support a distributed read using direct text index tasks");
+
+    StorageID table_id = data.getStorageID();
+    writeStringBinary(table_id.getDatabaseName(), ctx.out);
+    writeStringBinary(table_id.getTableName(), ctx.out);
+
+    const Names & columns = getAllColumnNames();
+    writeVarUInt(columns.size(), ctx.out);
+    for (const auto & column : columns)
+        writeStringBinary(column, ctx.out);
+
+    /// TODO: not sure that these fields should be serialized, maybe they should be recalculated at target
+    writeVarUInt(getMaxBlockSize(), ctx.out);
+    writeVarUInt(getNumStreams(), ctx.out);
+
+    /// One flag bit per optional piece of the read; FINAL is a flag only and has no payload.
+    const auto & modifiers = query_info.table_expression_modifiers;
+    const bool has_final = modifiers && modifiers->hasFinal();
+    const bool has_sample_size_ratio = modifiers && modifiers->hasSampleSizeRatio();
+    const bool has_sample_offset_ratio = modifiers && modifiers->hasSampleOffsetRatio();
+    const bool has_row_level_filter = query_info.row_level_filter != nullptr;
+    const bool has_prewhere_info = query_info.prewhere_info != nullptr;
+
+    const UInt8 flags = static_cast<UInt8>(
+        (has_final ? 1 : 0)
+        | (has_sample_size_ratio ? 2 : 0)
+        | (has_sample_offset_ratio ? 4 : 0)
+        | (has_row_level_filter ? 8 : 0)
+        | (has_prewhere_info ? 16 : 0));
+    writeIntBinary(flags, ctx.out);
+
+    /// The payloads of the flagged pieces follow, in the order of their flag bits.
+    if (has_sample_size_ratio)
+        serializeRational(*modifiers->getSampleSizeRatio(), ctx.out);
+    if (has_sample_offset_ratio)
+        serializeRational(*modifiers->getSampleOffsetRatio(), ctx.out);
+    if (has_row_level_filter)
+        query_info.row_level_filter->serialize(ctx);
+    if (has_prewhere_info)
+        query_info.prewhere_info->serialize(ctx);
+
+    writeVarUInt(distributed_read_bucket_count, ctx.out);
+
+    if (!is_bucketed_read)
+        return;
+
+    /// Pin the parts selected by the coordinator: their names are written in sorted order, so that
+    /// all workers bucket over the same ordered list.
+    Names pinned_parts;
+    if (const auto analysis = selectRangesToRead())
+    {
+        pinned_parts.reserve(analysis->parts_with_ranges.size());
+        for (const auto & part : analysis->parts_with_ranges)
+            pinned_parts.push_back(part.data_part->name);
+    }
+    std::sort(pinned_parts.begin(), pinned_parts.end());
+
+    writeVarUInt(pinned_parts.size(), ctx.out);
+    for (const auto & part_name : pinned_parts)
+        writeStringBinary(part_name, ctx.out);
+}
+
+/// Rebuilds the step from the form written by serialize(): reads the fields in the same order, looks
+/// the table up in the local catalog and creates a new read over the table's current snapshot.
+/// Throws if the table does not exist on this server or is not a MergeTree table.
+std::unique_ptr<IQueryPlanStep> ReadFromMergeTree::deserialize(Deserialization & ctx)
+{
+    String database_name;
+    String table_name;
+    readStringBinary(database_name, ctx.in);
+    readStringBinary(table_name, ctx.in);
+
+    size_t num_columns = 0;
+    readVarUInt(num_columns, ctx.in);
+    Names column_names;
+    column_names.reserve(num_columns);
+    for (size_t i = 0; i < num_columns; ++i)
+    {
+        String column_name;
+        readStringBinary(column_name, ctx.in);
+        column_names.push_back(std::move(column_name));
+    }
+
+    UInt64 max_block_size = 0;
+    readVarUInt(max_block_size, ctx.in);
+    size_t num_streams = 0;
+    readVarUInt(num_streams, ctx.in);
+
+    UInt8 flags = 0;
+    readIntBinary(flags, ctx.in);
+
+    const bool has_final = flags & 1;
+    const bool has_sample_size_ratio = flags & 2;
+    const bool has_sample_offset_ratio = flags & 4;
+    const bool has_row_level_filter = flags & 8;
+    const bool has_prewhere_info = flags & 16;
+
+    std::optional<TableExpressionModifiers::Rational> sample_size_ratio;
+    std::optional<TableExpressionModifiers::Rational> sample_offset_ratio;
+    if (has_sample_size_ratio)
+        sample_size_ratio = deserializeRational(ctx.in);
+    if (has_sample_offset_ratio)
+        sample_offset_ratio = deserializeRational(ctx.in);
+
+    SelectQueryInfo query_info;
+    query_info.table_expression_modifiers.emplace(has_final, sample_size_ratio, sample_offset_ratio);
+
+    if (has_row_level_filter)
+        query_info.row_level_filter = std::make_shared<FilterDAGInfo>(FilterDAGInfo::deserialize(ctx));
+    if (has_prewhere_info)
+        query_info.prewhere_info = std::make_shared<PrewhereInfo>(PrewhereInfo::deserialize(ctx));
+
+    size_t distributed_read_bucket_count = 0;
+    readVarUInt(distributed_read_bucket_count, ctx.in);
+
+    Names distributed_read_part_names;
+    if (distributed_read_bucket_count)
+    {
+        size_t num_parts = 0;
+        readVarUInt(num_parts, ctx.in);
+        distributed_read_part_names.reserve(num_parts);
+        for (size_t i = 0; i < num_parts; ++i)
+        {
+            String part_name;
+            readStringBinary(part_name, ctx.in);
+            distributed_read_part_names.push_back(std::move(part_name));
+        }
+    }
+
+    /// A table missing on this server is not a logical error: getTable reports it with its own exception.
+    StorageID table_id(database_name, table_name);
+    auto storage_ptr = DatabaseCatalog::instance().getTable(table_id, ctx.context);
+
+    /// The local table may use another engine; fail with a clear message instead of std::bad_cast.
+    auto * table = dynamic_cast<MergeTreeData *>(storage_ptr.get());
+    if (!table)
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+            "make_distributed_plan cannot read table {}.{} on this server: its engine {} is not a MergeTree", database_name, table_name, storage_ptr->getName());
+
+    MergeTreeDataSelectExecutor executor(*table);
+    StorageSnapshotPtr storage_snapshot = table->getStorageSnapshot(table->getInMemoryMetadataPtr(ctx.context, false), ctx.context);
+    const auto & snapshot_data = assert_cast<const MergeTreeData::SnapshotData &>(*storage_snapshot->data);
+
+    auto step = executor.readFromParts(
+        snapshot_data.parts, snapshot_data.mutations_snapshot, column_names, storage_snapshot,
+        query_info, ctx.context, max_block_size, num_streams);
+
+    if (distributed_read_bucket_count)
+    {
+        auto * read_from_merge_tree_step = dynamic_cast<ReadFromMergeTree *>(step.get());
+        if (!read_from_merge_tree_step)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "ReadFromMergeTree step is expected to be created by readFromParts");
+        read_from_merge_tree_step->setDistributedRead(distributed_read_bucket_count);
+        read_from_merge_tree_step->setDistributedReadParts(std::move(distributed_read_part_names));
+    }
+
+    /// Need to keep shared pointer to MergeTree table till the end of plan execution
+    ctx.storage_holders.push_back(storage_ptr);
+    return step;
+}
+
+void registerReadFromMergeTreeStep(QueryPlanStepRegistry & registry);
+void registerReadFromMergeTreeStep(QueryPlanStepRegistry & registry)
+{
+    registry.registerStep("ReadFromMergeTree", ReadFromMergeTree::deserialize);  /// Registers deserialize() under the name "ReadFromMergeTree"
+}
+
 }
diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h
index 30df4e7e72e2..bdcb8b6a3a55 100644
--- a/src/Processors/QueryPlan/ReadFromMergeTree.h
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.h
@@ -259,6 +259,10 @@ class ReadFromMergeTree final : public SourceStepWithFilter
 
     const Names & getAllColumnNames() const { return all_column_names; }
 
+    /// True if a coordinator-side snapshot boundary is pinned (e.g. select_sequential_consistency).
+    /// Such a read cannot be distributed: a worker reads from its own snapshot and cannot reproduce it.
+    bool hasPinnedBlockNumbers() const { return max_block_numbers_to_read != nullptr; }
+
     StorageID getStorageID() const { return data.getStorageID(); }
     UInt64 getSelectedParts() const { return selected_parts; }
     UInt64 getSelectedRows() const { return selected_rows; }
@@ -381,6 +385,8 @@ class ReadFromMergeTree final : public SourceStepWithFilter
     bool isSkipIndexAvailableForTopK(const String & sort_column) const;
     const ProjectionIndexReadDescription & getProjectionIndexReadDescription() const { return projection_index_read_desc; }
     ProjectionIndexReadDescription & getProjectionIndexReadDescription() { return projection_index_read_desc; }
+    /// Parts (by name) every worker buckets over, so the partition is identical across replicas.
+    void setDistributedReadParts(Names part_names);
 
     bool canRemoveUnusedColumns() const override;
     RemovedUnusedColumns removeUnusedColumns(NameMultiSet required_outputs, bool remove_inputs) override;
@@ -393,6 +399,10 @@ class ReadFromMergeTree final : public SourceStepWithFilter
 
     void deferFiltersAfterFinalIfNeeded();
 
+    void serialize(Serialization & ctx) const override; /// Counterpart of the static `deserialize` below.
+    bool isSerializable() const override { return true; } /// Unconditionally reports the step as serializable.
+    static std::unique_ptr<IQueryPlanStep> deserialize(Deserialization & ctx); /// Registered under the step name "ReadFromMergeTree" by registerReadFromMergeTreeStep.
+
 private:
     MergeTreeSettingsPtr data_settings;
     MergeTreeReaderSettings reader_settings;
@@ -533,6 +543,8 @@ class ReadFromMergeTree final : public SourceStepWithFilter
 
     std::optional<TopKFilterInfo> top_k_filter_info;
     ProjectionIndexReadDescription projection_index_read_desc;
+    /// Coordinator-selected parts a distributed-read worker buckets over. Empty otherwise.
+    Names distributed_read_part_names;
 };
 
 }
diff --git a/src/Processors/QueryPlan/ReadFromObjectStorageStep.h b/src/Processors/QueryPlan/ReadFromObjectStorageStep.h
index b3a2bdb0cce3..3c4f565aa8ee 100644
--- a/src/Processors/QueryPlan/ReadFromObjectStorageStep.h
+++ b/src/Processors/QueryPlan/ReadFromObjectStorageStep.h
@@ -39,6 +39,7 @@ class ReadFromObjectStorageStep : public SourceStepWithFilter
 
     void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override;
     QueryPlanStepPtr clone() const override;
+    bool isSerializable() const override { return true; } /// NOTE(review): no serialize() override is visible in this hunk - confirm the step implements it.
 
     bool requestReadingInOrder() const;
 
diff --git a/src/Processors/QueryPlan/ReadFromPreparedSource.cpp b/src/Processors/QueryPlan/ReadFromPreparedSource.cpp
index d3f9ea11a7bf..0fb1514e799a 100644
--- a/src/Processors/QueryPlan/ReadFromPreparedSource.cpp
+++ b/src/Processors/QueryPlan/ReadFromPreparedSource.cpp
@@ -5,6 +5,15 @@
 #include <Core/Settings.h>
 #include <Interpreters/Context.h>
 
+
+#include <Processors/QueryPlan/Serialization.h>
+#include <Processors/QueryPlan/QueryPlanStepRegistry.h>
+#include <Processors/Sources/SourceFromSingleChunk.h>
+#include <Columns/ColumnConst.h>
+#include <DataTypes/DataTypesNumber.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ReadHelpers.h>
+
 namespace DB
 {
 
@@ -13,6 +22,12 @@ namespace Setting
     extern const SettingsUInt64 query_plan_max_step_description_length;
 }
 
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+    extern const int NOT_IMPLEMENTED;
+}
+
 ReadFromPreparedSource::ReadFromPreparedSource(Pipe pipe_)
     : ISourceStep(pipe_.getSharedHeader())
     , pipe(std::move(pipe_))
@@ -44,4 +59,42 @@ ReadFromStorageStep::ReadFromStorageStep(
         processor->setStorageLimits(query_info.storage_limits);
 }
 
+void ReadFromStorageStep::serialize(Serialization & ctx) const
+{
+    /// Writes only the storage name; `deserialize` rebuilds the source from that name alone. Only StorageSystemOne is supported, and a null `storage` is rejected the same way (`isSerializable` treats it as not serializable too).
+    /// Not a logical error: a caller (e.g. the distributed-plan serializability check) may probe an unsupported plan, and a logical error would abort debug/fuzzer builds instead of being handled.
+    if (!storage || storage->getName() != "SystemOne")
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ReadFromStorageStep serialization is implemented only for StorageSystemOne, got: {}", storage ? storage->getName() : String("<null>"));
+
+    writeStringBinary(storage->getName(), ctx.out);
+}
+
+bool ReadFromStorageStep::isSerializable() const
+{
+    return storage && storage->getName() == "SystemOne"; /// Matches the only storage name `serialize` accepts; a null storage is reported as not serializable.
+}
+
+std::unique_ptr<IQueryPlanStep> ReadFromStorageStep::deserialize(Deserialization & ctx)
+{
+    String name_from_plan;
+    readStringBinary(name_from_plan, ctx.in);
+    if (name_from_plan != "SystemOne")
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "ReadFromStorageStep deserialization is implemented only for StorageSystemOne, got: {}", name_from_plan);
+
+    /// No storage is looked up: system.one is emulated by a one-row chunk holding a single UInt8 zero.
+    auto zero_column = DataTypeUInt8().createColumnConst(1, 0u)->convertToFullColumnIfConst();
+    Chunk single_row({ std::move(zero_column) }, 1);
+
+    auto one_row_source = std::make_shared<SourceFromSingleChunk>(ctx.output_header, std::move(single_row));
+    one_row_source->addTotalRowsApprox(1);
+
+    return std::make_unique<ReadFromPreparedSource>(Pipe(one_row_source));
+}
+
+void registerReadFromStorageStep(QueryPlanStepRegistry & registry); /// Declared right before the definition; no header declaration is visible in this file.
+void registerReadFromStorageStep(QueryPlanStepRegistry & registry)
+{
+    registry.registerStep("ReadFromStorage", ReadFromStorageStep::deserialize); /// The deserializer yields a ReadFromPreparedSource, not a ReadFromStorageStep.
+}
+
 }
diff --git a/src/Processors/QueryPlan/ReadFromPreparedSource.h b/src/Processors/QueryPlan/ReadFromPreparedSource.h
index ea559f32b88c..415601aaf22b 100644
--- a/src/Processors/QueryPlan/ReadFromPreparedSource.h
+++ b/src/Processors/QueryPlan/ReadFromPreparedSource.h
@@ -33,6 +33,12 @@ class ReadFromStorageStep final : public ReadFromPreparedSource
 
     const StoragePtr & getStorage() const { return storage; }
 
+    void serialize(Serialization & ctx) const override;
+    /// serialize is implemented only for StorageSystemOne.
+    bool isSerializable() const override;
+
+    static std::unique_ptr<IQueryPlanStep> deserialize(Deserialization & ctx);
+
 private:
     StoragePtr storage;
 
diff --git a/src/Processors/QueryPlan/ReadFromTableFunctionStep.cpp b/src/Processors/QueryPlan/ReadFromTableFunctionStep.cpp
index 1b20a64d05c8..cd4fbd6d4d3c 100644
--- a/src/Processors/QueryPlan/ReadFromTableFunctionStep.cpp
+++ b/src/Processors/QueryPlan/ReadFromTableFunctionStep.cpp
@@ -27,20 +27,6 @@ void ReadFromTableFunctionStep::initializePipeline(QueryPipelineBuilder &, const
     throw Exception(ErrorCodes::NOT_IMPLEMENTED, "initializePipeline is not implementad for ReadFromTableFunctionStep");
 }
 
-static void serializeRational(TableExpressionModifiers::Rational val, WriteBuffer & out)
-{
-    writeIntBinary(val.numerator, out);
-    writeIntBinary(val.denominator, out);
-}
-
-static TableExpressionModifiers::Rational deserializeRational(ReadBuffer & in)
-{
-    TableExpressionModifiers::Rational val;
-    readIntBinary(val.numerator, in);
-    readIntBinary(val.denominator, in);
-    return val;
-}
-
 enum class TableFunctionSerializationKind : UInt8
 {
     AST = 0,
diff --git a/src/Processors/QueryPlan/ReadFromTableFunctionStep.h b/src/Processors/QueryPlan/ReadFromTableFunctionStep.h
index fff66e78dcdf..1c471a228e65 100644
--- a/src/Processors/QueryPlan/ReadFromTableFunctionStep.h
+++ b/src/Processors/QueryPlan/ReadFromTableFunctionStep.h
@@ -15,6 +15,7 @@ class ReadFromTableFunctionStep : public ISourceStep
     void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override;
 
     void serialize(Serialization & ctx) const override;
+    bool isSerializable() const override { return true; } /// Always serializable; see serialize() / deserialize() declared around this line.
     static QueryPlanStepPtr deserialize(Deserialization & ctx);
 
     const std::string & getSerializedAST() const { return serialized_ast; }
diff --git a/src/Processors/QueryPlan/ReadFromTableStep.cpp b/src/Processors/QueryPlan/ReadFromTableStep.cpp
index 0a5cce2abb11..9555ba51a20b 100644
--- a/src/Processors/QueryPlan/ReadFromTableStep.cpp
+++ b/src/Processors/QueryPlan/ReadFromTableStep.cpp
@@ -29,20 +29,6 @@ void ReadFromTableStep::initializePipeline(QueryPipelineBuilder &, const BuildQu
     throw Exception(ErrorCodes::NOT_IMPLEMENTED, "initializePipeline is not implementad for ReadFromTableStep");
 }
 
-static void serializeRational(TableExpressionModifiers::Rational val, WriteBuffer & out)
-{
-    writeIntBinary(val.numerator, out);
-    writeIntBinary(val.denominator, out);
-}
-
-static TableExpressionModifiers::Rational deserializeRational(ReadBuffer & in)
-{
-    TableExpressionModifiers::Rational val;
-    readIntBinary(val.numerator, in);
-    readIntBinary(val.denominator, in);
-    return val;
-}
-
 void ReadFromTableStep::serialize(Serialization & ctx) const
 {
     writeStringBinary(table_name, ctx.out);
diff --git a/src/Processors/QueryPlan/ReadFromTableStep.h b/src/Processors/QueryPlan/ReadFromTableStep.h
index 323521ecf245..3d6b68116d54 100644
--- a/src/Processors/QueryPlan/ReadFromTableStep.h
+++ b/src/Processors/QueryPlan/ReadFromTableStep.h
@@ -16,6 +16,7 @@ class ReadFromTableStep : public ISourceStep
     void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override;
 
     void serialize(Serialization & ctx) const override;
+    bool isSerializable() const override { return true; } /// Always serializable; see serialize() / deserialize() declared around this line.
     static QueryPlanStepPtr deserialize(Deserialization & ctx);
 
     const String & getTable() const { return table_name; }
diff --git a/src/Processors/QueryPlan/ScatterExchangeStep.cpp b/src/Processors/QueryPlan/ScatterExchangeStep.cpp
new file mode 100644
index 000000000000..0563a5b2f383
--- /dev/null
+++ b/src/Processors/QueryPlan/ScatterExchangeStep.cpp
@@ -0,0 +1,28 @@
+#include <Processors/QueryPlan/ScatterExchangeStep.h>
+#include <Processors/QueryPlan/ShuffleSendStep.h>
+#include <Processors/QueryPlan/ShuffleReceiveStep.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+/// Scatter is the special case of Shuffle with exactly one source bucket,
+/// so the ShuffleSend and ShuffleReceive steps are reused as the sink and the source respectively.
+std::pair<QueryPlanStepPtr, QueryPlanStepPtr> ScatterExchangeStep::createSinkAndSourcePair(const String & exchange_id, const Strings & source_shards) const
+{
+    if (const size_t shard_count = source_shards.size(); shard_count != 1)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "ScatterExchangeStep should have one source shard, got {}", shard_count);
+
+    /// The sender splits rows by key into the result buckets; the receiver reads from the single source shard.
+    const size_t bucket_count = getResultBucketCount();
+    QueryPlanStepPtr sender = std::make_unique<ShuffleSendStep>(input_headers.front(), exchange_id, key_names, bucket_count, hash_cast_types);
+    QueryPlanStepPtr receiver = std::make_unique<ShuffleReceiveStep>(output_header, exchange_id, source_shards);
+
+    return {std::move(sender), std::move(receiver)};
+}
+
+}
diff --git a/src/Processors/QueryPlan/ScatterExchangeStep.h b/src/Processors/QueryPlan/ScatterExchangeStep.h
new file mode 100644
index 000000000000..9c2a0a5be146
--- /dev/null
+++ b/src/Processors/QueryPlan/ScatterExchangeStep.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include <DataTypes/IDataType.h>
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+
+namespace DB
+{
+
+/// Partitions the data from 1 logical stream into N logical streams (N is `result_bucket_count`).
+class ScatterExchangeStep final : public LogicalExchangeStep
+{
+public:
+    /// `hash_cast_types` (one entry per key, optional) selects a type to cast each key to
+    /// before hashing, used to align buckets across both sides of a shuffle join.
+    ScatterExchangeStep(SharedHeader input_header_, Names key_names_, size_t result_bucket_count_, DataTypes hash_cast_types_ = {})
+        : LogicalExchangeStep(input_header_)
+        , key_names(std::move(key_names_))
+        , hash_cast_types(std::move(hash_cast_types_))
+        , result_bucket_count(result_bucket_count_)
+    {
+        chassert(hash_cast_types.empty() || hash_cast_types.size() == key_names.size()); /// Either no cast types at all, or exactly one per key.
+    }
+
+    String getName() const override { return "ScatterExchange"; }
+
+    void transformPipeline(QueryPipelineBuilder & /*pipeline*/, const BuildQueryPipelineSettings &) override
+    {
+        /// Intentionally a no-op: the step doesn't change the pipeline if executed directly.
+    }
+
+    const Names & getKeys() const
+    {
+        return key_names;
+    }
+
+    const DataTypes & getHashCastTypes() const
+    {
+        return hash_cast_types;
+    }
+
+    size_t getSourceBucketCount() const override
+    {
+        return 1; /// Scatter always has a single source bucket.
+    }
+
+    size_t getResultBucketCount() const override
+    {
+        return result_bucket_count;
+    }
+
+    std::pair<QueryPlanStepPtr, QueryPlanStepPtr> createSinkAndSourcePair(const String & exchange_id, const Strings & source_shards) const override;
+
+private:
+    void updateOutputHeader() override
+    {
+        output_header = input_headers.front(); /// The output header is the input header, unchanged.
+    }
+
+    const Names key_names;
+    const DataTypes hash_cast_types;
+    const size_t result_bucket_count;
+};
+
+}
diff --git a/src/Processors/QueryPlan/ShuffleExchangeStep.cpp b/src/Processors/QueryPlan/ShuffleExchangeStep.cpp
new file mode 100644
index 000000000000..3cab150e10b9
--- /dev/null
+++ b/src/Processors/QueryPlan/ShuffleExchangeStep.cpp
@@ -0,0 +1,18 @@
+#include <Processors/QueryPlan/ShuffleExchangeStep.h>
+#include <Processors/QueryPlan/ShuffleSendStep.h>
+#include <Processors/QueryPlan/ShuffleReceiveStep.h>
+
+namespace DB
+{
+
+std::pair<QueryPlanStepPtr, QueryPlanStepPtr> ShuffleExchangeStep::createSinkAndSourcePair(const String & exchange_id, const Strings & source_shards) const
+{
+    /// The sender splits rows by key into the result buckets; the receiver reads one bucket from every source shard.
+    const size_t bucket_count = getResultBucketCount();
+    QueryPlanStepPtr sender = std::make_unique<ShuffleSendStep>(input_headers.front(), exchange_id, key_names, bucket_count, hash_cast_types);
+    QueryPlanStepPtr receiver = std::make_unique<ShuffleReceiveStep>(output_header, exchange_id, source_shards);
+
+    return {std::move(sender), std::move(receiver)};
+}
+
+}
diff --git a/src/Processors/QueryPlan/ShuffleExchangeStep.h b/src/Processors/QueryPlan/ShuffleExchangeStep.h
new file mode 100644
index 000000000000..e10ad5ae15c3
--- /dev/null
+++ b/src/Processors/QueryPlan/ShuffleExchangeStep.h
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <DataTypes/IDataType.h>
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+#include <Core/Names.h>
+
+namespace DB
+{
+
+/// Repartitions the data from M logical streams (`source_bucket_count`) into N logical streams (`result_bucket_count`).
+class ShuffleExchangeStep final : public LogicalExchangeStep
+{
+public:
+    /// `hash_cast_types` (one entry per key, optional) selects a type to cast each key to
+    /// before hashing, used to align buckets across both sides of a shuffle join.
+    ShuffleExchangeStep(SharedHeader input_header_, Names key_names_, size_t source_bucket_count_, size_t result_bucket_count_, DataTypes hash_cast_types_ = {})
+        : LogicalExchangeStep(input_header_)
+        , key_names(std::move(key_names_))
+        , hash_cast_types(std::move(hash_cast_types_))
+        , source_bucket_count(source_bucket_count_)
+        , result_bucket_count(result_bucket_count_)
+    {
+        chassert(hash_cast_types.empty() || hash_cast_types.size() == key_names.size()); /// Either no cast types at all, or exactly one per key.
+    }
+
+    String getName() const override { return "ShuffleExchange"; }
+
+    void transformPipeline(QueryPipelineBuilder & /*pipeline*/, const BuildQueryPipelineSettings &) override
+    {
+        /// Intentionally a no-op: the step doesn't change the pipeline if executed directly.
+    }
+
+    const Names & getKeys() const /// NOTE(review): unlike ScatterExchangeStep there is no getHashCastTypes() accessor here - confirm that is intended.
+    {
+        return key_names;
+    }
+
+    size_t getSourceBucketCount() const override
+    {
+        return source_bucket_count;
+    }
+
+    size_t getResultBucketCount() const override
+    {
+        return result_bucket_count;
+    }
+
+    std::pair<QueryPlanStepPtr, QueryPlanStepPtr> createSinkAndSourcePair(const String & exchange_id, const Strings & source_shards) const override;
+
+private:
+    void updateOutputHeader() override
+    {
+        output_header = input_headers.front(); /// The output header is the input header, unchanged.
+    }
+
+    const Names key_names;
+    const DataTypes hash_cast_types;
+    const size_t source_bucket_count;
+    const size_t result_bucket_count;
+};
+
+}
diff --git a/src/Processors/QueryPlan/ShuffleReceiveStep.cpp b/src/Processors/QueryPlan/ShuffleReceiveStep.cpp
new file mode 100644
index 000000000000..a48ec117c080
--- /dev/null
+++ b/src/Processors/QueryPlan/ShuffleReceiveStep.cpp
@@ -0,0 +1,64 @@
+#include <Processors/QueryPlan/ShuffleReceiveStep.h>
+#include <Processors/QueryPlan/QueryPlanStepRegistry.h>
+#include <Processors/Sources/NativeCompressedSource.h>
+#include <Processors/QueryPlan/Serialization.h>
+#include <Processors/QueryPlan/IParameterLookup.h>
+#include <Processors/QueryPlan/ExchangeLookup.h>
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+#include <QueryPipeline/QueryPipelineBuilder.h>
+#include <QueryPipeline/Pipe.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ReadHelpers.h>
+
+namespace DB
+{
+
+void ShuffleReceiveStep::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings & settings)
+{
+    /// The bucket to read is not stored in the step; it is supplied at build time through the "bucket_id" parameter.
+    const String bucket_id = settings.parameter_lookup->getParameter("bucket_id").safeGet<String>();
+
+    /// One single-source pipeline per source shard; all of them are united at the end.
+    VectorWithMemoryTracking<std::unique_ptr<QueryPipelineBuilder>> per_shard_pipelines;
+    for (const String & shard_id : source_shards)
+    {
+        /// A stream is addressed by (exchange id, source shard id, bucket id).
+        auto & shard_pipeline = per_shard_pipelines.emplace_back(std::make_unique<QueryPipelineBuilder>());
+        shard_pipeline->init(Pipe(settings.exchange_lookup->createSource(output_header, ExchangeStreamId(exchange_id, shard_id, bucket_id))));
+    }
+
+    pipeline = QueryPipelineBuilder::unitePipelines(std::move(per_shard_pipelines), 0, &processors);
+}
+
+void ShuffleReceiveStep::serialize(Serialization & ctx) const
+{
+    writeStringBinary(exchange_id, ctx.out); /// Layout: exchange id, shard count, then every shard id; `deserialize` reads in the same order.
+    writeVarUInt(source_shards.size(), ctx.out);
+    for (const String & shard_id : source_shards)
+        writeStringBinary(shard_id, ctx.out);
+}
+
+std::unique_ptr<IQueryPlanStep> ShuffleReceiveStep::deserialize(Deserialization & ctx)
+{
+    /// Fields are read in the order `serialize` writes them. NOTE(review): the shard count read from the input is passed to reserve() unchecked.
+    String exchange_id;
+    readStringBinary(exchange_id, ctx.in);
+
+    size_t shard_count = 0;
+    readVarUInt(shard_count, ctx.in);
+
+    Strings shard_ids;
+    shard_ids.reserve(shard_count);
+    for (size_t remaining = shard_count; remaining > 0; --remaining)
+        readStringBinary(shard_ids.emplace_back(), ctx.in);
+
+    return std::make_unique<ShuffleReceiveStep>(ctx.output_header, exchange_id, shard_ids);
+}
+
+void registerShuffleReceiveStep(QueryPlanStepRegistry & registry); /// Declared right before the definition; no header declaration is visible in this file.
+void registerShuffleReceiveStep(QueryPlanStepRegistry & registry)
+{
+    registry.registerStep("ShuffleReceive", ShuffleReceiveStep::deserialize); /// Same name as ShuffleReceiveStep::getName() returns.
+}
+
+}
diff --git a/src/Processors/QueryPlan/ShuffleReceiveStep.h b/src/Processors/QueryPlan/ShuffleReceiveStep.h
new file mode 100644
index 000000000000..8a432926fe23
--- /dev/null
+++ b/src/Processors/QueryPlan/ShuffleReceiveStep.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <Processors/QueryPlan/ISourceStep.h>
+
+namespace DB
+{
+
+/// Reads data corresponding to one shuffle bucket (the bucket is chosen at pipeline-build time, not stored in the step).
+/// The data itself might have multiple shards (files) and we read them all.
+class ShuffleReceiveStep : public ISourceStep
+{
+public:
+    ShuffleReceiveStep(SharedHeader header_, const String & exchange_id_, const Strings & source_shards_)
+        : ISourceStep(std::move(header_))
+        , exchange_id(exchange_id_)
+        , source_shards(source_shards_)
+    {
+    }
+
+    String getName() const override { return "ShuffleReceive"; }
+
+    void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings & settings) override;
+
+    void serialize(Serialization & ctx) const override; /// Writes `exchange_id` and `source_shards`.
+    bool isSerializable() const override { return true; }
+
+    static std::unique_ptr<IQueryPlanStep> deserialize(Deserialization & ctx);
+
+private:
+    const String exchange_id; /// Identifies the exchange whose streams are read.
+    const Strings source_shards; /// Ids of the source shards; one source is created per entry.
+};
+
+}
diff --git a/src/Processors/QueryPlan/ShuffleSendStep.cpp b/src/Processors/QueryPlan/ShuffleSendStep.cpp
new file mode 100644
index 000000000000..573fdaa3260e
--- /dev/null
+++ b/src/Processors/QueryPlan/ShuffleSendStep.cpp
@@ -0,0 +1,121 @@
+#include <Processors/QueryPlan/ShuffleSendStep.h>
+#include <Processors/QueryPlan/QueryPlanStepRegistry.h>
+#include <Processors/Sinks/NativeCompressedSink.h>
+#include <Processors/QueryPlan/Serialization.h>
+#include <Processors/QueryPlan/IParameterLookup.h>
+#include <Processors/QueryPlan/ExchangeLookup.h>
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+#include <Processors/Transforms/ScatterByPartitionTransform.h>
+#include <QueryPipeline/QueryPipelineBuilder.h>
+#include <QueryPipeline/Pipe.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ReadHelpers.h>
+#include <Core/ColumnNumbers.h>
+#include <DataTypes/DataTypeFactory.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+QueryPipelineBuilderPtr ShuffleSendStep::updatePipeline(QueryPipelineBuilders pipelines, const BuildQueryPipelineSettings & settings)
+{
+    /// Stage 1: narrow the pipeline to one stream and add a ScatterByPartitionTransform over the key columns,
+    /// which is given `num_buckets` and the optional `hash_cast_types`. Stage 2 below attaches one exchange sink per output.
+    auto & pipeline = *pipelines.front();
+    auto stream_header = pipeline.getSharedHeader();
+    {
+        ColumnNumbers key_columns; /// Positions of the key columns in the stream header.
+        for (const auto & key_name : key_names)
+            key_columns.push_back(stream_header->getPositionByName(key_name));
+
+        pipeline.resize(1);
+        auto scatter = std::make_shared<ScatterByPartitionTransform>(stream_header, num_buckets, key_columns, hash_cast_types);
+        pipeline.addTransform(scatter);
+    }
+
+    const String shard_id = settings.parameter_lookup->getParameter("bucket_id").safeGet<String>(); /// The "bucket_id" parameter serves as the shard id of the produced streams.
+
+    /// Add sink for each bucket. `bucket` counts the sinks created by the callback and doubles as the destination bucket id.
+    size_t bucket = 0;
+    pipeline.setSinks([&](const SharedHeader & header, Pipe::StreamType stream_type)
+    {
+        chassert(stream_type == Pipe::StreamType::Main);
+        String destination_bucket_id = toString(bucket);
+        ++bucket;   /// TODO: this is a hack. Find a better way to assigning bucket id to each sink.
+        return settings.exchange_lookup->createSink(header, ExchangeStreamId(exchange_id, shard_id, destination_bucket_id));
+    });
+
+    if (bucket != num_buckets) /// The number of sinks actually created must equal the requested bucket count.
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "ShuffleSendStep: expected {} buckets, but created only {}", num_buckets, bucket);
+
+    return std::move(pipelines.front());
+}
+
+namespace
+{
+
+void serializeNames(const Names & names, WriteBuffer & out)
+{
+    writeVarUInt(names.size(), out); /// Count first, so `deserializeNames` knows how many strings follow.
+    for (const auto & entry : names)
+        writeStringBinary(entry, out);
+}
+
+void deserializeNames(Names & names, ReadBuffer & in)
+{
+    size_t count = 0;
+    readVarUInt(count, in);
+    names.resize(count); /// NOTE(review): the count comes from the input and is used unchecked.
+    for (auto & entry : names)
+        readStringBinary(entry, in);
+}
+
+}
+
+void ShuffleSendStep::serialize(Serialization & ctx) const
+{
+    writeStringBinary(exchange_id, ctx.out);
+    serializeNames(key_names, ctx.out);
+    writeVarUInt(num_buckets, ctx.out);
+    /// Cast types: count, then one type name per entry; a null entry is written as an empty name.
+    writeVarUInt(hash_cast_types.size(), ctx.out);
+    for (const DataTypePtr & cast_type : hash_cast_types)
+        writeStringBinary(cast_type ? cast_type->getName() : String(), ctx.out);
+}
+
+std::unique_ptr<IQueryPlanStep> ShuffleSendStep::deserialize(Deserialization & ctx)
+{
+    String exchange_id;
+    readStringBinary(exchange_id, ctx.in);
+
+    Names key_names;
+    deserializeNames(key_names, ctx.in);
+
+    size_t num_buckets = 0;
+    readVarUInt(num_buckets, ctx.in);
+
+    size_t cast_type_count = 0;
+    readVarUInt(cast_type_count, ctx.in);
+    DataTypes hash_cast_types(cast_type_count); /// Entries start out null; an empty type name in the input leaves the entry null.
+    for (auto & cast_type : hash_cast_types)
+    {
+        String type_name;
+        readStringBinary(type_name, ctx.in);
+        if (!type_name.empty())
+            cast_type = DataTypeFactory::instance().get(type_name);
+    }
+
+    return std::make_unique<ShuffleSendStep>(ctx.input_headers.front(), exchange_id, std::move(key_names), num_buckets, std::move(hash_cast_types));
+}
+
+void registerShuffleSendStep(QueryPlanStepRegistry & registry); /// Declared right before the definition; no header declaration is visible in this file.
+void registerShuffleSendStep(QueryPlanStepRegistry & registry)
+{
+    registry.registerStep("ShuffleSend", ShuffleSendStep::deserialize); /// Same name as ShuffleSendStep::getName() returns.
+}
+
+}
diff --git a/src/Processors/QueryPlan/ShuffleSendStep.h b/src/Processors/QueryPlan/ShuffleSendStep.h
new file mode 100644
index 000000000000..aca35919cdfe
--- /dev/null
+++ b/src/Processors/QueryPlan/ShuffleSendStep.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include <DataTypes/IDataType.h>
+#include <Processors/QueryPlan/IQueryPlanStep.h>
+
+
+namespace DB
+{
+
+/// Send part of ShuffleExchangeStep (ScatterExchangeStep creates it as its sink as well): splits the input by key into `num_buckets` exchange sinks.
+class ShuffleSendStep final : public IQueryPlanStep
+{
+public:
+    /// `hash_cast_types` (one entry per key, optional) selects a type to cast each key to
+    /// before hashing, used to align buckets across both sides of a shuffle join.
+    ShuffleSendStep(SharedHeader input_header_, const String & exchange_id_, Names key_names_, size_t num_buckets_, DataTypes hash_cast_types_ = {})
+        : exchange_id(exchange_id_)
+        , key_names(std::move(key_names_))
+        , hash_cast_types(std::move(hash_cast_types_))
+        , num_buckets(num_buckets_)
+    {
+        chassert(num_buckets > 0);
+        chassert(hash_cast_types.empty() || hash_cast_types.size() == key_names.size()); /// Either no cast types at all, or exactly one per key.
+        updateInputHeaders({std::move(input_header_)});
+    }
+
+
+    String getName() const override { return "ShuffleSend"; }
+
+    bool hasOutputStream() const { return false; } /// NOTE(review): not marked `override` - confirm whether a base-class virtual was meant to be overridden.
+
+    QueryPipelineBuilderPtr updatePipeline(QueryPipelineBuilders pipelines, const BuildQueryPipelineSettings & settings) override;
+
+    void serialize(Serialization & ctx) const override; /// Writes exchange id, key names, bucket count and cast type names.
+    bool isSerializable() const override { return true; }
+
+    static std::unique_ptr<IQueryPlanStep> deserialize(Deserialization & ctx);
+
+private:
+    void updateOutputHeader() override {} /// Nothing to compute here.
+
+    const String exchange_id;
+    const Names key_names;
+    const DataTypes hash_cast_types;
+    const size_t num_buckets;
+};
+
+}
diff --git a/src/Processors/Sinks/NativeCompressedSink.cpp b/src/Processors/Sinks/NativeCompressedSink.cpp
new file mode 100644
index 000000000000..4f0e5db2a802
--- /dev/null
+++ b/src/Processors/Sinks/NativeCompressedSink.cpp
@@ -0,0 +1,84 @@
+#include <Processors/Sinks/NativeCompressedSink.h>
+#include <Processors/Transforms/AggregatingTransform.h>
+#include <Core/ProtocolDefines.h>
+#include <IO/WriteHelpers.h>
+#include <Common/logger_useful.h>
+
+namespace DB
+{
+
+NativeCompressedSink::~NativeCompressedSink()
+{
+    writer.reset(); /// Drop the writer first: it was constructed over `*compressed_buf`.
+
+    if (compressed_buf && !compressed_buf->isFinalized()) /// Still not finalized, e.g. because onFinish was never reached: cancel instead.
+        compressed_buf->cancel();
+
+    if (!out.isFinalized()) /// Same for the underlying buffer, which this sink only references.
+        out.cancel();
+}
+
+void NativeCompressedSink::initWriterOnce()
+{
+    /// Nothing to set up for a header without columns (case of `SELECT count()`),
+    /// and nothing to redo once the writer exists.
+    if (input.getHeader().empty() || writer)
+        return;
+
+    /// The native writer emits blocks into a compression layer stacked on top of `out`.
+    compressed_buf = std::make_unique<CompressedWriteBuffer>(out);
+    writer = std::make_unique<NativeWriter>(*compressed_buf, DBMS_TCP_PROTOCOL_VERSION, input.getSharedHeader());
+}
+
+void NativeCompressedSink::consume(Chunk chunk)
+{
+    rows_written += chunk.getNumRows(); /// Counted for every chunk; for an empty header onFinish writes only this total.
+
+    if (input.getHeader().empty()) /// Blocks without columns will not be written, we will write total rows count only at the end.
+        return;
+
+    initWriterOnce();
+
+    LOG_TEST(log, "Writing chunk with {} rows to stream {}", chunk.getNumRows(), stream_name);
+
+    Block block = input.getHeader().cloneWithColumns(chunk.getColumns());
+    auto agg_info = chunk.getChunkInfos().get<AggregatedChunkInfo>(); /// Null-like when the chunk carries no aggregation metadata.
+
+    /// Prefix each block with a per-block flag so a stream may freely mix blocks with and without
+    /// aggregation metadata. A stream-level flag would lose or fabricate metadata on a mixed stream.
+    UInt64 block_flags = 0;
+    if (agg_info)
+        block_flags |= 1; /// Bit 0: the block is followed by chunk_num and carries aggregation info.
+    writeVarUInt(block_flags, *compressed_buf);
+
+    if (agg_info)
+    {
+        /// Carry most aggregation metadata in block.info so the reader can reconstruct AggregatedChunkInfo.
+        block.info.bucket_num = agg_info->bucket_num;
+        block.info.is_overflows = agg_info->is_overflows;
+        block.info.out_of_order_buckets = agg_info->out_of_order_buckets;
+        /// chunk_num has no BlockInfo field; write it next to the block so memory-bound merging can restore order.
+        writeVarUInt(agg_info->chunk_num, *compressed_buf);
+    }
+    writer->write(block); /// Per-block order in the stream: flags, optional chunk_num, then the native block.
+}
+
+void NativeCompressedSink::onFinish()
+{
+    if (!input.getHeader().empty())
+    {
+        initWriterOnce();    /// The writer does not exist yet if no chunk was ever consumed
+        writer->flush();
+        compressed_buf->finalize();
+    }
+    else
+    {
+        /// Column-less stream: the total row count is the only payload, written directly to `out`.
+        writeVarUInt(rows_written, out);
+    }
+    out.finalize();
+
+    LOG_TEST(log, "Finished writing to stream {}, total rows: {}, bytes: {}", stream_name, rows_written, out.count());
+}
+
+}
diff --git a/src/Processors/Sinks/NativeCompressedSink.h b/src/Processors/Sinks/NativeCompressedSink.h
new file mode 100644
index 000000000000..444360c81dad
--- /dev/null
+++ b/src/Processors/Sinks/NativeCompressedSink.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include <Processors/ISink.h>
+#include <Processors/Port.h>
+#include <Common/Logger.h>
+#include <Compression/CompressedWriteBuffer.h>
+#include <Formats/NativeWriter.h>
+
+namespace DB
+{
+
+class NativeCompressedSink final : public ISink /// Writes consumed chunks to `out` as compressed native blocks, or only a row count for a column-less header.
+{
+public:
+    NativeCompressedSink(SharedHeader header_, WriteBuffer & out_, const String & stream_name_)
+        : ISink(std::move(header_))
+        , stream_name(stream_name_)
+        , out(out_)
+        , rows_written(0)
+    {
+    }
+
+    ~NativeCompressedSink() override; /// Cancels buffers that were not finalized.
+
+    String getName() const override { return "NativeCompressedSink"; }
+
+    void consume(Chunk chunk) override;
+
+    void onFinish() override; /// Flushes and finalizes the buffers.
+
+private:
+    void initWriterOnce(); /// Creates `compressed_buf` and `writer` on first use; no-op for a column-less header.
+
+    const String stream_name; /// Used in log messages only.
+
+    WriteBuffer & out; /// Held by reference: the buffer must outlive the sink.
+    std::unique_ptr<CompressedWriteBuffer> compressed_buf; /// Compression layer over `out`; stays null for a column-less header.
+    std::unique_ptr<NativeWriter> writer; /// Writes blocks into `*compressed_buf`.
+    size_t rows_written; /// Total rows seen by consume().
+    LoggerPtr log = getLogger("NativeCompressedSink");
+};
+
+}
diff --git a/src/Processors/Sources/NativeCompressedSource.cpp b/src/Processors/Sources/NativeCompressedSource.cpp
new file mode 100644
index 000000000000..1e76990c4247
--- /dev/null
+++ b/src/Processors/Sources/NativeCompressedSource.cpp
@@ -0,0 +1,63 @@
+#include <Processors/Sources/NativeCompressedSource.h>
+#include <Processors/Transforms/AggregatingTransform.h>
+#include <Core/ProtocolDefines.h>
+#include <IO/ReadHelpers.h>
+#include <Common/logger_useful.h>
+
+namespace DB
+{
+
+Chunk NativeCompressedSource::generate()
+{
+    if (output.getHeader().empty())    /// No output columns? (case of `SELECT count()`)
+    {
+        if (!in) /// The row count was already consumed by a previous call.
+            return {};
+
+        /// We must read the count of rows. It is read from `in` directly, without a decompression layer.
+        size_t total_rows = 0;
+        readVarUInt(total_rows, *in);
+        in.reset(); /// Nothing more to read.
+        return Chunk(Columns{}, total_rows);
+    }
+    else
+    {
+        if (!reader) /// First call: set up decompression and the native reader. NOTE(review): `in` is dereferenced without a null check here.
+        {
+            compressed_buf = std::make_unique<CompressedReadBuffer>(*in);
+            reader = std::make_unique<NativeReader>(*compressed_buf, output.getHeader(), DBMS_TCP_PROTOCOL_VERSION);
+        }
+
+        /// Each block is prefixed with a per-block flag, and a chunk_num when it carries aggregation
+        /// metadata. At end of stream there is no prefix, so stop here, the same way NativeReader::read
+        /// stops on eof.
+        if (compressed_buf->eof())
+            return {};
+
+        UInt64 block_flags = 0;
+        readVarUInt(block_flags, *compressed_buf);
+        const bool has_aggregated_chunk_info = (block_flags & 1); /// Bit 0 marks a block that carries aggregation metadata.
+
+        UInt64 chunk_num = 0;
+        if (has_aggregated_chunk_info)
+            readVarUInt(chunk_num, *compressed_buf);
+
+        Block block = reader->read();
+
+        LOG_TEST(log, "Read chunk with {} rows from stream {}", block.rows(), stream_name);
+
+        Chunk result(block.getColumns(), block.rows());
+        if (has_aggregated_chunk_info)
+        {
+            auto info = std::make_shared<AggregatedChunkInfo>(); /// Rebuilt from block.info plus the separately stored chunk_num.
+            info->bucket_num = block.info.bucket_num;
+            info->is_overflows = block.info.is_overflows;
+            info->out_of_order_buckets = block.info.out_of_order_buckets;
+            info->chunk_num = chunk_num;
+            result.getChunkInfos().add(std::move(info));
+        }
+        return result;
+    }
+}
+
+}
diff --git a/src/Processors/Sources/NativeCompressedSource.h b/src/Processors/Sources/NativeCompressedSource.h
new file mode 100644
index 000000000000..4eb39e2ed70f
--- /dev/null
+++ b/src/Processors/Sources/NativeCompressedSource.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <Processors/ISource.h>
+#include <Common/Logger.h>
+#include <Compression/CompressedReadBuffer.h>
+#include <Formats/NativeReader.h>
+
+namespace DB
+{
+
+class NativeCompressedSource final : public ISource
+{
+public:
+    NativeCompressedSource(SharedHeader header_, std::unique_ptr<ReadBuffer> in_, const String & stream_name_)
+        : ISource(std::move(header_))
+        , stream_name(stream_name_)
+        , in(std::move(in_))
+    {
+    }
+
+    String getName() const override { return "NativeCompressedSource"; }
+
+private:
+    Chunk generate() override; /// Produces the next chunk read from `in`.
+
+    const String stream_name; /// Stream name reported in log messages.
+    /// Keep this order: `reader` reads from `compressed_buf`, which reads from `in`; both are created lazily by `generate`.
+    std::unique_ptr<ReadBuffer> in;
+    std::unique_ptr<CompressedReadBuffer> compressed_buf;
+    std::unique_ptr<NativeReader> reader;
+    LoggerPtr log = getLogger("NativeCompressedSource");
+};
+
+}
diff --git a/src/Processors/Sources/ReadFromDistributedPlanSource.cpp b/src/Processors/Sources/ReadFromDistributedPlanSource.cpp
new file mode 100644
index 000000000000..3be3b3529140
--- /dev/null
+++ b/src/Processors/Sources/ReadFromDistributedPlanSource.cpp
@@ -0,0 +1,75 @@
+#include <Processors/Sources/ReadFromDistributedPlanSource.h>
+#include <Processors/QueryPlan/QueryPlan.h>
+#include <QueryPipeline/DistributedPlanExecutor.h>
+#include <Interpreters/Context.h>
+#include <Processors/Chunk.h>
+#include <Common/CurrentThread.h>
+#include <Common/Exception.h>
+
+namespace DB
+{
+
+void ReadFromDistributedPlanSource::cleanupLocked()
+{
+    if (!cleaned_up)
+    {
+        cleaned_up = true; /// Set before the call: a cleanup() that throws must not be retried.
+        if (distributed_query_executor)
+            distributed_query_executor->cleanup();
+    }
+}
+
+std::optional<Chunk> ReadFromDistributedPlanSource::tryGenerate()
+{
+    std::lock_guard lock(executor_mutex); /// Serializes with `onCancel`, which takes the same mutex.
+
+    /// Cancelled (via onCancel) or already finished - stop without launching/continuing work.
+    if (cleaned_up || *cancellation_flag)
+    {
+        cleanupLocked();
+        return std::nullopt;
+    }
+
+    try
+    {
+        if (!started)
+        {
+            started = true;
+            distributed_query_executor = createDistributedQueryExecutor(
+                unique_query_id, distributed_query_plan, task_to_host_map, CurrentThread::tryGetQueryContext(), cancellation_flag);
+            distributed_query_executor->start();
+        }
+        /// A true result from `execute` is treated as completion: tear down and report end of data.
+        if (distributed_query_executor->execute())
+        {
+            cleanupLocked();
+            return std::nullopt;
+        }
+    }
+    catch (...)
+    {
+        cleanupLocked(); /// Tear the executor down before the exception propagates.
+        throw;
+    }
+    /// Not finished yet: return an empty chunk rather than nullopt (presumably so ISource polls again - confirm).
+    return Chunk();
+}
+
+void ReadFromDistributedPlanSource::onCancel() noexcept
+{
+    /// Signal first (lock-free) so an in-flight start()/execute() returns promptly, then tear down
+    /// under the lock. Without active cleanup, cancellation is only seen on the next tryGenerate,
+    /// which may never come once the pipeline is cancelled.
+    *cancellation_flag = true;
+    try
+    {
+        std::lock_guard lock(executor_mutex);
+        cleanupLocked();
+    }
+    catch (...)
+    {
+        tryLogCurrentException(__PRETTY_FUNCTION__); /// The function is noexcept, so log instead of rethrowing.
+    }
+}
+
+}
diff --git a/src/Processors/Sources/ReadFromDistributedPlanSource.h b/src/Processors/Sources/ReadFromDistributedPlanSource.h
new file mode 100644
index 000000000000..680ab1cd6cb7
--- /dev/null
+++ b/src/Processors/Sources/ReadFromDistributedPlanSource.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <memory>
+#include <mutex>
+#include <Processors/QueryPlan/QueryPlan.h>
+#include <Processors/ISource.h>
+#include <Core/Types_fwd.h>
+#include <QueryPipeline/DistributedPlanExecutor.h>
+
+namespace DB
+{
+
+class TaskToHostMap;
+using TaskToHostMapPtr = std::shared_ptr<const TaskToHostMap>;
+
+/// Wraps distributed plan execution: creates the executor on the first `tryGenerate` call, drives it, tears it down once.
+/// It is used in pipeline on distributed query initiator.
+class ReadFromDistributedPlanSource final : public ISource
+{
+public:
+    ReadFromDistributedPlanSource(
+        SharedHeader header_,
+        const UUID & unique_query_id_,
+        DistributedQueryPlan distributed_query_plan_,
+        TaskToHostMapPtr task_to_host_map_)
+        : ISource(std::move(header_))
+        , unique_query_id(unique_query_id_)
+        , distributed_query_plan(std::move(distributed_query_plan_))
+        , task_to_host_map(std::move(task_to_host_map_))
+    {
+    }
+
+    String getName() const override { return "ReadFromDistributedPlanSource"; }
+
+private:
+    std::optional<Chunk> tryGenerate() override; /// Returns nullopt once finished or cancelled.
+    void onCancel() noexcept override; /// Raises `cancellation_flag`, then tears the executor down under the mutex.
+
+    /// Tear down the executor once. Must be called with `executor_mutex` held.
+    void cleanupLocked();
+
+    const UUID unique_query_id;
+    const DistributedQueryPlan distributed_query_plan;
+    TaskToHostMapPtr task_to_host_map;
+
+    /// Guards the executor lifecycle (create/start/execute/cleanup) so that `onCancel`,
+    /// which may run on another thread, never races with `tryGenerate`.
+    std::mutex executor_mutex;
+    std::unique_ptr<DistributedQueryPlanExecutor> distributed_query_executor; /// Null until the first `tryGenerate` call.
+    bool started = false; /// Set right before the executor is created, so creation is attempted only once.
+    bool cleaned_up = false; /// Set once teardown has been attempted; `tryGenerate` then returns nullopt.
+
+    /// Set from `onCancel` (and observed by the executor) to stop remote work promptly.
+    std::shared_ptr<std::atomic<bool>> cancellation_flag = std::make_shared<std::atomic<bool>>(false);
+};
+
+}
diff --git a/src/Processors/Transforms/ScatterByPartitionTransform.cpp b/src/Processors/Transforms/ScatterByPartitionTransform.cpp
index ecae587c637d..09451314ad21 100644
--- a/src/Processors/Transforms/ScatterByPartitionTransform.cpp
+++ b/src/Processors/Transforms/ScatterByPartitionTransform.cpp
@@ -1,17 +1,34 @@
 #include <Columns/IColumn.h>
 #include <Core/ColumnNumbers.h>
+#include <Interpreters/castColumn.h>
 #include <Processors/Port.h>
 #include <Processors/Transforms/ScatterByPartitionTransform.h>
+#include <Common/Exception.h>
 #include <Common/PODArray.h>
 
 namespace DB
 {
-ScatterByPartitionTransform::ScatterByPartitionTransform(SharedHeader header, size_t output_size_, ColumnNumbers key_columns_)
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+ScatterByPartitionTransform::ScatterByPartitionTransform(SharedHeader header, size_t output_size_, ColumnNumbers key_columns_, DataTypes hash_cast_types_)
     : IProcessor(InputPorts{header}, OutputPorts{output_size_, header})
     , output_size(output_size_)
     , key_columns(std::move(key_columns_))
+    , hash_cast_types(std::move(hash_cast_types_)) /// Empty means "hash the keys as they are".
     , hash(0)
-{}
+{
+    if (!hash_cast_types.empty() && hash_cast_types.size() != key_columns.size())
+        throw Exception(ErrorCodes::LOGICAL_ERROR,
+            "ScatterByPartitionTransform: hash_cast_types size ({}) does not match key columns size ({})",
+            hash_cast_types.size(), key_columns.size());
+    /// Remember each key column's own type: hashing casts a key only when the requested type differs from it.
+    hash_input_types.reserve(key_columns.size());
+    for (const auto & column_number : key_columns)
+        hash_input_types.push_back(header->getByPosition(column_number).type);
+}
 
 IProcessor::Status ScatterByPartitionTransform::prepare()
 {
@@ -137,8 +154,18 @@ void ScatterByPartitionTransform::generateOutputChunks()
 
     hash.reset(num_rows);
 
-    for (const auto & column_number : key_columns)
-        hash.update(columns[column_number]->getWeakHash32());
+    for (size_t i = 0; i < key_columns.size(); ++i)
+    {
+        const auto & column = columns[key_columns[i]];
+        const auto & cast_type = hash_cast_types.empty() ? nullptr : hash_cast_types[i];
+        if (cast_type && !cast_type->equals(*hash_input_types[i]))
+        {
+            auto casted = castColumn({column, hash_input_types[i], ""}, cast_type);
+            hash.update(casted->getWeakHash32());
+        }
+        else
+            hash.update(column->getWeakHash32());
+    }
 
     const PaddedPODArray<UInt32> & hash_data = hash.getData();
     IColumn::Selector selector(num_rows);
diff --git a/src/Processors/Transforms/ScatterByPartitionTransform.h b/src/Processors/Transforms/ScatterByPartitionTransform.h
index 5cfa673b1ec8..c7a1c2488a45 100644
--- a/src/Processors/Transforms/ScatterByPartitionTransform.h
+++ b/src/Processors/Transforms/ScatterByPartitionTransform.h
@@ -1,6 +1,7 @@
 #pragma once
 #include <Core/Block.h>
 #include <Core/ColumnNumbers.h>
+#include <DataTypes/IDataType.h>
 #include <Processors/Chunk.h>
 #include <Processors/IProcessor.h>
 #include <Common/WeakHash.h>
@@ -10,7 +11,9 @@ namespace DB
 
 struct ScatterByPartitionTransform : IProcessor
 {
-    ScatterByPartitionTransform(SharedHeader header, size_t output_size_, ColumnNumbers key_columns_);
+    /// `hash_cast_types` (one entry per key, optional) selects a type to cast each key to
+    /// before hashing. Casting is internal to routing; output rows are unchanged.
+    ScatterByPartitionTransform(SharedHeader header, size_t output_size_, ColumnNumbers key_columns_, DataTypes hash_cast_types_ = {});
 
     String getName() const override { return "ScatterByPartitionTransform"; }
 
@@ -23,6 +26,8 @@ struct ScatterByPartitionTransform : IProcessor
 
     size_t output_size;
     ColumnNumbers key_columns;
+    DataTypes hash_input_types;
+    DataTypes hash_cast_types;
 
     bool has_data = false;
     bool all_outputs_processed = true;
diff --git a/src/QueryPipeline/DistributedPlanExecutor.cpp b/src/QueryPipeline/DistributedPlanExecutor.cpp
new file mode 100644
index 000000000000..8e8d793a6904
--- /dev/null
+++ b/src/QueryPipeline/DistributedPlanExecutor.cpp
@@ -0,0 +1,1523 @@
+#include <condition_variable>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <Common/scope_guard_safe.h>
+#include <Common/DequeWithMemoryTracking.h>
+#include <Common/getMultipleKeysFromConfig.h>
+#include <Common/MapWithMemoryTracking.h>
+#include <Common/UnorderedMapWithMemoryTracking.h>
+#include <Common/UnorderedSetWithMemoryTracking.h>
+#include <Common/VectorWithMemoryTracking.h>
+#include <QueryPipeline/DistributedPlanExecutor.h>
+#include <QueryPipeline/QueryPipelineBuilder.h>
+#include <QueryPipeline/QueryPlanResourceHolder.h>
+#include <QueryPipeline/printPipeline.h>
+#include <Processors/QueryPlan/BuildQueryPipelineSettings.h>
+#include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h>
+#include <Processors/QueryPlan/IParameterLookup.h>
+#include <Processors/QueryPlan/TemporaryFiles.h>
+#include <Processors/QueryPlan/ExchangeLookup.h>
+#include <Processors/QueryPlan/QueryPlan.h>
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+#include <Processors/Executors/CompletedPipelineExecutor.h>
+#include <Processors/Executors/PullingPipelineExecutor.h>
+#include <Processors/Sinks/NativeCompressedSink.h>
+#include <Common/ThreadStatus.h>
+#include <Common/ThreadGroupSwitcher.h>
+#include <Common/QueryScope.h>
+#include <Processors/Sources/NativeCompressedSource.h>
+#include <Planner/Utils.h>
+#include <Disks/DiskObjectStorage/ObjectStorages/ObjectStorageFactory.h>
+#include <Core/ProtocolDefines.h>
+#include <IO/WriteBufferFromString.h>
+#include <IO/WriteBufferFromFileBase.h>
+#include <IO/ReadBufferFromString.h>
+#include <Poco/URI.h>
+#include <Server/StatelessWorker/StatelessWorkerClient.h>
+#include <Server/DistributedQuery/StreamingExchangeLookup.h>
+#include <Interpreters/Cluster.h>
+#include <Interpreters/Context.h>
+#include <Interpreters/ProcessList.h>
+#include <Interpreters/ProcessorsProfileLog.h>
+#include <Interpreters/executeQuery.h>
+#include <Parsers/ASTSelectQuery.h>
+#include <Common/Exception.h>
+#include <Common/Stopwatch.h>
+#include <Common/CurrentMetrics.h>
+#include <Common/CurrentThread.h>
+#include <Common/ThreadPool.h>
+#include <Common/logger_useful.h>
+#include <Common/setThreadName.h>
+#include <Core/Settings.h>
+#include <base/defines.h>
+#include <base/getFQDNOrHostName.h>
+
+
+namespace CurrentMetrics
+{
+    extern const Metric TaskTrackerThreads;
+    extern const Metric TaskTrackerThreadsActive;
+    extern const Metric TaskTrackerThreadsScheduled;
+}
+
+
+namespace DB
+{
+
+namespace Setting
+{
+    extern const SettingsBool distributed_plan_execute_locally;
+}
+
+namespace ErrorCodes
+{
+    extern const int SUPPORT_IS_DISABLED;
+    extern const int LOGICAL_ERROR;
+    extern const int RECEIVED_ERROR_FROM_REMOTE_IO_SERVER;
+    extern const int QUERY_WAS_CANCELLED;
+    extern const int INVALID_CONFIG_PARAMETER;
+}
+
+/// Looks up query plan parameters by name in a private copy of `QueryPlanParameters`; unknown names are a LOGICAL_ERROR.
+class TaskParameters : public IParameterLookup
+{
+public:
+    explicit TaskParameters(const QueryPlanParameters & parameters_)
+        : parameters(parameters_)
+    {}
+
+    Field getParameter(const String & name) const override
+    {
+        const auto & known = parameters.parameters;
+        if (const auto found = known.find(name); found != known.end())
+            return found->second;
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Parameter {} not found", name);
+    }
+
+private:
+    const QueryPlanParameters parameters;
+};
+
+
+/// Opens read and write buffers for temporary files in object storage, addressed by logical file name only.
+class TemporaryFilesInObjectStorage : public ITemporaryFileLookup
+{
+public:
+    TemporaryFilesInObjectStorage(ObjectStoragePtr object_storage_, const String & object_storage_path_,
+        const Strings & input_temporary_files_, const Strings & output_temporary_files_)
+        : object_storage(std::move(object_storage_))
+        , object_storage_path(object_storage_path_)
+        , input_temporary_files(input_temporary_files_.begin(), input_temporary_files_.end())
+        , output_temporary_files(output_temporary_files_.begin(), output_temporary_files_.end())
+    {
+    }
+
+    WriteBuffer & getTemporaryFileForWriting(const String & file_name) override
+    {
+        LOG_DEBUG(logger, "Writing to temporary file '{}'", file_name);
+        if (!output_temporary_files.contains(file_name))
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected output temporary file requested: '{}'", file_name);
+        /// The write buffer stays owned by this lookup (in `write_buffers`); the caller only borrows it.
+        StoredObject target(object_storage_path + "/" + file_name, file_name);
+        write_buffers.emplace_back(object_storage->writeObject(target, WriteMode::Rewrite));
+        return *write_buffers.back();
+    }
+
+    std::unique_ptr<ReadBuffer> getTemporaryFileForReading(const String & file_name) override
+    {
+        LOG_TRACE(logger, "Reading from temporary file '{}'", file_name);
+        if (!input_temporary_files.contains(file_name))
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected input temporary file requested: '{}'", file_name);
+        /// Unlike write buffers, the read buffer is handed over to the caller.
+        StoredObject origin(object_storage_path + "/" + file_name, file_name);
+        return object_storage->readObject(origin, {});
+    }
+
+private:
+    ObjectStoragePtr object_storage;
+    const String object_storage_path;
+    const UnorderedSetWithMemoryTracking<String> input_temporary_files;
+    const UnorderedSetWithMemoryTracking<String> output_temporary_files;
+    VectorWithMemoryTracking<std::unique_ptr<WriteBuffer>> write_buffers;
+    LoggerPtr logger = getLogger("TemporaryFilesInObjectStorage");
+};
+
+
+/// Exchange lookup for Persisted exchanges: each exchange stream is a temporary file named after the stream id.
+class ExchangeViaTemporaryFiles : public IExchangeLookup
+{
+public:
+    explicit ExchangeViaTemporaryFiles(TemporaryFileLookupPtr temporary_files_)
+        : temporary_files(std::move(temporary_files_))
+    {
+    }
+
+    std::shared_ptr<ISink> createSink(SharedHeader input_header, const ExchangeStreamId & exchange_stream_id) override
+    {
+        if (!temporary_files)
+            throw Exception(
+                ErrorCodes::SUPPORT_IS_DISABLED,
+                "Object storage for Persisted exchanges is not configured, exchange stream id: {}",
+                exchange_stream_id.toString());
+
+        auto file_name = exchange_stream_id.toString();
+        return std::make_shared<NativeCompressedSink>(input_header, temporary_files->getTemporaryFileForWriting(file_name), file_name);
+    }
+
+    std::shared_ptr<ISource> createSource(SharedHeader output_header, const ExchangeStreamId & exchange_stream_id) override
+    {
+        if (!temporary_files)
+            throw Exception(
+                ErrorCodes::SUPPORT_IS_DISABLED,
+                "Object storage for Persisted exchanges is not configured, exchange stream id: {}",
+                exchange_stream_id.toString());
+
+        auto file_name = exchange_stream_id.toString();
+        return std::make_shared<NativeCompressedSource>(output_header, temporary_files->getTemporaryFileForReading(file_name), file_name);
+    }
+
+private:
+    TemporaryFileLookupPtr temporary_files; /// May be null (no object storage); both factory methods then throw.
+};
+
+/// Simple implementation of streaming exchange for local execution.
+/// It just holds a queue of chunks in memory.
+class InMemoryExchange : boost::noncopyable
+{
+public:
+    explicit InMemoryExchange(const String & name_)
+        : name(name_)
+    {
+    }
+
+    void appendChunk(Chunk chunk)
+    {
+        LOG_TEST(log, "Appending chunk to exchange '{}', rows {}", name, chunk.getNumRows());
+        /// Queue the chunk (an empty one is the sink's end-of-data marker) and wake one waiting reader.
+        std::lock_guard lock(mutex);
+        chunks.emplace_back(std::move(chunk));
+        has_data.notify_one();
+    }
+
+    /// Wake any waiter so it stops instead of blocking forever for a chunk that will never arrive
+    /// (the producing task was cancelled before sending the end-of-data marker).
+    void cancel()
+    {
+        std::lock_guard lock(mutex);
+        cancelled = true;
+        has_data.notify_all();
+    }
+
+    Chunk getChunk()
+    {
+        LOG_TEST(log, "Waiting for chunk from exchange '{}'", name);
+        /// Block until a chunk is queued or the exchange is cancelled; queued chunks are still returned after cancellation.
+        Chunk chunk;
+        {
+            std::unique_lock lock(mutex);
+            has_data.wait(lock, [this] { return !chunks.empty() || cancelled; });
+            if (chunks.empty())
+                return {};   /// Cancelled: report end of data.
+            chunk = std::move(chunks.front());
+            chunks.pop_front();
+        }
+
+        LOG_TEST(log, "Got chunk from exchange '{}', rows {}", name, chunk.getNumRows());
+
+        return chunk;
+    }
+
+private:
+    LoggerPtr log = getLogger("InMemoryExchange");
+    String name; /// Exchange name, used in log messages.
+    std::mutex mutex;
+    std::condition_variable has_data; /// Notified when a chunk is queued or the exchange is cancelled.
+    DequeWithMemoryTracking<Chunk> chunks;
+    bool cancelled = false; /// Written and read only under `mutex`.
+};
+
+using InMemoryExchangePtr = std::shared_ptr<InMemoryExchange>;
+
+
+/// A map of in-memory exchanges addressed by their logical names
+class InMemoryExchanges : boost::noncopyable
+{
+public:
+    /// Returns the exchange registered under (`query_id`, `exchange_id`), creating it on first request.
+    InMemoryExchangePtr getExchange(const String & query_id, const String & exchange_id)
+    {
+        std::lock_guard lock(mutex);
+        auto & slot = exchanges_by_query_id[query_id][exchange_id];
+        if (slot == nullptr)
+            slot = std::make_shared<InMemoryExchange>(exchange_id);
+        return slot;
+    }
+
+    /// Cancel every exchange of the query so waiting tasks unblock. The exchanges stay in the
+    /// registry so a result reader that looks one up afterwards still finds the produced chunks and
+    /// their end-of-data marker; removeQuery drops them once the whole query pipeline is destroyed.
+    void cancelQuery(const String & query_id)
+    {
+        std::lock_guard lock(mutex);
+        if (auto found = exchanges_by_query_id.find(query_id); found != exchanges_by_query_id.end())
+            for (auto & [_, exchange] : found->second)
+                exchange->cancel();
+    }
+
+    /// Drop the query's exchanges from the registry. Called when the query pipeline is destroyed.
+    void removeQuery(const String & query_id)
+    {
+        std::lock_guard lock(mutex);
+        exchanges_by_query_id.erase(query_id);
+    }
+
+    /// The single registry instance, created on first use.
+    static std::shared_ptr<InMemoryExchanges> instance()
+    {
+        static std::shared_ptr<InMemoryExchanges> self = std::make_shared<InMemoryExchanges>();
+        return self;
+    }
+
+private:
+    using InMemoryExchangeMap = UnorderedMapWithMemoryTracking<String, InMemoryExchangePtr>;
+
+    UnorderedMapWithMemoryTracking<String, InMemoryExchangeMap> exchanges_by_query_id TSA_GUARDED_BY(mutex);
+    std::mutex mutex;
+};
+
+/// Exchange lookup that keeps every exchange stream of one query in an in-process `InMemoryExchange` queue.
+class ExchangeViaChunks : public IExchangeLookup
+{
+public:
+    explicit ExchangeViaChunks(const String & query_id_)
+        : query_id(query_id_)
+    {
+    }
+
+    std::shared_ptr<ISink> createSink(SharedHeader input_header, const ExchangeStreamId & exchange_stream_id) override
+    {
+        auto queue = InMemoryExchanges::instance()->getExchange(query_id, exchange_stream_id.toString());
+        return std::make_shared<SinkFromInMemoryExchange>(input_header, queue);
+    }
+
+    std::shared_ptr<ISource> createSource(SharedHeader output_header, const ExchangeStreamId & exchange_stream_id) override
+    {
+        auto queue = InMemoryExchanges::instance()->getExchange(query_id, exchange_stream_id.toString());
+        return std::make_shared<SourceFromInMemoryExchange>(output_header, queue);
+    }
+
+private:
+    class SinkFromInMemoryExchange final : public ISink
+    {
+    public:
+        SinkFromInMemoryExchange(SharedHeader header_, InMemoryExchangePtr exchange_)
+            : ISink(header_)
+            , exchange(std::move(exchange_))
+        {
+        }
+
+        String getName() const override { return "SinkFromInMemoryExchange"; }
+
+        void consume(Chunk chunk) override
+        {
+            exchange->appendChunk(std::move(chunk));
+        }
+
+        /// An empty chunk is the end-of-data marker for the reading side.
+        void onFinish() override
+        {
+            exchange->appendChunk(Chunk());
+        }
+
+    private:
+        InMemoryExchangePtr exchange;
+    };
+
+    class SourceFromInMemoryExchange final : public ISource
+    {
+    public:
+        SourceFromInMemoryExchange(SharedHeader header_, InMemoryExchangePtr exchange_)
+            : ISource(header_)
+            , exchange(std::move(exchange_))
+        {
+        }
+
+        String getName() const override { return "SourceFromInMemoryExchange"; }
+
+        Chunk generate() override
+        {
+            return exchange->getChunk();
+        }
+    private:
+        InMemoryExchangePtr exchange;
+    };
+
+    const String query_id;
+};
+
+
+/// Routes each exchange stream to the lookup matching the kind of its exchange: Persisted or Streaming.
+class AllKindsExchangeLookup : public IExchangeLookup
+{
+public:
+    AllKindsExchangeLookup(
+        const ExchangeDescriptions & exchanges_,
+        ExchangeLookupPtr persistent_exchange_lookup_,
+        ExchangeLookupPtr streaming_exchange_lookup_)
+        : exchanges(exchanges_)
+        , persistent_exchange_lookup(std::move(persistent_exchange_lookup_))
+        , streaming_exchange_lookup(std::move(streaming_exchange_lookup_))
+    {
+    }
+
+    /// Both factory methods delegate to the lookup chosen by `lookupFor`.
+    std::shared_ptr<ISink> createSink(SharedHeader input_header, const ExchangeStreamId & exchange_stream_id) override
+    {
+        return lookupFor(exchange_stream_id)->createSink(input_header, exchange_stream_id);
+    }
+
+    std::shared_ptr<ISource> createSource(SharedHeader output_header, const ExchangeStreamId & exchange_stream_id) override
+    {
+        return lookupFor(exchange_stream_id)->createSource(output_header, exchange_stream_id);
+    }
+
+private:
+    /// Picks the delegate for the exchange that owns `exchange_stream_id`.
+    /// Throws LOGICAL_ERROR if the exchange is not described or its kind is neither Persisted nor Streaming.
+    const ExchangeLookupPtr & lookupFor(const ExchangeStreamId & exchange_stream_id) const
+    {
+        const auto description = exchanges.find(exchange_stream_id.exchange_id);
+        if (description == exchanges.end())
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown exchange '{}'", exchange_stream_id.exchange_id);
+
+        const auto kind = description->second.kind;
+        if (kind == ExchangeDescription::Kind::Persisted)
+            return persistent_exchange_lookup;
+        /// createExchangeLookup passes a null streaming lookup only when no described exchange is Streaming.
+        if (kind == ExchangeDescription::Kind::Streaming)
+            return streaming_exchange_lookup;
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown exchange kind '{}'", static_cast<int>(kind));
+    }
+
+    const ExchangeDescriptions exchanges;
+    ExchangeLookupPtr persistent_exchange_lookup;
+    ExchangeLookupPtr streaming_exchange_lookup;
+};
+
+/// Cleans up temporary files produced by distributed query execution.
+class TemporaryFilesInObjectStorageCleaner : public ICustomResourceHolder
+{
+public:
+    TemporaryFilesInObjectStorageCleaner(ObjectStoragePtr object_storage_, const String & object_storage_path_,
+        const Strings & temporary_files_)
+        : object_storage(std::move(object_storage_))
+        , object_storage_path(object_storage_path_)
+        , temporary_files(temporary_files_.begin(), temporary_files_.end())
+    {
+    }
+
+    ~TemporaryFilesInObjectStorageCleaner() override
+    {
+        /// TODO: add them to some background cleanup queue to avoid garbage in case of exceptions?
+        try
+        {
+            cleanup();
+        }
+        catch (...)
+        {
+            /// Best effort: a failed removal is only logged, never propagated out of the destructor.
+            tryLogCurrentException(__PRETTY_FUNCTION__);
+        }
+    }
+
+    /// Issues a single `removeObjectsIfExist` request covering every registered temporary file.
+    void cleanup()
+    {
+        StoredObjects objects_to_remove;
+        for (const auto & file_name : temporary_files)
+            objects_to_remove.emplace_back(object_storage_path + "/" + file_name, file_name);
+
+        LOG_TRACE(getLogger("TemporaryFilesInObjectStorageCleaner"), "Removing temporary files at path {} : [{}]",
+            object_storage_path, fmt::join(temporary_files, ", "));
+        object_storage->removeObjectsIfExist(objects_to_remove);
+    }
+
+private:
+    ObjectStoragePtr object_storage;
+    const String object_storage_path;
+    const UnorderedSetWithMemoryTracking<String> temporary_files;
+};
+
+std::shared_ptr<ICustomResourceHolder> makeTemporaryFilesCleaner(ObjectStoragePtr object_storage_, const String & object_storage_path_, const Strings & temporary_files_)
+{
+    return std::make_shared<TemporaryFilesInObjectStorageCleaner>(std::move(object_storage_), object_storage_path_, temporary_files_); /// By-value sink parameter: move it, do not copy the shared_ptr.
+}
+
+/// Drops the query's in-memory exchanges from the registry when the query pipeline is destroyed. The result reader
+/// drains final_result after the executor (the driver) has finished, so removal cannot follow executor completion.
+class InMemoryExchangesCleaner : public ICustomResourceHolder
+{
+public:
+    explicit InMemoryExchangesCleaner(String query_id_) : query_id(std::move(query_id_)) {}
+
+    ~InMemoryExchangesCleaner() override
+    {
+        try
+        {
+            const auto registry = InMemoryExchanges::instance();
+            registry->removeQuery(query_id);
+        }
+        catch (...)
+        {
+            tryLogCurrentException(__PRETTY_FUNCTION__);
+        }
+    }
+
+private:
+    const String query_id;
+};
+
+std::shared_ptr<ICustomResourceHolder> makeInMemoryExchangesCleaner(const String & query_id)
+{
+    return std::make_shared<InMemoryExchangesCleaner>(query_id); /// The exchanges are dropped when this holder is destroyed.
+}
+
+TemporaryFileLookupPtr createTemporaryFilesLookup(ObjectStoragePtr object_storage_, const String & object_storage_path_,
+    const Strings & input_temporary_files_, const Strings & output_temporary_files_)
+{
+    if (!object_storage_)
+        return nullptr; /// No object storage: callers get a null lookup.
+    /// `object_storage_` is a by-value sink parameter, so hand it over instead of copying the shared_ptr.
+    return std::make_shared<TemporaryFilesInObjectStorage>(std::move(object_storage_), object_storage_path_, input_temporary_files_, output_temporary_files_);
+}
+
+/// `query_id` must be the node-independent distributed query id: it keys the in-memory and streaming
+/// exchanges, so producers and consumers on different nodes (and the cleanup paths) must agree on it.
+/// It must not embed any node-local object-storage subpath, which would differ between nodes.
+ExchangeLookupPtr createExchangeLookup(
+    const String & query_id,
+    const ExchangeDescriptions & exchanges_,
+    const ExchangeStreamSources & exchange_stream_sources,
+    TemporaryFileLookupPtr temporary_files_,
+    ContextPtr context)
+{
+    bool run_locally = context->getSettingsRef()[Setting::distributed_plan_execute_locally];
+    if (run_locally)
+    {
+        LOG_DEBUG(getLogger("createExchangeLookup"), "`distributed_plan_execute_locally` setting is enabled, using in-memory queues for all exchanges");
+        return std::make_shared<ExchangeViaChunks>(query_id);
+    }
+
+    auto persisted_exchanges = std::make_shared<ExchangeViaTemporaryFiles>(temporary_files_); /// A null lookup is accepted here; it throws only when used.
+
+    bool has_streaming_exchange = false;
+    for (const auto & [exchange_id, exchange] : exchanges_)
+        if (exchange.kind == ExchangeDescription::Kind::Streaming)
+        {
+            has_streaming_exchange = true;
+            break;
+        }
+
+    /// Persisted exchanges only need the temporary-file lookup, so a plan where every exchange
+    /// is Persisted runs without a streaming transport (and on any platform). The streaming
+    /// port and lookup are required only when the plan actually contains a Streaming exchange.
+    if (!has_streaming_exchange)
+    {
+        UNUSED(exchange_stream_sources);
+        return std::make_shared<AllKindsExchangeLookup>(exchanges_, persisted_exchanges, /*streaming_exchange_lookup=*/nullptr);
+    }
+
+#ifdef OS_LINUX
+    auto streaming_exchange_port = context->getConfigRef().getUInt("distributed_query.streaming_exchange_port", 0); /// 0 (also the default) means "not configured".
+    if (streaming_exchange_port == 0)
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+            "Streaming exchange requires `distributed_query.streaming_exchange_port` to be configured; "
+            "set the port, force `distributed_plan_force_exchange_kind = 'Persisted'`, or enable "
+            "`distributed_plan_execute_locally` for in-process testing");
+    if (streaming_exchange_port > 65535)
+        throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER,
+            "`distributed_query.streaming_exchange_port` must be in range 1..65535, got {}", streaming_exchange_port);
+
+    /// The listener starts only when a listen host is also configured, so streaming peers are
+    /// unreachable without one. Reject here instead of connecting to a listener that never started.
+    if (getMultipleValuesFromConfig(context->getConfigRef(), "distributed_query", "streaming_exchange_listen_host").empty())
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+            "Streaming exchange requires `distributed_query.streaming_exchange_listen_host` to be configured; "
+            "set it, force `distributed_plan_force_exchange_kind = 'Persisted'`, or enable "
+            "`distributed_plan_execute_locally` for in-process testing");
+
+    auto streaming_exchanges = createStreamingExchangeLookup(
+        query_id, ExchangeConnections::instance(), exchange_stream_sources, static_cast<UInt16>(streaming_exchange_port)); /// The cast is safe: the range was checked above.
+    return std::make_shared<AllKindsExchangeLookup>(exchanges_, persisted_exchanges, streaming_exchanges);
+#else
+    UNUSED(exchange_stream_sources);
+    throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+        "Streaming exchanges are only supported on Linux; "
+        "use `distributed_plan_force_exchange_kind = 'Persisted'`");
+#endif
+}
+
+
+static String serializeQueryPlan(const QueryPlan & query_plan)
+{
+    WriteBufferFromOwnString out; /// In-memory buffer; its contents are returned as the serialized plan.
+    query_plan.serialize(out, DBMS_QUERY_PLAN_SERIALIZATION_VERSION);
+    return out.str();
+}
+
+static QueryPlan deserializeQueryPlan(const String & serialized_query_plan, ContextPtr context)
+{
+    /// Two steps: decode the plan together with its sets, then pass the result through `QueryPlan::makeSets`.
+    ReadBufferFromString buffer(serialized_query_plan);
+    return QueryPlan::makeSets(QueryPlan::deserialize(buffer, context), context);
+}
+
+/// Executes one task of a distributed query: deserializes the plan fragment, builds a completed pipeline with the
+/// task's lookups (temporary files, parameters, exchanges), runs it, and reports it via logQueryStart/Finish/Exception.
+void doExecuteTask(const DistributedQueryTaskDescription & task_description, ObjectStoragePtr object_storage,
+    const String & object_storage_path, const String & distributed_query_id, ContextMutablePtr context,
+    std::function<bool()> is_cancelled, ProgressCallback progress_callback)
+{
+    Stopwatch execute_task_watch; /// Passed to logQueryException if the task fails.
+    const auto & task = task_description.task;
+
+    std::shared_ptr<OpenTelemetry::SpanHolder> query_span = std::make_shared<OpenTelemetry::SpanHolder>(task.task_id);
+
+    auto logger = Poco::Logger::getShared("executeDistributedQuery");
+
+    /// Disable the query condition cache: its per-worker state could make workers read inconsistent data for the same fragment.
+    context->setSetting("use_query_condition_cache", false);
+
+    Strings input_exchange_streams;
+    for (const auto & stream_id : task.input_exchange_streams)
+        input_exchange_streams.push_back(stream_id.toString());
+
+    Strings output_exchange_streams;
+    for (const auto & stream_id : task.output_exchange_streams)
+        output_exchange_streams.push_back(stream_id.toString());
+
+    LOG_TRACE(logger, "Task '{}' input exchange streams: [{}], output exchange streams: [{}]",
+        task.task_id, fmt::join(input_exchange_streams, ", "), fmt::join(output_exchange_streams, ", "));
+
+#ifdef OS_LINUX
+    /// Release this task's pending streaming exchange connections on the worker when it ends. A
+    /// consumer that never connects (e.g. its query was cancelled) would otherwise leave them behind.
+    /// Only this task's output streams are dropped, so sibling tasks of the same query are unaffected.
+    SCOPE_EXIT_SAFE(ExchangeConnections::instance()->removePendingStreams(distributed_query_id, output_exchange_streams));
+#endif
+
+    auto temporary_files = createTemporaryFilesLookup(
+        object_storage, object_storage_path, input_exchange_streams, output_exchange_streams);
+
+    auto pipeline_settings = BuildQueryPipelineSettings(context);
+    pipeline_settings.temporary_file_lookup = temporary_files;
+    pipeline_settings.parameter_lookup = std::make_shared<TaskParameters>(task.parameters);
+    pipeline_settings.exchange_lookup = createExchangeLookup(
+        distributed_query_id,
+        task_description.exchanges,
+        task_description.exchange_stream_sources,
+        temporary_files,
+        context);
+
+    auto optimization_settings = QueryPlanOptimizationSettings(context);
+
+    /// Disable stats-driven plan-shape rewrites on the worker side: per-worker stats can diverge and produce incompatible
+    /// plans across workers (e.g. one swaps the join sides while the others don't), breaking exchange partitioning.
+    optimization_settings.join_swap_table = std::make_optional(false);
+    optimization_settings.query_plan_optimize_join_order_limit = 0;
+    optimization_settings.query_plan_optimize_join_order_randomize = 0;
+    optimization_settings.convert_join_to_in = false;
+    optimization_settings.convert_outer_join_to_inner_join = false;
+    optimization_settings.convert_any_join_to_semi_or_anti_join = false;
+    optimization_settings.merge_filter_into_join_condition = false;
+    optimization_settings.top_k_through_join = false;
+
+    /// The fragment's read is bucketed; re-introducing the implicit count projection would count the
+    /// whole part per bucket. Keep it off so counts read the bucket's mark ranges.
+    optimization_settings.optimize_use_implicit_projections = false;
+
+    QueryPipeline pipeline; /// Declared outside the block below so it outlives the deserialized plan.
+
+    {
+        QueryPlan query_plan = deserializeQueryPlan(task_description.serialized_query_plan, context);
+
+        auto builder = query_plan.buildQueryPipeline(
+                optimization_settings,
+                pipeline_settings);
+
+        pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder));
+    }
+
+    ASTPtr ast_stub = make_intrusive<ASTSelectQuery>(); /// FIXME: this is only used to populate query_kind
+    UInt64 query_plan_hash = sipHash64(task_description.serialized_query_plan); /// Hash of the serialized fragment, passed to logQueryStart.
+
+    auto query_log_elem = logQueryStart(
+        std::chrono::system_clock::now(),
+        context,
+        /*query_for_logging*/ task.task_id,
+        query_plan_hash,
+        ast_stub, pipeline,
+        /*interpreter*/ nullptr,
+        /*internal*/ false,
+        /*database*/ "",
+        /*table*/ "",
+        /*async_insert*/ false);
+
+    try
+    {
+        LOG_TEST(logger, "Executing task '{}', pipeline:\n{}",
+            task.task_id,
+            [&pipeline]() -> String
+            {
+                WriteBufferFromOwnString out;
+                printPipeline(pipeline.getProcessors(), out);
+                return out.str();
+            }());
+
+        if (!pipeline.completed())
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Task pipeline must be completed");
+
+        pipeline.setProcessListElement(context->getProcessListElement());
+
+        pipeline.setProgressCallback(progress_callback);
+
+        CompletedPipelineExecutor executor(pipeline);
+        if (is_cancelled) /// An empty std::function means the caller supplied no cancellation check.
+            executor.setCancelCallback(is_cancelled, 100);
+        executor.execute();
+
+        logQueryFinish(query_log_elem, context, ast_stub, std::move(pipeline), false,
+            query_span, QueryResultCacheUsage::None, false);
+    }
+    catch (...)
+    {
+        logQueryException(query_log_elem, context, execute_task_watch, ast_stub, query_span, false, true);
+        throw; /// The failure is logged above and then propagated to the caller unchanged.
+    }
+}
+
+/// Returns the storage path for a query's temporary files (which also serves as the exchange query key).
+/// Deliberately independent of object-storage creation so that it is cheap to recompute, e.g. for cleanup.
+static String getTemporaryFilesPath(const String & unique_temp_file_path, ContextPtr context)
+{
+    String storage_section = "distributed_query.temporary_files_storage";
+    const auto & server_config = context->getConfigRef();
+    if (!server_config.has(storage_section))
+        return unique_temp_file_path;
+    return server_config.getString(storage_section + ".endpoint_subpath") + unique_temp_file_path;
+}
+
+std::pair<ObjectStoragePtr, String> getObjectStorageForTemporaryFiles(const String & unique_temp_file_path, ContextPtr context)
+{
+    /// The path is computed in both cases; the storage itself is created only when the config section is present.
+    String storage_section = "distributed_query.temporary_files_storage";
+    const auto & server_config = context->getConfigRef();
+    String temp_files_path = getTemporaryFilesPath(unique_temp_file_path, context);
+
+    ObjectStoragePtr temp_files_storage; /// Stays null when no storage is configured.
+    if (server_config.has(storage_section))
+        temp_files_storage = ObjectStorageFactory::instance().create("distributed_query_temp_files", server_config, storage_section, context, false);
+    return {temp_files_storage, temp_files_path};
+}
+
+static void executeTask(const UUID & unique_query_id, const DistributedQueryTaskDescription & task, ContextPtr context, std::shared_ptr<std::atomic<bool>> is_cancelled)
+{
+    const String query_id_string = toString(unique_query_id);
+    auto [temp_files_storage, temp_files_path] = getObjectStorageForTemporaryFiles(query_id_string, context);
+
+    /// Each task runs as an independent query fragment with its own query context and thread group, like on the worker
+    /// path. Attaching a task-specific context (not the initiator's) gives it its own per-query state, e.g. the runtime filter lookup.
+    auto task_context = Context::createCopy(context);
+    task_context->makeQueryContext();
+    auto query_scope = QueryScope::create(task_context);
+    setThreadName(ThreadName::DISTRIBUTED_QUERY_TASK);
+    auto cancellation_check = [is_cancelled]() -> bool { return *is_cancelled; };
+    doExecuteTask(task, temp_files_storage, temp_files_path, query_id_string, std::move(task_context), std::move(cancellation_check));
+}
+
+/// Runs tasks in local threads. Useful for testing and debugging.
+class DistributedQueryPlanExecutorLocal final : public DistributedQueryPlanExecutor
+{
+public:
+    /// The base executor receives a context copy prepared by makeContextForLocalExecution, not `context_` itself.
+    DistributedQueryPlanExecutorLocal(const UUID & unique_query_id_, const DistributedQueryPlan & distributed_query_plan_, ContextPtr context_, std::shared_ptr<std::atomic<bool>> is_cancelled_)
+        : DistributedQueryPlanExecutor(unique_query_id_, distributed_query_plan_, makeContextForLocalExecution(context_), std::move(is_cancelled_))
+    {}
+
+    void cleanup() override
+    {
+        /// In-memory exchanges must be cancelled first: otherwise a task blocked in InMemoryExchange::getChunk would
+        /// never return and the waits below would hang. They are only cancelled, not removed, because the result
+        /// reader still drains final_result after the driver finishes; removal happens when the query pipeline is
+        /// destroyed (see makeInMemoryExchangesCleaner).
+        InMemoryExchanges::instance()->cancelQuery(toString(unique_query_id));
+
+        for (auto & [_, futures] : stage_tasks)
+            for (auto & task_future : futures)
+                if (task_future.valid())
+                    task_future.wait();
+        stage_tasks.clear();
+    }
+
+protected:
+    static ContextPtr makeContextForLocalExecution(ContextPtr ctx)
+    {
+        /// Tasks run local plan fragments, which must not be turned into distributed plans again.
+        auto local_context = Context::createCopy(ctx);
+        local_context->setSetting("make_distributed_plan", false);
+        return local_context;
+    }
+
+    std::future<void> startTask(const DistributedQueryTaskDescription & task_description)
+    {
+        std::promise<void> completion;
+        std::future<void> completion_future = completion.get_future();
+
+        /// The task attaches its own query context and thread group inside executeTask (matching the worker path),
+        /// so the thread is intentionally left detached from the initiator group.
+        auto run_task = [promise = std::move(completion), query_id = unique_query_id, task_description, ctx = context, is_cancelled = this->is_cancelled]() mutable
+        {
+            ThreadStatus thread_status;
+            try
+            {
+                executeTask(query_id, task_description, ctx, is_cancelled);
+                promise.set_value();
+            }
+            catch (...)
+            {
+                promise.set_exception(std::current_exception());
+            }
+        };
+        std::thread(std::move(run_task)).detach();
+
+        return completion_future;
+    }
+
+    void startStage(const String & stage_name, const DistributedQueryStage & stage) override
+    {
+        /// Everything except `task` is the same for all tasks of the stage, so the description is filled once and reused.
+        DistributedQueryTaskDescription description;
+        description.serialized_query_plan = serializeQueryPlan(stage.query_plan_fragment);
+        description.exchanges = distributed_query_plan.exchange_descriptions; /// TODO: add only exchanges for this stage
+
+        VectorWithMemoryTracking<std::shared_future<void>> task_futures;
+        task_futures.reserve(stage.tasks.size());
+        for (const auto & stage_task : stage.tasks)
+        {
+            description.task = stage_task;
+            task_futures.emplace_back(startTask(description).share());
+        }
+        stage_tasks[stage_name] = std::move(task_futures);
+    }
+
+    bool waitForStage(const String & stage_name, std::optional<UInt64> timeout_ms) override
+    {
+        auto & pending = stage_tasks[stage_name];
+
+        for (auto & task_future : pending)
+        {
+            if (!timeout_ms.has_value())
+            {
+                task_future.wait();
+                continue;
+            }
+
+            const auto wait_status = task_future.wait_for(std::chrono::milliseconds(timeout_ms.value()));
+            if (wait_status != std::future_status::ready)
+            {
+                /// The stage is still running: surface a failure of any other stage, so that a consumer
+                /// blocked on an exchange does not wait forever for a dead producer.
+                rethrowFailedTasks();
+                return false;
+            }
+        }
+
+        /// Take the futures out of the map before `get`, so the entry is already empty if a task failure is rethrown.
+        auto finished = std::move(pending);
+        pending.clear();
+
+        /// `get` rethrows the exception of a failed task.
+        for (auto & task_future : finished)
+            task_future.get();
+        return true;
+    }
+
+private:
+    /// Non-blocking check over all stages: rethrows the exception of the first already-finished task that failed.
+    void rethrowFailedTasks()
+    {
+        for (auto & [_, futures] : stage_tasks)
+            for (auto & task_future : futures)
+                if (task_future.valid() && task_future.wait_for(std::chrono::seconds::zero()) == std::future_status::ready)
+                    task_future.get();
+    }
+
+    UnorderedMapWithMemoryTracking<String, VectorWithMemoryTracking<std::shared_future<void>>> stage_tasks;
+};
+
+
+TaskToHostMap::TaskToHostMap(const DistributedQueryPlan & distributed_query_plan_, ContextPtr context_)
+{
+    fillHostnames(context_); /// Must run first: task assignment indexes into the collected hostnames.
+    assignHostsForTasks(distributed_query_plan_);
+}
+
+/// Collects worker hostnames from the `stateless_worker_client` config: the replicas of the single shard of
+/// `cluster` when it is set, otherwise `host`. Throws SUPPORT_IS_DISABLED when nothing usable is configured.
+void TaskToHostMap::fillHostnames(ContextPtr context)
+{
+    if (!context->getConfigRef().getBool("stateless_worker_client.enabled", false))
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Stateless worker client is not enabled in configuration");
+
+    String cluster_name = context->getConfigRef().getString("stateless_worker_client.cluster", "");
+    if (!cluster_name.empty())
+    {
+        auto cluster = context->tryGetCluster(cluster_name);
+        if (!cluster)
+            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Cluster '{}' not found", cluster_name);
+
+        auto shard_addresses = cluster->getShardsAddresses();
+        if (shard_addresses.empty())
+            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Cluster '{}' has no shards", cluster_name);
+        /// Only a single-shard worker cluster is supported for now.
+        if (shard_addresses.size() > 1)
+            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
+                "Stateless worker cluster '{}' must have a single shard, got {}", cluster_name, shard_addresses.size());
+        for (const auto & replica : shard_addresses[0])
+            hostnames.push_back(replica.host_name);
+    }
+    else if (String host = context->getConfigRef().getString("stateless_worker_client.host", ""); !host.empty())
+    {
+        /// The empty default lets a missing `host` key reach the explicit error below instead of a config "not found" exception.
+        hostnames.push_back(std::move(host));
+    }
+
+    if (hostnames.empty())
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "No hosts specified for stateless worker client");
+}
+
+/// Assigns hosts to tasks round-robin, in stage iteration order, and records the same host as the source of every
+/// output exchange stream of the task.
+void TaskToHostMap::assignHostsForTasks(const DistributedQueryPlan & distributed_query_plan)
+{
+    size_t assigned_count = 0;
+    for (const auto & [stage_id, stage] : distributed_query_plan.stages)
+        for (const auto & task : stage.tasks)
+        {
+            /// The running task counter modulo the host count cycles through `hostnames`.
+            const auto & host = hostnames[assigned_count++ % hostnames.size()];
+            task_hosts[task.task_id] = host;
+            for (const auto & stream_id : task.output_exchange_streams)
+                exchange_stream_source_hosts[stream_id.toString()] = host;
+        }
+}
+
+
+/// Sends tasks to remote nodes.
+class DistributedQueryPlanExecutorRemote final : public DistributedQueryPlanExecutor
+{
+public:
+    DistributedQueryPlanExecutorRemote(
+        const UUID & unique_query_id_,
+        const DistributedQueryPlan & distributed_query_plan_,
+        TaskToHostMapPtr task_to_host_map_,
+        ContextPtr context_,
+        std::shared_ptr<std::atomic<bool>> is_cancelled_)
+        : DistributedQueryPlanExecutor(unique_query_id_, distributed_query_plan_, std::move(context_), std::move(is_cancelled_))
+        , task_to_host_map(std::move(task_to_host_map_))
+        , running_tasks(8, context, is_cancelled, logger)
+    {
+        /// `context_` / `is_cancelled_` were passed on with std::move above, so `running_tasks` uses the un-suffixed members; 8 is its in-flight limit.
+        LOG_DEBUG(logger, "Hosts for running distributed query: [{}]", fmt::join(task_to_host_map->getHostnames(), ", "));
+    }
+
+    void cleanup() override
+    {
+        running_tasks.cancel();
+#ifdef OS_LINUX
+        /// Drop the exchange connection slots of this query that are still pending (the peer was
+        /// cancelled or never arrived). Without this they would leak FutureConnection/eventfd
+        /// entries in the ExchangeConnections singleton for the lifetime of the server.
+        /// This runs after the tasks have been asked to cancel above.
+        ExchangeConnections::instance()->cleanupQuery(toString(unique_query_id));
+#endif
+    }
+
+protected:
+    struct RunningTaskInfo
+    {
+        String endpoint_uri; /// Worker endpoint URI that requests for this task are sent to (see buildTaskInfo).
+        String task_id;      /// Task id prefixed with the query id: "<unique_query_id>::<task_id>" (see buildTaskInfo).
+    };
+
+    /// Tracks the statuses of running tasks in parallel, collects progress counters.
+    class TaskTracker
+    {
+    public:
+        TaskTracker(Int64 max_in_flight_requests_, ContextPtr context_, std::shared_ptr<std::atomic<bool>> is_cancelled_, LoggerPtr logger_)
+            : context(std::move(context_))
+            , query_status(context->getProcessListElement()) /// Reads the member `context`, which is declared and initialized first.
+            , max_in_flight_requests(max_in_flight_requests_)
+            , is_cancelled(std::move(is_cancelled_))
+            , thread_pool(CurrentMetrics::TaskTrackerThreads, CurrentMetrics::TaskTrackerThreadsActive, CurrentMetrics::TaskTrackerThreadsScheduled,
+                max_in_flight_requests, max_in_flight_requests, 2 * max_in_flight_requests) /// NOTE(review): presumably max threads, max free threads, queue size - confirm.
+            , logger(std::move(logger_))
+        {}
+
+        ~TaskTracker()
+        {
+            thread_pool.wait(); /// Scheduled status checks capture `this`, so they must finish before the members are destroyed.
+        }
+
+        /// Registers a started task for tracking and queues it for status polling.
+        void addTask(const String & stage_name, RunningTaskInfo task_info)
+        {
+            const String task_name = task_info.task_id;
+
+            {
+                std::lock_guard g(lock);
+                stage_tasks[stage_name][task_name] = std::move(task_info);
+                /// The first task of a stage also creates the stage's bookkeeping entry.
+                auto stage_it = all_stages.find(stage_name);
+                if (stage_it == all_stages.end())
+                    stage_it = all_stages.emplace(stage_name, std::make_shared<StageInfo>(stage_name)).first;
+                ++stage_it->second->started_tasks;
+            }
+
+            addTaskToCheckQueue(stage_name, task_name);
+            enqueueGetStatus();
+        }
+
+        /// Waits for all tasks of the stage to finish. Returns false only if `timeout_ms` elapsed first; see checkCancelled for what is thrown.
+        bool waitForStage(const String & stage_name, std::optional<UInt64> timeout_ms)
+        {
+            LOG_DEBUG(logger, "Waiting for stage {} to finish", stage_name);
+
+            std::shared_future<void> finished;
+            {
+                std::lock_guard g(lock);
+
+                auto & stage = all_stages.at(stage_name); /// Entries are created by addTask.
+
+                /// Is already finished?
+                if (stage->started_tasks == stage->finished_tasks)
+                    return true;
+
+                /// Create the stage's future once and keep it for repeated waits; setTaskFinished signals it for the last task
+                if (!stage_results.contains(stage_name))
+                    stage_results[stage_name] = stage->promise.get_future();
+
+                finished = stage_results.at(stage_name);
+            }
+
+            bool stage_finished = false;
+            if (timeout_ms.has_value())
+            {
+                stage_finished = (finished.wait_for(std::chrono::milliseconds(timeout_ms.value())) == std::future_status::ready);
+            }
+            else
+            {
+                /// Poll with timeout instead of blocking indefinitely, so that
+                /// checkCancelled can detect worker failures stored in first_exception.
+                while (finished.wait_for(std::chrono::milliseconds(100)) != std::future_status::ready)
+                    checkCancelled();
+                stage_finished = true;
+            }
+            checkCancelled(); /// Also runs after a successful wait, so a recorded failure or cancellation still throws.
+            return stage_finished;
+        }
+
+        /// Cancels all unfinished tasks. The task infos are collected and the queues are cleared under
+        /// `lock`; the cancel/status/forget requests are then sent without holding it, so that
+        /// `checkStatusFunc` threads can observe `is_cancelled` and exit.
+        void cancel()
+        {
+            VectorWithMemoryTracking<RunningTaskInfo> tasks_to_cancel;
+            {
+                std::lock_guard g(lock);
+                for (auto & [stage_name, started_tasks] : stage_tasks)
+                {
+                    for (auto & [task_name, task_info] : started_tasks)
+                        tasks_to_cancel.push_back(task_info);
+                    started_tasks.clear(); /// From now on `enqueueGetStatus` finds no metadata for these tasks and skips them.
+                }
+                /// Clear queues so that `enqueueGetStatus` doesn't pick up
+                /// stale task names after `stage_tasks` has been emptied.
+                stages_to_check.clear();
+                for (auto & [_, stage] : all_stages)
+                    stage->tasks_to_check.clear();
+            }
+
+            for (auto & task : tasks_to_cancel)
+            {
+                LOG_TRACE(logger, "Cancelling task {} on host {}", task.task_id, task.endpoint_uri);
+                try
+                {
+                    cancelTask(task.endpoint_uri, task.task_id, context);
+                }
+                catch (...)
+                {
+                    tryLogCurrentException(__PRETTY_FUNCTION__);
+                }
+            }
+
+            /// `forget` drops the worker's only handle to a task, so forgetting one that is still
+            /// running would orphan a fragment that keeps writing exchange and temporary outputs.
+            /// `cancel` only requests termination, so wait for the terminal state before forgetting,
+            /// and leave a task that does not settle for the worker to reclaim on shutdown.
+            for (auto & task : tasks_to_cancel)
+            {
+                if (waitForTaskTerminal(task))
+                    tryForgetTask(task);
+                else
+                    LOG_WARNING(logger, "Task {} on {} did not reach a terminal state after cancellation; "
+                        "leaving it for the worker to reclaim", task.task_id, task.endpoint_uri);
+            }
+        }
+
+        /// For a task whose start request failed and which is therefore not tracked: requests cancellation and forgets
+        /// the task only once the worker reports it terminal, so a start that partially reached the worker is not orphaned.
+        void cancelAndForgetUntracked(const RunningTaskInfo & task)
+        {
+            try
+            {
+                cancelTask(task.endpoint_uri, task.task_id, context);
+            }
+            catch (...)
+            {
+                tryLogCurrentException(__PRETTY_FUNCTION__);
+            }
+
+            if (!waitForTaskTerminal(task))
+                LOG_WARNING(logger, "Task {} on {} did not reach a terminal state after a failed start; "
+                    "leaving it for the worker to reclaim", task.task_id, task.endpoint_uri);
+            else
+                tryForgetTask(task);
+        }
+
+    private:
+        void checkCancelled()
+        {
+            /// Order matters: the time limit first, then a recorded failure, then plain cancellation. A failed status
+            /// check also sets `is_cancelled`, so its exception has to win over the generic cancellation message.
+            if (query_status)
+                query_status->checkTimeLimit();
+
+            /// A failure recorded by a status-check job.
+            if (std::lock_guard exception_lock(lock); first_exception)
+                std::rethrow_exception(first_exception);
+
+            if (is_cancelled->load())
+                throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled");
+        }
+
+        /// Thread function that polls one task once. A task that is still running goes back to the check queue;
+        /// a terminal task is forgotten on the worker and then either marked finished or reported as failed.
+        void checkStatusFunc(const String & stage_name, const RunningTaskInfo & task)
+        {
+            checkCancelled();
+
+            constexpr UInt32 wait_milliseconds = 300;
+
+            auto task_status = getTaskStatus(task.endpoint_uri, task.task_id, wait_milliseconds, context);
+
+            /// Forward the progress reported for the task, if the context has a callback for it.
+            if (auto progress_callback = context->getProgressCallback())
+                progress_callback(task_status.progress);
+
+            if (task_status.status == "Running")
+            {
+                /// Add the task back to the end of the queue
+                addTaskToCheckQueue(stage_name, task.task_id);
+                return;
+            }
+
+            /// Terminal state reached: release the worker-side bookkeeping (TaskState/progress/future). Best-effort.
+            tryForgetTask(task);
+
+            if (task_status.status != "Finished")
+                throw Exception(ErrorCodes::RECEIVED_ERROR_FROM_REMOTE_IO_SERVER,
+                    "Task {} did not finish successfully (status: {}): {}",
+                    task.task_id, task_status.status, task_status.error_message);
+
+            /// Only a successfully finished task counts towards the completion of its stage.
+            setTaskFinished(stage_name, task.task_id);
+        }
+
+        void tryForgetTask(const RunningTaskInfo & task) noexcept
+        {
+            try
+            {
+                forgetTask(task.endpoint_uri, task.task_id, context); /// Best-effort: a failure is only logged below.
+            }
+            catch (...)
+            {
+                tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("forgetTask {} on {}", task.task_id, task.endpoint_uri));
+            }
+        }
+
+        /// Polls the worker a bounded number of times until the task is no longer "Running".
+        /// Returns true once any other status is reported; returns false if the poll budget is exhausted
+        /// or a status request throws (the error is logged).
+        bool waitForTaskTerminal(const RunningTaskInfo & task) noexcept
+        {
+            constexpr UInt32 poll_wait_ms = 300;
+            constexpr size_t max_polls = 10;
+            size_t polls_left = max_polls;
+            while (polls_left-- > 0)
+            {
+                try
+                {
+                    if (getTaskStatus(task.endpoint_uri, task.task_id, poll_wait_ms, context, /*for_cleanup*/ true).status != "Running")
+                        return true;
+                }
+                catch (...)
+                {
+                    tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("waitForTaskTerminal {} on {}", task.task_id, task.endpoint_uri));
+                    return false;
+                }
+            }
+            return false;
+        }
+
+        void addTaskToCheckQueue(const String & stage_name, const String & task_name)
+        {
+            std::lock_guard g(lock);
+            StageInfoPtr stage_info = all_stages[stage_name];
+            const bool was_idle = stage_info->tasks_to_check.empty();
+            stage_info->tasks_to_check.push_back(task_name);
+
+            /// A stage that had no tasks to check before this one is not in the queue of stages to check,
+            /// so it has to be added to that queue now.
+            if (was_idle)
+                stages_to_check.push_back(stage_info);
+        }
+
+        void setTaskFinished(const String & stage_name, const String & task_name)
+        {
+            std::lock_guard g(lock);
+            auto & stage_info = all_stages[stage_name];
+            ++stage_info->finished_tasks;
+
+            /// Signal only if a future was requested for the stage, and at most once: a status-poll race can make the
+            /// counts match again after the stage finished, and setting the promise twice throws.
+            if (stage_results.contains(stage_name) && !stage_info->promise_signaled && stage_info->finished_tasks == stage_info->started_tasks)
+            {
+                stage_info->promise_signaled = true;
+                stage_info->promise.set_value();
+            }
+
+            stage_tasks[stage_name].erase(task_name); // TODO: really need to erase?
+        }
+
+        /// Pops the next (stage name, task name) pair to check, or nothing if no task could be taken. The queue holds
+        /// stages and every stage holds its own task queue, so tasks of all stages are polled in turn, in parallel.
+        std::optional<std::pair<String, String>> getNextTaskToCheck() TSA_REQUIRES(lock)
+        {
+            if (stages_to_check.empty())
+                return std::nullopt;
+
+            StageInfoPtr stage_info = std::move(stages_to_check.front());
+            stages_to_check.pop_front();
+
+            auto & pending = stage_info->tasks_to_check;
+            if (pending.empty())    /// TODO: should not happen, but let's be safe
+                return std::nullopt;
+
+            String task_name = std::move(pending.front());
+            pending.pop_front();
+
+            /// A stage that still has tasks to check goes back to the end of the queue
+            if (!pending.empty())
+                stages_to_check.push_back(stage_info);
+
+            return std::make_pair(stage_info->name, std::move(task_name));
+        }
+
+        /// Schedules a status check for the next queued task, unless the in-flight limit is reached or nothing is queued.
+        void enqueueGetStatus()
+        {
+            std::lock_guard g(lock);
+
+            if (in_flight_request_count >= max_in_flight_requests)
+                return;
+
+            /// Choose next task to check
+            auto task = getNextTaskToCheck();
+            if (!task)
+                return;
+
+            /// Look up task metadata; skip if absent (task was cancelled/removed).
+            auto stage_it = stage_tasks.find(task->first);
+            if (stage_it == stage_tasks.end())
+                return;
+            auto task_it = stage_it->second.find(task->second);
+            if (task_it == stage_it->second.end())
+                return;
+            auto task_info = task_it->second;
+
+            thread_pool.scheduleOrThrow([this, task, task_info]()
+                {
+                    try
+                    {
+                        checkStatusFunc(task->first, task_info);
+                    }
+                    catch (...)
+                    {
+                        tryLogCurrentException(__PRETTY_FUNCTION__);
+                        {
+                            std::lock_guard exception_lock(lock);
+                            if (!first_exception)
+                                first_exception = std::current_exception();
+                        }
+                        *is_cancelled = true;
+                    }
+                    /// Decrement the in-flight counter before scheduling the next check so the next `enqueueGetStatus`
+                    /// is not gated by an already-finished slot. Otherwise, when the counter sits at the limit, every
+                    /// re-enqueue inside `checkStatusFunc` sees a full pipeline, all in-flight checks then decrement to
+                    /// zero, and no further check is ever scheduled.
+                    --in_flight_request_count;
+                    enqueueGetStatus();
+                });
+            ++in_flight_request_count; /// The job takes `lock` (held here) on every path before its decrement, so this increment comes first.
+        }
+
+        struct StageInfo
+        {
+            const String name; /// Stage name; returned by getNextTaskToCheck together with the task name.
+            /// Queue of the tasks from this stage that are not finished and are not being checked at the moment
+            DequeWithMemoryTracking<String> tasks_to_check;
+            Int64 started_tasks = 0; /// Incremented by addTask.
+            Int64 finished_tasks = 0; /// Incremented by setTaskFinished; the stage is done when it equals `started_tasks`.
+            std::promise<void> promise; /// Its future is handed out by waitForStage; fulfilled by setTaskFinished.
+            bool promise_signaled = false; /// Guards against fulfilling `promise` twice.
+        };
+
+        using StageInfoPtr = std::shared_ptr<StageInfo>;
+
+        ContextPtr context;
+        QueryStatusPtr query_status;
+
+        const Int64 max_in_flight_requests;
+
+        std::mutex lock;
+        UnorderedMapWithMemoryTracking<String, StageInfoPtr> all_stages TSA_GUARDED_BY(lock);
+        UnorderedMapWithMemoryTracking<String, MapWithMemoryTracking<String, RunningTaskInfo>> stage_tasks TSA_GUARDED_BY(lock);
+        std::atomic<Int64> in_flight_request_count = 0;
+        /// Queue of stages that have unfinished tasks to be checked
+        DequeWithMemoryTracking<StageInfoPtr> stages_to_check TSA_GUARDED_BY(lock);
+        UnorderedMapWithMemoryTracking<String, std::shared_future<void>> stage_results TSA_GUARDED_BY(lock);
+        std::shared_ptr<std::atomic<bool>> is_cancelled;
+        std::exception_ptr first_exception TSA_GUARDED_BY(lock);
+        ThreadPool thread_pool;
+        LoggerPtr logger;
+    };
+
+    /// Builds the worker endpoint URI and the query-prefixed task id ("<unique_query_id>::<task_id>") for a task.
+    RunningTaskInfo buildTaskInfo(const DistributedQueryTaskDescription & task_description) const
+    {
+        const auto & config = context->getConfigRef();
+        const String host = task_to_host_map->getTaskHosts().at(task_description.task.task_id);
+
+        auto default_port = context->getInterserverIOAddress().second;
+        auto port = config.getUInt("stateless_worker_client.port", default_port);
+        if (port == 0 || port > 65535)
+            throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER,
+                "`stateless_worker_client.port` must be in range 1..65535, got {}", port);
+
+        String default_endpoint = config.getString("stateless_worker_server.endpoint", "localhost");
+        auto endpoint = config.getString("stateless_worker_client.endpoint", "stateless_worker/" + default_endpoint);
+
+        Poco::URI worker_uri;
+        /// Match the interserver scheme so a server with interserver_https_port is not sent plaintext.
+        worker_uri.setScheme(context->getInterserverScheme());
+        worker_uri.setHost(host);
+        worker_uri.setPort(static_cast<UInt16>(port));
+        worker_uri.addQueryParameter("endpoint", endpoint);
+
+        return {worker_uri.toString(), toString(unique_query_id) + "::" + task_description.task.task_id};
+    }
+
+    /// Sends every task of `stage` to its assigned worker, then registers it in `running_tasks`.
+    void startStage(const String & stage_name, const DistributedQueryStage & stage) override
+    {
+        DistributedQueryTaskDescription task_description;
+        task_description.initial_query_id = context->getCurrentQueryId();
+        task_description.serialized_query_plan = serializeQueryPlan(stage.query_plan_fragment);
+        task_description.exchanges = distributed_query_plan.exchange_descriptions; /// TODO: add only exchanges for this stage
+        task_description.settings_changes = context->getSettingsRef().changes();
+
+        const String unique_temp_file_path = toString(unique_query_id);
+
+        for (const auto & task : stage.tasks)
+        {
+            checkCancelled(); /// Stop dispatching further tasks once the query is cancelled or over its time limit.
+            task_description.task = task;
+
+            /// Record the source host of each input exchange stream of this task.
+            task_description.exchange_stream_sources = {};
+            for (const auto & input_stream : task.input_exchange_streams)
+            {
+                String input_stream_name = input_stream.toString();
+                task_description.exchange_stream_sources.stream_hosts[input_stream_name] = task_to_host_map->getExchangeStreamSourceHosts().at(input_stream_name);
+            }
+
+            /// Send the task before registering it: status polling does not tolerate
+            /// UnknownTaskId, so a tracker poll racing the start would abort the query.
+            /// On send failure clean up directly in case the worker did accept the start;
+            /// cancel/forget tolerate UnknownTaskId.
+            auto task_info = buildTaskInfo(task_description);
+            LOG_DEBUG(logger, "Sending task {} to {}", task_info.task_id, task_info.endpoint_uri);
+            try
+            {
+                sendTask(task_info.endpoint_uri, task_info.task_id, task_description, unique_temp_file_path, context);
+            }
+            catch (...)
+            {
+                running_tasks.cancelAndForgetUntracked(task_info);
+                throw;
+            }
+            running_tasks.addTask(stage_name, task_info);
+        }
+    }
+
+    bool waitForStage(const String & stage_name, std::optional<UInt64> timeout_ms) override
+    {
+        return running_tasks.waitForStage(stage_name, timeout_ms); /// Delegates to `running_tasks`, where `startStage` registers the tasks.
+    }
+
+    TaskToHostMapPtr task_to_host_map;
+    TaskTracker running_tasks;
+};
+
+
+DistributedQueryPlanExecutor::DistributedQueryPlanExecutor(const UUID & unique_query_id_, const DistributedQueryPlan & distributed_query_plan_, ContextPtr context_, std::shared_ptr<std::atomic<bool>> is_cancelled_)
+    : unique_query_id(unique_query_id_)
+    , distributed_query_plan(distributed_query_plan_) /// Stored as a reference: the plan must outlive the executor.
+    , context(std::move(context_))
+    , query_status(context->getProcessListElement()) /// Reads the member `context`; the parameter `context_` is already moved-from.
+    , is_cancelled(std::move(is_cancelled_))
+    , logger(getLogger("DistributedQueryPlanExecutor"))
+{
+}
+
+void DistributedQueryPlanExecutor::checkCancelled() const
+{
+    if (query_status) /// Taken from `context->getProcessListElement()`; the time limit check is skipped when it is null.
+        query_status->checkTimeLimit();
+    /// The flag is shared with the creator of the executor (it is passed to the constructor as a `shared_ptr`).
+    if (*is_cancelled)
+        throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled");
+}
+
+void DistributedQueryPlanExecutor::startStageWithDependencies(const String & stage_name, UnorderedSetWithMemoryTracking<String> & executed_stages)
+{
+    /// A stage referenced by several consumers is still started only once.
+    if (executed_stages.contains(stage_name))
+        return;
+
+    /// Producers of this stage are handled according to the kind of exchange they feed:
+    ///  - a persisted producer runs to completion before the consumer reads its output, so it is
+    ///    started and waited for up front;
+    ///  - a streaming producer runs concurrently with its consumer and blocks until the consumer
+    ///    connects, so it is started only after this stage. Starting it first can fill the worker
+    ///    pool with producers waiting for consumers that have not run yet.
+    Strings persisted_producers;
+    Strings streaming_producers;
+    if (distributed_query_plan.stage_depends_on.contains(stage_name))
+    {
+        for (const auto & [producer, exchange_id] : distributed_query_plan.stage_depends_on.at(stage_name))
+        {
+            const bool is_persisted = distributed_query_plan.exchange_descriptions.at(exchange_id).kind == ExchangeDescription::Kind::Persisted;
+            if (!is_persisted)
+            {
+                streaming_producers.push_back(producer);
+                continue;
+            }
+            startStageWithDependencies(producer, executed_stages);
+            persisted_producers.push_back(producer);
+        }
+    }
+
+    for (const auto & producer : persisted_producers)
+        waitForStage(producer, std::nullopt);
+
+    const auto & stage = distributed_query_plan.stages.at(stage_name);
+    LOG_DEBUG(logger,
+        "\n====================== Executing stage '{}' =========================\n"
+        "PLAN:\n{}\nTASKS: {}\n"
+        "==========================================================================",
+        stage_name, dumpQueryPlan(stage.query_plan_fragment), stage.tasks.size());
+    startStage(stage_name, stage);
+    executed_stages.insert(stage_name);
+
+    /// The consumer is running now and can accept connections, so start its streaming producers.
+    for (const auto & producer : streaming_producers)
+        startStageWithDependencies(producer, executed_stages);
+}
+
+/// Starts every stage of the plan in dependency order and queues all stages for `execute`.
+void DistributedQueryPlanExecutor::start()
+{
+    LOG_DEBUG(logger, "Starting distributed query, unique id: {}", toString(unique_query_id));
+    /// Start from the root stages (those no other stage depends on) so the recursion can start each
+    /// consumer before its streaming producers. Entering at an arbitrary stage could start a producer
+    /// before its consumer is running.
+    {
+        UnorderedSetWithMemoryTracking<String> depended_on_stages;
+        for (const auto & [_, dependencies] : distributed_query_plan.stage_depends_on)
+            for (const auto & [dependency, exchange_id] : dependencies)
+                depended_on_stages.insert(dependency);
+
+        UnorderedSetWithMemoryTracking<String> executed_stages;
+        for (const auto & [stage_name, _] : distributed_query_plan.stages)
+            if (!depended_on_stages.contains(stage_name))
+                startStageWithDependencies(stage_name, executed_stages);
+
+        /// Start anything not reachable from a root (a disconnected stage should not occur, but if it
+        /// does it must still run).
+        for (const auto & [stage_name, _] : distributed_query_plan.stages)
+            startStageWithDependencies(stage_name, executed_stages);
+    }
+
+    /// Queue all stages for `execute`, which waits for them one at a time in this order.
+    for (const auto & [stage_name, _] : distributed_query_plan.stages)
+        running_stages.push_back(stage_name);
+}
+
+bool DistributedQueryPlanExecutor::execute()
+{
+    if (running_stages.empty())
+        return true;
+
+    /// Wait on the front stage only, with a 100 ms timeout; the caller is expected to call again.
+    const auto & current_stage = running_stages.front();
+    if (waitForStage(current_stage, 100))
+    {
+        /// Log before `pop_front`, which destroys the element `current_stage` refers to.
+        LOG_DEBUG(logger, "Stage '{}' finished", current_stage);
+        running_stages.pop_front();
+    }
+    return false; /// Completion is reported by a later call, once the queue is empty.
+}
+
+std::unique_ptr<DistributedQueryPlanExecutor> createDistributedQueryExecutor(
+    const UUID & unique_query_id,
+    const DistributedQueryPlan & distributed_query_plan,
+    TaskToHostMapPtr task_to_host_map,
+    ContextPtr context,
+    std::shared_ptr<std::atomic<bool>> is_cancelled)
+{
+    /// With `distributed_plan_execute_locally` set, the local executor is used; it is constructed
+    /// without the task-to-host map.
+    if (context->getSettingsRef()[Setting::distributed_plan_execute_locally])
+        return std::make_unique<DistributedQueryPlanExecutorLocal>(unique_query_id, distributed_query_plan, context, is_cancelled);
+
+    /// Otherwise the remote executor is used; it sends each task to the host that
+    /// `task_to_host_map` assigns to it.
+    return std::make_unique<DistributedQueryPlanExecutorRemote>(unique_query_id, distributed_query_plan, task_to_host_map, context, is_cancelled);
+}
+
+}
diff --git a/src/QueryPipeline/DistributedPlanExecutor.h b/src/QueryPipeline/DistributedPlanExecutor.h
new file mode 100644
index 000000000000..d39cfd2f4d26
--- /dev/null
+++ b/src/QueryPipeline/DistributedPlanExecutor.h
@@ -0,0 +1,138 @@
+#pragma once
+
+#include <Processors/Chunk.h>
+#include <Disks/DiskObjectStorage/ObjectStorages/IObjectStorage_fwd.h>
+#include <Interpreters/Context_fwd.h>
+#include <Processors/QueryPlan/QueryPlan.h>
+#include <IO/Progress.h>
+
+#include <Common/DequeWithMemoryTracking.h>
+#include <Common/SettingsChanges.h>
+#include <Common/UnorderedMapWithMemoryTracking.h>
+#include <Common/UnorderedSetWithMemoryTracking.h>
+
+namespace DB
+{
+
+/// Assignment of tasks and of exchange stream sources to hosts, built from a distributed query plan.
+class TaskToHostMap : public boost::noncopyable
+{
+public:
+    TaskToHostMap(const DistributedQueryPlan & distributed_query_plan_, ContextPtr context_);
+    const Strings & getHostnames() const { return hostnames; }
+    const UnorderedMapWithMemoryTracking<String, String> & getTaskHosts() const { return task_hosts; }
+    const UnorderedMapWithMemoryTracking<String, String> & getExchangeStreamSourceHosts() const { return exchange_stream_source_hosts; }
+
+private:
+    void fillHostnames(ContextPtr context);
+    void assignHostsForTasks(const DistributedQueryPlan & distributed_query_plan);
+
+    Strings hostnames;
+    UnorderedMapWithMemoryTracking<String, String> task_hosts; /// Task id -> host.
+    UnorderedMapWithMemoryTracking<String, String> exchange_stream_source_hosts; /// Exchange stream name -> source host.
+};
+
+using TaskToHostMapPtr = std::shared_ptr<const TaskToHostMap>;
+
+struct DistributedQueryPlan;
+
+class QueryStatus;
+using QueryStatusPtr = std::shared_ptr<QueryStatus>;
+
+/// Implements distributed query plan execution logic by executing stages according to dependencies between them.
+class DistributedQueryPlanExecutor
+{
+public:
+    virtual ~DistributedQueryPlanExecutor() = default;
+
+    void start(); /// Starts all stages in dependency order and queues them for `execute`.
+    bool execute(); /// Returns true if the execution is finished, false if it is still in progress and should be called again later.
+
+    virtual void cleanup() = 0;
+
+private:
+    void startStageWithDependencies(const String & stage_name, UnorderedSetWithMemoryTracking<String> & executed_stages); /// Recursive helper of `start`; `executed_stages` holds the stages already started.
+
+protected:
+    DistributedQueryPlanExecutor(const UUID & unique_query_id_, const DistributedQueryPlan & distributed_query_plan_, ContextPtr context_, std::shared_ptr<std::atomic<bool>> is_cancelled_);
+
+    virtual void startStage(const String & stage_name, const DistributedQueryStage & stage) = 0; /// Starts the given stage.
+    virtual bool waitForStage(const String & stage_name, std::optional<UInt64> timeout_ms) = 0; /// Returns true once the stage has finished. NOTE(review): `nullopt` presumably means no timeout - confirm.
+
+    void checkCancelled() const; /// Checks the query time limit, then throws QUERY_WAS_CANCELLED if `is_cancelled` is set.
+
+    const UUID unique_query_id;
+    const DistributedQueryPlan & distributed_query_plan; /// Not owned: the plan must outlive the executor.
+    ContextPtr context;
+    QueryStatusPtr query_status; /// Checked for null before use in `checkCancelled`.
+    std::shared_ptr<std::atomic<bool>> is_cancelled;
+    DequeWithMemoryTracking<String> running_stages; /// Filled by `start`, drained front to back by `execute`.
+    LoggerPtr logger;
+};
+
+std::unique_ptr<DistributedQueryPlanExecutor> createDistributedQueryExecutor(
+    const UUID & unique_query_id,
+    const DistributedQueryPlan & distributed_query_plan,
+    TaskToHostMapPtr task_to_host_map,
+    ContextPtr context,
+    std::shared_ptr<std::atomic<bool>> is_cancelled);
+
+/// Contains info about hosts assigned to exchange buckets
+struct ExchangeStreamSources
+{
+    /// Exchange stream name (the stream's `toString()` form) -> host that is the source of the stream
+    UnorderedMapWithMemoryTracking<String, String> stream_hosts;
+};
+
+/// Contains all info to send a task to remote worker
+struct DistributedQueryTaskDescription
+{
+    String initial_query_id; /// Current query id of the initiator at the time the stage is started.
+    DistributedQueryTask task; /// The task to execute.
+    String serialized_query_plan; /// Serialized plan fragment of the stage the task belongs to.
+    ExchangeDescriptions exchanges; /// Exchange descriptions of the plan (currently all of them, not only this stage's).
+    ExchangeStreamSources exchange_stream_sources; /// Source hosts of the task's input exchange streams.
+    /// The initiator's changed settings, applied on the worker so query limits and execution-affecting
+    /// settings (e.g. max_memory_usage) are honored remotely.
+    SettingsChanges settings_changes;
+};
+
+/// Executes a task locally. `distributed_query_id` is the node-independent identifier of the whole
+/// distributed query (the same value on every node); it keys the in-memory and streaming exchanges,
+/// while `object_storage_path` locates this node's persisted temporary files.
+void doExecuteTask(const DistributedQueryTaskDescription & task, ObjectStoragePtr object_storage,
+    const String & object_storage_path, const String & distributed_query_id, ContextMutablePtr context,
+    std::function<bool()> is_cancelled = nullptr, ProgressCallback progress_callback = nullptr);
+
+/// Returns object storage and path for temporary files
+std::pair<ObjectStoragePtr, String> getObjectStorageForTemporaryFiles(const String & unique_temp_file_path, ContextPtr context);
+
+struct ITemporaryFileLookup;
+using TemporaryFileLookupPtr = std::shared_ptr<ITemporaryFileLookup>;
+
+/// ITemporaryFileLookup that is used in buildQueryPipeline() to create readers and writers for temporary files by temporary file logical names
+TemporaryFileLookupPtr createTemporaryFilesLookup(ObjectStoragePtr object_storage_, const String & object_storage_path_,
+    const Strings & input_temporary_files_, const Strings & output_temporary_files_);
+
+struct IExchangeLookup;
+using ExchangeLookupPtr = std::shared_ptr<IExchangeLookup>;
+
+struct ExchangeDescription;
+
+ExchangeLookupPtr createExchangeLookup(
+    const String & query_id,
+    const ExchangeDescriptions & exchanges_,
+    const ExchangeStreamSources & exchange_stream_sources,
+    TemporaryFileLookupPtr temporary_files_,
+    ContextPtr context);
+
+class ICustomResourceHolder;
+
+/// Helper to clean temporary files after query execution
+std::shared_ptr<ICustomResourceHolder> makeTemporaryFilesCleaner(ObjectStoragePtr object_storage_, const String & object_storage_path_,
+    const Strings & temporary_files_);
+
+/// Helper to drop the query's in-memory exchanges once the query pipeline is destroyed.
+std::shared_ptr<ICustomResourceHolder> makeInMemoryExchangesCleaner(const String & query_id);
+
+}
diff --git a/src/Server/DistributedQuery/ExchangeConnections.cpp b/src/Server/DistributedQuery/ExchangeConnections.cpp
new file mode 100644
index 000000000000..794955c9ba7a
--- /dev/null
+++ b/src/Server/DistributedQuery/ExchangeConnections.cpp
@@ -0,0 +1,188 @@
+#ifdef OS_LINUX
+#include <mutex>
+#include <Server/DistributedQuery/ExchangeConnections.h>
+#include <Server/DistributedQuery/FutureConnection.h>
+#include <Common/Exception.h>
+#include <Common/logger_useful.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int QUERY_WAS_CANCELLED;
+}
+
+void ExchangeConnections::addConnection(const String & query_id, const String & exchange_stream_id, Poco::Net::StreamSocket socket)
+{
+    LOG_TRACE(log, "Adding connection for query id {} exchange stream {}", query_id, exchange_stream_id);
+
+    const ConnectionKey key{query_id, exchange_stream_id};
+    std::lock_guard guard(mutex);
+
+    /// A cleaned-up query can no longer consume the entry through `getConnection`, so a late
+    /// arrival is refused rather than stored and leaked.
+    if (cancelled_queries.contains(query_id))
+    {
+        LOG_TRACE(log, "Dropping late connection for cancelled query id {} exchange stream {}", query_id, exchange_stream_id);
+        socket.close();
+        return;
+    }
+
+    /// Same for a stream its owning task has already released: nothing would consume a new slot.
+    if (released_streams.contains(key))
+    {
+        LOG_TRACE(log, "Dropping connection for released query id {} exchange stream {}", query_id, exchange_stream_id);
+        socket.close();
+        return;
+    }
+
+    /// Finds the stream's slot, creating an empty one if the consumer has not asked for it yet.
+    ConnectionSlot & stream_slot = pending_connections.try_emplace(key).first->second;
+
+    /// One producer per stream: on a duplicate (reconnect or repeated `SourceHello`) the original
+    /// socket is kept and the new one is closed.
+    if (stream_slot.socket_delivered)
+    {
+        LOG_WARNING(log, "Dropping duplicate connection for query id {} exchange stream {}", query_id, exchange_stream_id);
+        socket.close();
+        return;
+    }
+
+    /// Hands the socket over: a waiting consumer is woken, otherwise the future stays ready for later.
+    stream_slot.future->setSocket(socket);
+    stream_slot.socket_delivered = true;
+}
+
+FutureConnectionPtr ExchangeConnections::getConnection(const String & query_id, const String & exchange_stream_id)
+{
+    LOG_TRACE(log, "Getting connection for query id {} exchange stream {}", query_id, exchange_stream_id);
+
+    /// Every refusal below hands back a future that is already cancelled, so the caller fails on
+    /// its next wait instead of blocking on a slot that nothing will ever satisfy.
+    const auto refuse_with = [](Exception reason)
+    {
+        auto refused = std::make_shared<FutureConnection>();
+        refused->cancel(std::make_exception_ptr(std::move(reason)));
+        return refused;
+    };
+
+    const ConnectionKey key{query_id, exchange_stream_id};
+
+    std::lock_guard guard(mutex);
+
+    /// The query was cleaned up: do not create a pending entry for it, because nothing would
+    /// satisfy or release that entry anymore.
+    if (cancelled_queries.contains(query_id))
+        return refuse_with(
+            Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Exchange connection cancelled, query id {}", query_id));
+
+    /// The owning task already released this stream: do not create a slot that nothing frees.
+    if (released_streams.contains(key))
+        return refuse_with(
+            Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Exchange stream already released, query id {} stream {}", query_id, exchange_stream_id));
+
+    /// Finds the stream's slot, creating an empty one if the producer has not connected yet.
+    ConnectionSlot & stream_slot = pending_connections.try_emplace(key).first->second;
+
+    /// One consumer per stream: a duplicate (e.g. a task started twice) must not receive the same
+    /// socket, so it is refused as well.
+    if (stream_slot.consumer_assigned)
+    {
+        LOG_WARNING(log, "Refusing duplicate consumer for query id {} exchange stream {}", query_id, exchange_stream_id);
+        return refuse_with(
+            Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Duplicate exchange consumer, query id {} stream {}", query_id, exchange_stream_id));
+    }
+
+    stream_slot.consumer_assigned = true;
+    /// Already ready if the producer connected first; otherwise `addConnection` wakes the waiter.
+    return stream_slot.future;
+}
+
+void ExchangeConnections::cleanupQuery(const String & query_id)
+{
+    std::vector<FutureConnectionPtr> orphaned;
+    {
+        std::lock_guard guard(mutex);
+        /// Tombstone the query so that a later addConnection/getConnection short-circuits instead of
+        /// recreating an orphan entry. The tombstones are bounded by evicting the oldest ids first:
+        /// late connections are only possible for recently cleaned queries.
+        const bool newly_cancelled = cancelled_queries.insert(query_id).second;
+        if (newly_cancelled)
+        {
+            cancelled_queries_order.push_back(query_id);
+            for (; cancelled_queries_order.size() > MAX_CANCELLED_QUERIES; cancelled_queries_order.pop_front())
+                cancelled_queries.erase(cancelled_queries_order.front());
+        }
+
+        auto slot_it = pending_connections.begin();
+        while (slot_it != pending_connections.end())
+        {
+            if (slot_it->first.first != query_id)
+            {
+                ++slot_it;
+                continue;
+            }
+            orphaned.push_back(slot_it->second.future);
+            slot_it = pending_connections.erase(slot_it);
+        }
+    }
+
+    if (orphaned.empty())
+        return;
+
+    LOG_TRACE(log, "Cleaning up {} pending exchange connections for query id {}", orphaned.size(), query_id);
+
+    /// `cancel` is a no-op if the connection already paired (`FutureConnection` completes at most once).
+    const auto cancellation = std::make_exception_ptr(
+        Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Exchange connection cancelled, query id {}", query_id));
+    for (const auto & future : orphaned)
+        future->cancel(cancellation);
+}
+
+void ExchangeConnections::markStreamReleased(const ConnectionKey & key)
+{
+    /// Expects the caller to hold `mutex`.
+    /// A stream that is already tombstoned keeps its position in the eviction order.
+    if (!released_streams.insert(key).second)
+        return;
+
+    released_streams_order.push_back(key);
+    /// FIFO eviction keeps the tombstone set bounded by MAX_RELEASED_STREAMS.
+    for (; released_streams_order.size() > MAX_RELEASED_STREAMS; released_streams_order.pop_front())
+        released_streams.erase(released_streams_order.front());
+}
+
+void ExchangeConnections::removePendingStreams(const String & query_id, const std::vector<String> & exchange_stream_ids)
+{
+    std::vector<FutureConnectionPtr> released;
+    {
+        std::lock_guard guard(mutex);
+        for (const auto & stream_id : exchange_stream_ids)
+        {
+            const ConnectionKey stream_key{query_id, stream_id};
+            const auto slot_it = pending_connections.find(stream_key);
+            if (slot_it != pending_connections.end())
+            {
+                released.push_back(slot_it->second.future);
+                pending_connections.erase(slot_it);
+            }
+            /// Tombstone the stream even when it had no slot: a connection arriving after this task
+            /// finished is then rejected instead of recreating an orphan slot (a worker never runs cleanupQuery).
+            markStreamReleased(stream_key);
+        }
+    }
+
+    if (released.empty())
+        return;
+
+    LOG_TRACE(log, "Cleaning up {} pending exchange connections for query id {}", released.size(), query_id);
+    /// `cancel` is a no-op if the connection already paired (`FutureConnection` completes at most once).
+    const auto cancellation = std::make_exception_ptr(
+        Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Exchange connection cancelled, query id {}", query_id));
+    for (const auto & future : released)
+        future->cancel(cancellation);
+}
+
+}
+#endif
diff --git a/src/Server/DistributedQuery/ExchangeConnections.h b/src/Server/DistributedQuery/ExchangeConnections.h
new file mode 100644
index 000000000000..633bd736b2a5
--- /dev/null
+++ b/src/Server/DistributedQuery/ExchangeConnections.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#ifdef OS_LINUX
+#include <Client/Connection.h>
+#include <base/defines.h>
+#include <Poco/Net/StreamSocket.h>
+#include <Server/DistributedQuery/FutureConnection.h>
+
+#include <boost/container_hash/hash.hpp>
+
+#include <deque>
+#include <future>
+#include <memory>
+#include <unordered_set>
+#include <utility>
+
+
+namespace DB
+{
+
+/// Stores connections initiated by remote tasks and allows local tasks to find them.
+class ExchangeConnections : boost::noncopyable
+{
+public:
+    ExchangeConnections() = default;
+    virtual ~ExchangeConnections() = default;
+
+    /// TODO: move to Context instead of this singleton
+    static std::shared_ptr<ExchangeConnections> instance()
+    {
+        static std::shared_ptr<ExchangeConnections> self = std::make_shared<ExchangeConnections>();
+        return self;
+    }
+
+    void addConnection(const String & query_id, const String & exchange_stream_id, Poco::Net::StreamSocket socket);
+
+    /// Get a future connection that will be ready once the remote side connects.
+    /// Returns immediately without blocking.
+    FutureConnectionPtr getConnection(const String & query_id, const String & exchange_stream_id);
+
+    /// Remove all pending entries that belong to the given query and wake any
+    /// waiters on them with a cancellation exception. Called from the
+    /// distributed-query cleanup path so connections that never paired up do
+    /// not leak `FutureConnection`s/eventfds across the lifetime of the server.
+    void cleanupQuery(const String & query_id);
+
+    /// Drop the pending entries for the given streams of a query, waking any waiter with a
+    /// cancellation. Used by a worker task to release only its own output streams on completion or
+    /// cancellation, without disturbing sibling tasks of the same query that own other streams.
+    void removePendingStreams(const String & query_id, const std::vector<String> & exchange_stream_ids);
+
+private:
+    std::mutex mutex; /// Held by every public method while it reads or modifies the containers below.
+    using ConnectionKey = std::pair<String, String>; /// query_id, exchange_stream_id
+    /// One rendezvous per exchange stream: the producer provides the socket (`addConnection`) and the
+    /// consumer takes the future (`getConnection`), in either order. The flags let a duplicate be
+    /// rejected instead of dropping the socket or handing it to two readers. Released by
+    /// `removePendingStreams` or `cleanupQuery`.
+    struct ConnectionSlot
+    {
+        FutureConnectionPtr future = std::make_shared<FutureConnection>();
+        bool socket_delivered = false;   /// a producer connection has provided its socket
+        bool consumer_assigned = false;  /// a consumer has taken the future
+    };
+    using ConnectionsMap = std::unordered_map<ConnectionKey, ConnectionSlot, boost::hash<ConnectionKey>>;
+    ConnectionsMap pending_connections;
+    /// Queries that have been cleaned up. Late add/get for these query ids must not
+    /// (re)create pending entries — otherwise the entry would have no owner and leak.
+    /// Bounded: query ids are tracked in insertion order and the oldest tombstone is dropped past
+    /// the cap, so the set cannot grow without limit on a long-running server.
+    static constexpr size_t MAX_CANCELLED_QUERIES = 100000;
+    std::unordered_set<String> cancelled_queries;
+    std::deque<String> cancelled_queries_order; /// Same ids as `cancelled_queries`, oldest first.
+
+    /// Streams already released by `removePendingStreams`. A worker never calls `cleanupQuery`, so
+    /// without this a connection arriving after the owning task finished would recreate a slot that
+    /// nothing ever releases, leaking an eventfd. Late add/get for these streams is rejected. Bounded
+    /// with the same FIFO eviction as `cancelled_queries`.
+    static constexpr size_t MAX_RELEASED_STREAMS = 100000;
+    std::unordered_set<ConnectionKey, boost::hash<ConnectionKey>> released_streams;
+    std::deque<ConnectionKey> released_streams_order; /// Same keys as `released_streams`, oldest first.
+
+    /// Records a stream as released, evicting the oldest entry past the cap. Call under `mutex`.
+    void markStreamReleased(const ConnectionKey & key);
+
+    LoggerPtr log = getLogger("ExchangeConnections");
+};
+
+using ExchangeConnectionsPtr = std::shared_ptr<ExchangeConnections>;
+
+}
+
+#endif
diff --git a/src/Server/DistributedQuery/ExchangeServer.cpp b/src/Server/DistributedQuery/ExchangeServer.cpp
new file mode 100644
index 000000000000..42a6410d6694
--- /dev/null
+++ b/src/Server/DistributedQuery/ExchangeServer.cpp
@@ -0,0 +1,263 @@
+#ifdef OS_LINUX
+#include <Server/DistributedQuery/ExchangeServer.h>
+#include <Server/DistributedQuery/ExchangeConnections.h>
+#include <Server/DistributedQuery/StreamingExchangeProtocol.h>
+#include <Common/logger_useful.h>
+#include <Common/Exception.h>
+#include <Common/PODArray.h>
+#include <Common/Stopwatch.h>
+#include <IO/ReadBufferFromMemory.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteBufferFromPocoSocket.h>
+#include <IO/WriteBufferFromString.h>
+#include <IO/WriteHelpers.h>
+#include <Common/CurrentMetrics.h>
+#include <Poco/Net/NetException.h>
+
+namespace CurrentMetrics
+{
+    extern const Metric ExchangeServerThreads;
+    extern const Metric ExchangeServerThreadsActive;
+    extern const Metric ExchangeServerThreadsScheduled;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int UNEXPECTED_PACKET_FROM_CLIENT;
+    extern const int PROTOCOL_VERSION_MISMATCH;
+}
+
+/// Bounds the handshake thread pool: enough threads to absorb bursts of concurrent connections
+/// without serializing on the accept thread, while staying bounded.
+static constexpr size_t HANDSHAKE_POOL_MAX_THREADS = 64;
+static constexpr size_t HANDSHAKE_POOL_MAX_FREE_THREADS = 8;
+static constexpr size_t HANDSHAKE_POOL_QUEUE_SIZE = 10000;
+
+ExchangeServer::ExchangeServer(const String & listen_host, UInt16 port, ExchangeConnectionsPtr connections_)
+    : connections(std::move(connections_))
+    , server_socket(Poco::Net::ServerSocket(Poco::Net::SocketAddress(listen_host, port)))
+    , accept_thread("ExchangeServer")
+    , handshake_pool(
+        CurrentMetrics::ExchangeServerThreads,
+        CurrentMetrics::ExchangeServerThreadsActive,
+        CurrentMetrics::ExchangeServerThreadsScheduled,
+        HANDSHAKE_POOL_MAX_THREADS, HANDSHAKE_POOL_MAX_FREE_THREADS, HANDSHAKE_POOL_QUEUE_SIZE)
+    , stopped(true)
+    , log(getLogger("ExchangeServer"))
+{
+}
+
+ExchangeServer::~ExchangeServer()
+{
+    try
+    {
+        stop();
+    }
+    catch (...)
+    {
+        tryLogCurrentException(log);
+    }
+}
+
+
+void ExchangeServer::start()
+{
+    LOG_DEBUG(log, "Starting ExchangeServer on {}", server_socket.address().toString());
+    stopped = false;
+    accept_thread.start(*this);
+}
+
+
+void ExchangeServer::stop()
+{
+    /// Atomic test-and-set: `stop` may be called more than once (it also runs from the destructor),
+    /// and only the first call may join the accept thread and drain the handshake pool.
+    if (stopped.exchange(true))
+        return;
+    accept_thread.join();
+    /// Finish in-flight handshakes (each bounded by HELLO_TIMEOUT_SECONDS) before `connections`
+    /// is torn down.
+    handshake_pool.wait();
+}
+
+void ExchangeServer::run()
+{
+    while (!stopped)
+    {
+        Poco::Timespan timeout(250000);
+        try
+        {
+            if (server_socket.poll(timeout, Poco::Net::Socket::SELECT_READ))
+            {
+                Poco::Net::StreamSocket socket;
+                try
+                {
+                    socket = server_socket.acceptConnection();
+                }
+                // Termination request
+                catch (Poco::InvalidArgumentException &)
+                {
+                    break;
+                }
+
+                /// Run the handshake on the pool, not inline, so a slow peer does not stall the
+                /// accept loop and block subsequent connections. Drop the connection if the pool
+                /// rejects the job (queue full or shutting down).
+                try
+                {
+                    handshake_pool.scheduleOrThrowOnError(
+                        [accepted = socket, conns = connections, task_log = log]()
+                        {
+                            try
+                            {
+                                handleConnection(accepted, conns, task_log);
+                            }
+                            catch (...)
+                            {
+                                tryLogCurrentException(task_log);
+                            }
+                        });
+                }
+                catch (...)
+                {
+                    tryLogCurrentException(log);
+                }
+            }
+        }
+        catch (Poco::Exception &)
+        {
+            tryLogCurrentException(log);
+            Poco::Thread::sleep(50);
+        }
+    }
+}
+
+namespace
+{
+    /// Read exactly `size` bytes from a blocking socket. Throws on EOF or transport error.
+    /// On a blocking socket with setReceiveTimeout, EAGAIN cannot surface (Poco raises
+    /// TimeoutException instead), so the would-block return from `tryReceive` would
+    /// indicate the socket was reconfigured non-blocking by mistake — we treat it as an error.
+    void receiveAll(Poco::Net::StreamSocket & socket, void * buffer, size_t size, const String & description, const Stopwatch & handshake_watch)
+    {
+        char * dst = static_cast<char *>(buffer);
+        size_t position = 0;
+        while (position < size)
+        {
+            /// Absolute deadline across the whole handshake: the per-call receive timeout alone does
+            /// not stop a peer that dribbles one byte just under the timeout and keeps the calling
+            /// thread (a handshake pool thread when scheduled by `run`) occupied indefinitely.
+            if (handshake_watch.elapsedSeconds() > StreamingExchangeProtocol::HELLO_TIMEOUT_SECONDS)
+                throw Poco::Net::NetException(fmt::format(
+                    "Handshake from {} exceeded {}s while receiving {}",
+                    socket.peerAddress().toString(), StreamingExchangeProtocol::HELLO_TIMEOUT_SECONDS, description));
+
+            ssize_t received = StreamingExchangeProtocol::tryReceive(socket, dst + position, size - position, description);
+            if (received == 0)
+                throw Poco::Net::NetException(fmt::format(
+                    "Failed to receive {} from {}, socket reported would-block on a blocking handshake after {} of {} bytes",
+                    description, socket.peerAddress().toString(), position, size));
+            position += received;
+        }
+    }
+}
+
+void ExchangeServer::handleConnection(Poco::Net::StreamSocket socket, ExchangeConnectionsPtr connections, LoggerPtr log)
+{
+    LOG_TRACE(log, "Connection from {}", socket.peerAddress().toString());
+
+    /// The handshake uses a blocking socket; `ExchangeServer::run` schedules it on the handshake pool.
+    /// Apply per-call timeouts so a silent or stalling peer cannot hold the executing thread
+    /// for longer than HELLO_TIMEOUT_SECONDS in any single socket call.
+    Poco::Timespan hello_timeout(StreamingExchangeProtocol::HELLO_TIMEOUT_SECONDS, 0);
+    socket.setReceiveTimeout(hello_timeout);
+    socket.setSendTimeout(hello_timeout);
+
+    Stopwatch handshake_watch;
+
+    StreamingExchangeProtocol::PacketHeader header{};
+    receiveAll(socket, &header, sizeof(header), "SourceHello header", handshake_watch);
+
+    if (header.packet_type != StreamingExchangeProtocol::PacketType::SourceHello)
+        throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT,
+            "Unexpected packet type 0x{:x} from {} (expected SourceHello 0x{:x})",
+            header.packet_type, socket.peerAddress().toString(),
+            static_cast<UInt64>(StreamingExchangeProtocol::PacketType::SourceHello));
+
+    if (header.bytes_size > StreamingExchangeProtocol::MAX_HELLO_BODY_BYTES)
+        throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT,
+            "SourceHello body size {} from {} exceeds the limit {}",
+            header.bytes_size, socket.peerAddress().toString(),
+            StreamingExchangeProtocol::MAX_HELLO_BODY_BYTES);
+
+    if (header.bytes_size < sizeof(UInt64))
+        throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT,
+            "SourceHello body size {} from {} is too small to contain the protocol version",
+            header.bytes_size, socket.peerAddress().toString());
+
+    PODArray<char> body_buffer(header.bytes_size);
+    if (!body_buffer.empty())
+        receiveAll(socket, body_buffer.data(), body_buffer.size(), "SourceHello body", handshake_watch);
+
+    /// Read only the version field first. The layout of fields after it can change between
+    /// protocol versions, so a peer on a different version must not have its body further parsed.
+    ReadBufferFromMemory body_in(body_buffer.data(), body_buffer.size());
+    StreamingExchangeProtocol::SourceHelloBody source_hello;
+    source_hello.source_version = StreamingExchangeProtocol::SourceHelloBody::readVersion(body_in);
+
+    WriteBufferFromOwnString reply_body;
+    StreamingExchangeProtocol::SinkHelloBody sink_hello{.sink_version = StreamingExchangeProtocol::PROTOCOL_VERSION};
+    sink_hello.write(reply_body);
+    reply_body.finalize();
+    const std::string & reply_body_str = reply_body.str();
+
+    StreamingExchangeProtocol::PacketHeader reply_header{
+        .packet_type = StreamingExchangeProtocol::PacketType::SinkHello,
+        .bytes_size = reply_body_str.size(),
+    };
+
+    /// On version mismatch, send SinkHello on a best-effort basis so the peer can produce
+    /// a precise diagnostic naming both versions, then throw locally. The connection is
+    /// not registered. On version match, the SinkHello send must succeed for the handshake
+    /// to be considered complete; let any write error propagate and abort the registration.
+    auto send_sink_hello = [&]
+    {
+        WriteBufferFromPocoSocket out(socket);
+        out.write(reinterpret_cast<const char *>(&reply_header), sizeof(reply_header));
+        if (!reply_body_str.empty())
+            out.write(reply_body_str.data(), reply_body_str.size());
+        out.finalize();
+    };
+
+    if (source_hello.source_version != StreamingExchangeProtocol::PROTOCOL_VERSION)
+    {
+        try
+        {
+            send_sink_hello();
+        }
+        catch (...)
+        {
+            tryLogCurrentException(log, fmt::format("Failed to send SinkHello to {}", socket.peerAddress().toString()));
+        }
+        throw Exception(ErrorCodes::PROTOCOL_VERSION_MISMATCH,
+            "Streaming exchange protocol version mismatch from {}: peer speaks version {}, this node speaks version {}",
+            socket.peerAddress().toString(), source_hello.source_version,
+            StreamingExchangeProtocol::PROTOCOL_VERSION);
+    }
+
+    /// Versions match - body layout is known, parse the rest.
+    source_hello.readAfterVersion(body_in);
+
+    LOG_TRACE(log, "Query id: {}, stream: {}, peer protocol version: {}",
+        source_hello.query_id, source_hello.stream_name, source_hello.source_version);
+
+    send_sink_hello();
+
+    connections->addConnection(source_hello.query_id, source_hello.stream_name, socket);
+}
+
+}
+#endif
diff --git a/src/Server/DistributedQuery/ExchangeServer.h b/src/Server/DistributedQuery/ExchangeServer.h
new file mode 100644
index 000000000000..c0f5bec36aed
--- /dev/null
+++ b/src/Server/DistributedQuery/ExchangeServer.h
@@ -0,0 +1,45 @@
+#pragma once
+
+#ifdef OS_LINUX
+#include <Server/DistributedQuery/ExchangeConnections.h>
+#include <Server/TCPServerConnectionFactory.h>
+#include <Poco/Net/ServerSocket.h>
+#include <Poco/Runnable.h>
+#include <Common/Logger.h>
+#include <Common/ThreadPool.h>
+
+namespace DB
+{
+
+/// Accepts connections for streaming exchanges used by distributed queries.
+/// Reads the first packet from each connection, which contains the distributed query id and the exchange stream name.
+/// Then the connection is stored in a map and can be retrieved by a distributed query task to create a StreamingExchangeSink.
+class ExchangeServer : public Poco::Runnable
+{
+public:
+    ExchangeServer(const String & listen_host, UInt16 port, ExchangeConnectionsPtr connections_);
+    ~ExchangeServer() override;
+
+    void start();
+    void stop();
+
+    void run() override;
+
+    /// Runs the SourceHello/SinkHello handshake on `socket` and registers the
+    /// resulting connection in `connections` on success. Throws on protocol
+    /// mismatch or transport failure without registering. Exposed for tests.
+    static void handleConnection(Poco::Net::StreamSocket socket, ExchangeConnectionsPtr connections, LoggerPtr log);
+
+private:
+    ExchangeConnectionsPtr connections;
+    Poco::Net::ServerSocket server_socket;
+    Poco::Thread accept_thread;
+    /// Handshakes run on this pool so a slow peer cannot stall the single accept thread and block
+    /// subsequent connections.
+    ThreadPool handshake_pool;
+    std::atomic<bool> stopped {false};
+    LoggerPtr log;
+};
+
+}
+#endif
diff --git a/src/Server/DistributedQuery/FutureConnection.cpp b/src/Server/DistributedQuery/FutureConnection.cpp
new file mode 100644
index 000000000000..880f25e64631
--- /dev/null
+++ b/src/Server/DistributedQuery/FutureConnection.cpp
@@ -0,0 +1,95 @@
+#ifdef OS_LINUX
+
+#include <Server/DistributedQuery/FutureConnection.h>
+#include <Common/Exception.h>
+#include <Common/logger_useful.h>
+#include <base/scope_guard.h>
+#include <sys/eventfd.h>
+#include <unistd.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CANNOT_OPEN_FILE;
+    extern const int LOGICAL_ERROR;
+}
+
+FutureConnection::FutureConnection()
+    : future(promise.get_future())
+    , event_fd(createEventFd())
+{
+    LOG_TRACE(log, "Created FutureConnection");
+}
+
+FutureConnection::~FutureConnection()
+{
+    [[maybe_unused]] int err = close(event_fd);
+    chassert(!err || errno == EINTR);
+}
+
+int FutureConnection::createEventFd()
+{
+    auto fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+    if (fd == -1)
+        throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "Failed to create eventfd, error {}", errno);
+    return fd;
+}
+
+int FutureConnection::getEventFd() const
+{
+    return event_fd;
+}
+
+bool FutureConnection::isReady() const
+{
+    return future.wait_for(std::chrono::seconds(0)) == std::future_status::ready;
+}
+
+Poco::Net::Socket FutureConnection::getSocket()
+{
+    if (!isReady())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "FutureConnection does not have a ready future, check isReady() before calling getSocket()");
+
+    /// `future` is a shared_future: repeated `get` calls return the same socket, or rethrow the exception stored by `cancel`.
+    return future.get();
+}
+
+void FutureConnection::setSocket(Poco::Net::Socket socket)
+{
+    /// First completion wins; a later setSocket/cancel is a no-op (the connection already paired
+    /// or the query was torn down).
+    if (satisfied.exchange(true))
+        return;
+
+    LOG_TRACE(log, "Setting socket for FutureConnection");
+    promise.set_value(std::move(socket));
+    notifyWaiter();
+}
+
+void FutureConnection::cancel(std::exception_ptr exception)
+{
+    if (satisfied.exchange(true))
+        return;
+
+    LOG_TRACE(log, "Cancelling FutureConnection");
+    promise.set_exception(std::move(exception));
+    notifyWaiter();
+}
+
+void FutureConnection::notifyWaiter() const
+{
+    uint64_t value = 1;
+    ssize_t written = 0;
+    /// Retry on EINTR so a signal does not leave the promise ready while the epoll waiter is never
+    /// woken. Other write failures cannot happen for a non-full, valid eventfd.
+    do
+        written = write(event_fd, &value, sizeof(value));
+    while (written < 0 && errno == EINTR);
+    chassert(written == sizeof(value));
+}
+
+}
+
+#endif
diff --git a/src/Server/DistributedQuery/FutureConnection.h b/src/Server/DistributedQuery/FutureConnection.h
new file mode 100644
index 000000000000..7589590a1e8a
--- /dev/null
+++ b/src/Server/DistributedQuery/FutureConnection.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#ifdef OS_LINUX
+
+#include <Poco/Net/StreamSocket.h>
+#include <Common/Logger.h>
+#include <atomic>
+#include <future>
+
+namespace DB
+{
+
+/// Represents a connection that may not be established yet.
+/// Provides an eventfd that can be used with epoll to wait asynchronously for the connection.
+class FutureConnection
+{
+public:
+    FutureConnection();
+    ~FutureConnection();
+
+    /// Get the eventfd file descriptor for epoll (created in the constructor, written to by `setSocket`/`cancel`)
+    int getEventFd() const;
+
+    /// Check if the connection is ready (non-blocking)
+    bool isReady() const;
+
+    /// Get the socket, or rethrow the exception that was passed to `cancel`.
+    /// Should only be called once the connection is ready, otherwise it will throw an exception.
+    /// Could be called multiple times after connection is ready and will return the same socket.
+    Poco::Net::Socket getSocket();
+
+    /// Set the socket value (called when connection is established)
+    /// First completion wins: a call made after an earlier `setSocket` or `cancel` is a no-op.
+    void setSocket(Poco::Net::Socket socket);
+
+    /// Wake the waiter with an exception. Used to cancel a still-pending
+    /// connection (e.g. when the owning query is being torn down).
+    /// At-most-once like `setSocket`.
+    void cancel(std::exception_ptr exception);
+
+private:
+    static int createEventFd();
+
+    /// Wake the epoll waiter via the eventfd after the promise is completed.
+    void notifyWaiter() const;
+
+    std::promise<Poco::Net::Socket> promise;
+    std::shared_future<Poco::Net::Socket> future;
+    /// Guards the single allowed promise completion: setSocket and cancel race (the peer connecting
+    /// vs. query teardown), and the loser must be a no-op rather than throw "promise already set".
+    std::atomic<bool> satisfied{false};
+    int event_fd;
+    LoggerPtr log = getLogger("FutureConnection");
+};
+
+using FutureConnectionPtr = std::shared_ptr<FutureConnection>;
+
+}
+
+#endif
diff --git a/src/Server/DistributedQuery/StreamingExchangeLookup.cpp b/src/Server/DistributedQuery/StreamingExchangeLookup.cpp
new file mode 100644
index 000000000000..c5dffc6f0a92
--- /dev/null
+++ b/src/Server/DistributedQuery/StreamingExchangeLookup.cpp
@@ -0,0 +1,69 @@
+#ifdef OS_LINUX
+
+#include <Server/DistributedQuery/StreamingExchangeLookup.h>
+#include <Server/DistributedQuery/StreamingExchangeSink.h>
+#include <Server/DistributedQuery/StreamingExchangeSource.h>
+#include <Server/DistributedQuery/ExchangeConnections.h>
+#include <Processors/QueryPlan/ExchangeLookup.h>
+#include <Processors/QueryPlan/LogicalExchangeStep.h>
+#include <QueryPipeline/DistributedPlanExecutor.h>
+#include <base/types.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+class StreamingExchangeLookup : public IExchangeLookup
+{
+public:
+    explicit StreamingExchangeLookup(
+        const String & query_id_,
+        ExchangeConnectionsPtr connections_,
+        const ExchangeStreamSources & exchange_stream_sources_,
+        UInt16 streaming_exchange_port_)
+        : query_id(query_id_)
+        , connections(connections_)
+        , exchange_stream_sources(exchange_stream_sources_)
+        , streaming_exchange_port(streaming_exchange_port_)
+    {
+    }
+
+    std::shared_ptr<ISink> createSink(SharedHeader input_header, const ExchangeStreamId & exchange_stream_id) override
+    {
+        auto stream_name = exchange_stream_id.toString();
+        auto future_connection = connections->getConnection(query_id, stream_name);
+        return std::make_shared<StreamingExchangeSink>(input_header, future_connection, stream_name);
+    }
+
+    std::shared_ptr<ISource> createSource(SharedHeader output_header, const ExchangeStreamId & exchange_stream_id) override
+    {
+        auto stream_name = exchange_stream_id.toString();
+        auto it = exchange_stream_sources.stream_hosts.find(stream_name);
+        if (it == exchange_stream_sources.stream_hosts.end())
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "No host found for exchange stream {}", stream_name);
+        return std::make_shared<StreamingExchangeSource>(output_header, query_id, stream_name, it->second, streaming_exchange_port);
+    }
+
+private:
+    const String query_id;
+    const ExchangeConnectionsPtr connections;
+    const ExchangeStreamSources exchange_stream_sources;
+    const UInt16 streaming_exchange_port;
+};
+
+ExchangeLookupPtr createStreamingExchangeLookup(
+    const String & query_id,
+    ExchangeConnectionsPtr connections,
+    const ExchangeStreamSources & exchange_stream_sources,
+    UInt16 streaming_exchange_port)
+{
+    return std::make_shared<StreamingExchangeLookup>(query_id, connections, exchange_stream_sources, streaming_exchange_port);
+}
+
+}
+
+#endif
diff --git a/src/Server/DistributedQuery/StreamingExchangeLookup.h b/src/Server/DistributedQuery/StreamingExchangeLookup.h
new file mode 100644
index 000000000000..c3ed9f8639d6
--- /dev/null
+++ b/src/Server/DistributedQuery/StreamingExchangeLookup.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#ifdef OS_LINUX
+
+#include <Server/DistributedQuery/ExchangeConnections.h>
+#include <Processors/QueryPlan/ExchangeLookup.h>
+#include <QueryPipeline/DistributedPlanExecutor.h>
+
+namespace DB
+{
+
+ExchangeLookupPtr createStreamingExchangeLookup(
+    const String & query_id,
+    ExchangeConnectionsPtr connections,
+    const ExchangeStreamSources & exchange_stream_sources,
+    UInt16 streaming_exchange_port);
+
+}
+
+#endif
diff --git a/src/Server/DistributedQuery/StreamingExchangeProtocol.cpp b/src/Server/DistributedQuery/StreamingExchangeProtocol.cpp
new file mode 100644
index 000000000000..4571c917c75c
--- /dev/null
+++ b/src/Server/DistributedQuery/StreamingExchangeProtocol.cpp
@@ -0,0 +1,72 @@
+#include <Server/DistributedQuery/StreamingExchangeProtocol.h>
+
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+#include <Poco/Net/NetException.h>
+#include <Poco/Net/StreamSocket.h>
+
+#include <algorithm>
+#include <climits>
+#include <cerrno>
+
+namespace DB
+{
+namespace StreamingExchangeProtocol
+{
+
+UInt64 SourceHelloBody::readVersion(ReadBuffer & in)
+{
+    UInt64 version = 0;
+    readIntBinary(version, in);
+    return version;
+}
+
+void SourceHelloBody::readAfterVersion(ReadBuffer & in)
+{
+    readStringBinary(query_id, in);
+    readStringBinary(stream_name, in);
+}
+
+void SourceHelloBody::write(WriteBuffer & out) const
+{
+    writeIntBinary(source_version, out);
+    writeStringBinary(query_id, out);
+    writeStringBinary(stream_name, out);
+}
+
+void SinkHelloBody::read(ReadBuffer & in)
+{
+    readIntBinary(sink_version, in);
+}
+
+void SinkHelloBody::write(WriteBuffer & out) const
+{
+    writeIntBinary(sink_version, out);
+}
+
+ssize_t tryReceive(Poco::Net::StreamSocket & socket, char * buffer, size_t size, const String & description)
+{
+    /// Poco's receiveBytes takes int. Cap the request at INT_MAX so a >2 GiB buffer
+    /// (the data-path body is sized by an untrusted peer) does not wrap negative.
+    const int chunk = static_cast<int>(std::min<size_t>(size, INT_MAX));
+    while (true)
+    {
+        ssize_t received = socket.receiveBytes(buffer, chunk);
+        if (received > 0)
+            return received;
+        if (received == 0)
+            throw Poco::Net::NetException(fmt::format(
+                "Failed to receive {} from {}, peer closed connection", description, socket.peerAddress().toString()));
+
+        const int last_error = errno;
+        if (last_error == EINTR)
+            continue;
+        if (last_error == EAGAIN || last_error == EWOULDBLOCK)
+            return 0;
+        throw Poco::Net::NetException(fmt::format(
+            "Failed to receive {} from {}, errno {}", description, socket.peerAddress().toString(), last_error));
+    }
+}
+
+}
+}
diff --git a/src/Server/DistributedQuery/StreamingExchangeProtocol.h b/src/Server/DistributedQuery/StreamingExchangeProtocol.h
new file mode 100644
index 000000000000..1e1a2ed8d1e5
--- /dev/null
+++ b/src/Server/DistributedQuery/StreamingExchangeProtocol.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include <base/types.h>
+
+namespace Poco::Net
+{
+    class StreamSocket;
+}
+
+namespace DB
+{
+
+class ReadBuffer;
+class WriteBuffer;
+
+namespace StreamingExchangeProtocol
+{
+    /// Wire-format version. Bumped on any change to packet layouts.
+    /// Negotiated in SourceHello/SinkHello; mismatches reject the connection.
+    static constexpr UInt64 PROTOCOL_VERSION = 2;
+
+    /// Sanity cap for the body of a Hello packet.
+    static constexpr UInt64 MAX_HELLO_BODY_BYTES = 64 * 1024;
+
+    /// Sanity cap for the body of a Data packet. The peer-supplied size is allocated
+    /// before any payload is read, so an unbounded value would let a buggy or hostile
+    /// sink make the source allocate arbitrarily large buffers.
+    static constexpr UInt64 MAX_DATA_PACKET_BODY_BYTES = 256ULL * 1024 * 1024;
+
+    /// Per-recv/send timeout applied to the Hello exchange on the server side, also used there
+    /// as the overall deadline for receiving SourceHello. The handshake uses blocking socket
+    /// calls, so a silent peer must not hold the thread running it indefinitely.
+    static constexpr int HELLO_TIMEOUT_SECONDS = 10;
+
+    /// Packet types between StreamingExchangeSink and StreamingExchangeSource.
+    /// SourceHello/SinkHello magic numbers were changed when framing was introduced,
+    /// so peers on the older unframed handshake fail with UNEXPECTED_PACKET_FROM_CLIENT.
+    enum PacketType : UInt64
+    {
+        SourceHello = 0x0004e110,   /// Sent by source to sink when initiating connection
+        SinkHello   = 0x1004e110,   /// Response from sink to source
+        Data        = 0x0000da7a,   /// Data packet
+        NoMoreDataNeeded  = 0x00a11fed, /// Sent by source to sink when no more data is needed; not framed by PacketHeader
+    };
+
+    /// Packet header used by Data, SourceHello, and SinkHello packets. Contains size in bytes
+    /// of the whole serialized body. This allows to first read full packet from the socket with
+    /// non-blocking reads and epoll and then deserialize the body from memory buffer to also avoid blocking.
+    struct PacketHeader
+    {
+        UInt64 packet_type;
+        UInt64 bytes_size;  /// Size of the packet body (does not include this header)
+    };
+
+    /// Wire format of the SourceHello body. Parsing is split in two phases because
+    /// only the version field is guaranteed to live at a fixed offset across protocol
+    /// versions: peers on a different version may use a different layout for the rest,
+    /// so the version must be read and validated first.
+    struct SourceHelloBody
+    {
+        UInt64 source_version = 0;
+        String query_id;
+        String stream_name;
+
+        static UInt64 readVersion(ReadBuffer & in);
+        void readAfterVersion(ReadBuffer & in);
+        void write(WriteBuffer & out) const;
+    };
+
+    /// Wire format of the SinkHello body. Currently carries the sink's protocol version
+    /// so the source can produce a precise diagnostic on a mismatch.
+    struct SinkHelloBody
+    {
+        UInt64 sink_version = 0;
+
+        void read(ReadBuffer & in);
+        void write(WriteBuffer & out) const;
+    };
+
+    /// Single receive that retries on EINTR. Returns bytes read, or 0 if the socket
+    /// would block. Throws Poco::Net::NetException on early EOF or other socket error;
+    /// `description` labels the call site in the exception message.
+    ssize_t tryReceive(Poco::Net::StreamSocket & socket, char * buffer, size_t size, const String & description);
+}
+}
diff --git a/src/Server/DistributedQuery/StreamingExchangeSink.cpp b/src/Server/DistributedQuery/StreamingExchangeSink.cpp
new file mode 100644
index 000000000000..04f2a96446f3
--- /dev/null
+++ b/src/Server/DistributedQuery/StreamingExchangeSink.cpp
@@ -0,0 +1,449 @@
+#include <limits>
+#include <memory>
+#include <base/defines.h>
+#ifdef OS_LINUX
+
+#include <Server/DistributedQuery/StreamingExchangeSink.h>
+#include <Server/DistributedQuery/StreamingExchangeProtocol.h>
+#include <Processors/Transforms/AggregatingTransform.h>
+#include <Compression/CompressedWriteBuffer.h>
+#include <Formats/NativeWriter.h>
+#include <Core/ProtocolDefines.h>
+#include <IO/WriteHelpers.h>
+#include <IO/WriteBufferFromPocoSocket.h>
+#include <Common/logger_useful.h>
+#include <Poco/Net/NetException.h>
+
+#include <sys/epoll.h>
+#include <unistd.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int UNEXPECTED_PACKET_FROM_CLIENT;
+    extern const int LOGICAL_ERROR;
+    extern const int NOT_IMPLEMENTED;
+}
+
+StreamingExchangeSink::~StreamingExchangeSink()
+{
+    if (auto * pending = out.get(); pending && !pending->isFinalized()) /// `out` is null until extractSocket() runs
+        pending->cancel();
+}
+
+void StreamingExchangeSink::extractSocket()
+{
+    if (socket) /// Must be called at most once per sink.
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Socket has already been extracted for exchange stream {}", stream_name);
+
+    if (!future_connection)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Future connection is not set for exchange stream {}", stream_name);
+
+    if (!future_connection->isReady()) /// Callers check readiness first (see scheduleForEvent), so this indicates a sequencing bug.
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Future connection is expected be ready at this point. Wrong sequence of prepare/schedule/work calls for exchange stream {}", stream_name);
+
+    LOG_TRACE(log, "Extracting socket from future connection for exchange stream {}", stream_name);
+    socket = std::make_unique<Poco::Net::StreamSocket>(future_connection->getSocket());
+    future_connection.reset(); /// The future is not needed anymore once the sink holds the socket.
+    chassert(socket);
+
+    /// Set the socket to non-blocking mode now that the handshake is finished.
+    socket->setBlocking(false);
+    socket->setSendBufferSize(1 * 1024 * 1024); /// Request a 1 MiB send buffer.
+
+    /// Prepare the initial in-memory buffer into which chunks are serialized.
+    out = std::make_shared<WriteBufferFromOwnString>();
+}
+
+/// Send data from `current_send_buffer` to the socket until the buffer is empty or until the socket would block.
+void StreamingExchangeSink::sendToSocket()
+{
+    /// Drain any inbound NoMoreDataNeeded / peer half-close before attempting to send.
+    tryReceiveControlPacket();
+    if (no_more_data_needed)
+        return;
+
+    while (current_send_position_in_buffer < current_send_buffer.size())
+    {
+        try
+        {
+            /// `markNoMoreDataNeeded` clears `current_send_buffer`, so we can't be in this loop.
+            chassert(!no_more_data_needed);
+
+            size_t bytes_to_send = current_send_buffer.size() - current_send_position_in_buffer;
+            /// Saturate at INT_MAX: a plain cast would wrap negative for buffers > 2 GiB, after
+            /// which Poco's wrapper short-circuits without ever calling ::send.
+            ssize_t sent = socket->sendBytes(
+                current_send_buffer.data() + current_send_position_in_buffer,
+                static_cast<int>(std::min<size_t>(bytes_to_send, std::numeric_limits<int>::max())));
+            if (sent < 0)
+            {
+                auto last_error = errno; /// Read errno right away, before any other call can overwrite it.
+                if (last_error == EINTR)
+                {
+                    continue; /// Interrupted: retry the same send.
+                }
+                else if (last_error == EAGAIN || last_error == EWOULDBLOCK)
+                {
+                    /// The socket is not ready for writing; wait for the next epoll event.
+                    break;
+                }
+                else
+                {
+                    throw Poco::Net::NetException(fmt::format("Failed to send data to socket for stream {}, last error {}", stream_name, last_error));
+                }
+            }
+
+            LOG_TEST(log, "Sent {} bytes to exchange stream {}, fd: {}", sent, stream_name, socket->sockfd());
+
+            current_send_position_in_buffer += sent; /// A partial send is fine: the loop continues from the new position.
+            total_bytes_sent += sent;
+        }
+        catch (const Poco::IOException & e)
+        {
+            /// Peer may have sent NoMoreDataNeeded or half-closed; only swallow the exception
+            /// in those cases, otherwise it's a real network error.
+            LOG_TRACE(log, "Send to exchange stream {} hit IO exception: {}; checking for peer close", stream_name, e.displayText());
+            tryReceiveControlPacket();
+            if (!no_more_data_needed)
+                throw;
+            return;
+        }
+    }
+
+    /// If enough serialized data has accumulated in `out`, try to promote it to the send buffer.
+    if (out->count() >= FLUSH_BUFFER_TO_SOCKET_THRESHOLD)
+        tryToSwitchSendBuffer();
+}
+
+bool StreamingExchangeSink::canAddChunk() const
+{
+    /// Pending bytes = the unsent tail of `current_send_buffer` plus everything already serialized into `out`.
+    return (current_send_buffer.size() - current_send_position_in_buffer) + out->count() < MAX_PENDING_BYTES;
+}
+
+void StreamingExchangeSink::tryToSwitchSendBuffer()
+{
+    /// The switch is only possible once the previous send buffer has been written to the
+    /// socket completely, and it only makes sense if there is serialized data to promote.
+    const bool previous_buffer_drained = current_send_position_in_buffer >= current_send_buffer.size();
+    const bool has_serialized_data = out->count() != 0;
+    if (!previous_buffer_drained || !has_serialized_data)
+        return;
+
+    /// Promote the serialized bytes to the send buffer and start a fresh in-memory buffer.
+    out->finalize();
+    current_send_buffer = out->str();
+    current_send_position_in_buffer = 0;
+    out = std::make_shared<WriteBufferFromOwnString>();
+}
+
+ISink::Status StreamingExchangeSink::prepare()
+{
+    /// If the socket is not ready yet, wait for it (see scheduleForEvent).
+    if (!socket)
+        return Status::Async;
+
+    if (has_input) /// A chunk is already waiting for work(); hand it over only when there is room for it.
+        return canAddChunk() ? Status::Ready : Status::Async;
+
+    if (input.isFinished())
+    {
+        if (!final_chunk_added)
+        {
+            if (!canAddChunk())
+                return Status::Async;
+            /// Input is finished, send an empty chunk to signal end-of-stream.
+            input_is_finished = true;
+            current_chunk = {};
+            has_input = true;
+            return Status::Ready;
+        }
+
+        /// Need to flush all remaining data before finishing.
+        if (current_send_position_in_buffer < current_send_buffer.size() || out->count() > 0)
+            return Status::Async;
+
+        if (!was_on_finish_called) /// One more work() call is needed to run onFinish().
+            return Status::Ready;
+
+        return Status::Finished;
+    }
+
+    /// Propagate back-pressure upstream: don't pull until there's room.
+    if (!canAddChunk())
+        return Status::Async;
+
+    input.setNeeded();
+    if (!input.hasData())
+        return Status::NeedData;
+
+    current_chunk = input.pull(true);
+    has_input = true; /// work() will serialize `current_chunk`.
+    return Status::Ready;
+}
+
+void StreamingExchangeSink::work()
+{
+    /// Take over the socket from the future connection if that has not been done yet.
+    if (!socket)
+    {
+        extractSocket();
+        return;
+    }
+
+    /// React to EPOLLIN / EPOLLRDHUP wakeups (see scheduleForEvent).
+    tryReceiveControlPacket();
+
+    if (has_input)
+    {
+        /// If the final chunk has already been added, no new input is expected.
+        chassert(!final_chunk_added);
+
+        has_input = false;
+        if (input_is_finished)
+        {
+            /// Send the empty final chunk (prepared by prepare() once the input is finished).
+            chassert(!current_chunk);
+            final_chunk_added = true;
+            consume(std::move(current_chunk));
+        }
+        else if (current_chunk)
+        {
+            /// If the chunk is not the final one, send it only if it is not empty.
+            consume(std::move(current_chunk));
+        }
+
+        return;
+    }
+
+    /// Send pending data to the socket.
+    if (current_send_position_in_buffer < current_send_buffer.size() || out->count() > 0)
+    {
+        sendToSocket();
+        tryToSwitchSendBuffer(); /// Promote what is left in `out` once the send buffer is drained.
+        return;
+    }
+
+    if (!was_on_finish_called) /// Nothing is pending anymore: finish exactly once.
+    {
+        was_on_finish_called = true;
+        onFinish();
+        return;
+    }
+}
+
+std::pair<int, uint32_t> StreamingExchangeSink::scheduleForEvent()
+{
+    /// The connection may have become ready in the meantime: take the socket over first if so.
+    if (!socket)
+    {
+        if (!future_connection)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Future connection is not set for exchange stream {}", stream_name);
+
+        if (future_connection->isReady())
+            extractSocket();
+    }
+
+    /// Still no socket: wait on the eventfd of the pending connection.
+    if (!socket)
+    {
+        const int event_fd = future_connection->getEventFd();
+        LOG_TEST(log, "Schedule exchange stream sink {} waiting for connection, eventfd: {}", stream_name, event_fd);
+        return {event_fd, EPOLLIN | EPOLLERR};
+    }
+
+    /// EPOLLIN | EPOLLRDHUP wake us on peer-initiated NoMoreDataNeeded / half-close.
+    const int socket_fd = socket->sockfd();
+    LOG_TEST(log, "Schedule exchange stream sink {}, socket is ready, fd: {}", stream_name, socket_fd);
+    const uint32_t events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLERR;
+    return {socket_fd, events};
+}
+
+void StreamingExchangeSink::consume(Chunk chunk)
+{
+    if (no_more_data_needed)
+    {
+        /// We have to consume all chunks from input even if we have already received NoMoreDataNeeded packet.
+        /// This is needed to avoid stuck pipeline in case of some buckets of ShuffleExchange don't need data while others still do.
+        /// So we just drop the chunk and continue.
+        /// TODO: is there a better way to figure out when all buckets don't need data and close the inputs in pipeline?
+        LOG_TEST(log, "No more data needed for exchange stream {}, dropping chunk with {} rows", stream_name, chunk.getNumRows());
+        return;
+    }
+
+    rows_written += chunk.getNumRows();
+
+    if (chunk.getNumRows() == 0 && chunk.getNumColumns() != 0)
+    {
+        LOG_TEST(log, "Unexpected chunk with 0 rows to exchange stream {}", stream_name);
+    }
+
+    LOG_TEST(log, "Writing chunk with {} rows to exchange stream {}", chunk.getNumRows(), stream_name);
+
+    /// Write packet header stub.
+    /// The actual size will be calculated and overwritten after the chunk is serialized.
+    const ssize_t packet_header_offset = out->count();
+    StreamingExchangeProtocol::PacketHeader packet_header{.packet_type = StreamingExchangeProtocol::PacketType::Data, .bytes_size = 0};
+    out->write(reinterpret_cast<const char*>(&packet_header), sizeof(packet_header));
+
+    const bool final_chunk = chunk.empty();
+    auto agg_info = chunk.getChunkInfos().get<AggregatedChunkInfo>();
+    const bool has_aggregated_chunk_info = !!agg_info;
+    UInt64 flags = 0;
+    if (final_chunk)
+        flags |= 1; /// Bit 0: final (end-of-stream) chunk.
+    if (has_aggregated_chunk_info)
+        flags |= 2; /// Bit 1: chunk_num of the AggregatedChunkInfo follows the column count.
+    writeVarUInt(flags, *out);
+    writeVarUInt(chunk.getNumRows(), *out);
+    writeVarUInt(chunk.getNumColumns(), *out);
+    /// chunk_num has no BlockInfo field; carry it in the exchange framing so memory-bound merging
+    /// can restore chunk order on the receiver.
+    if (has_aggregated_chunk_info)
+        writeVarUInt(agg_info->chunk_num, *out);
+
+    if (chunk.getNumColumns() > 0) /// The compressed native block is written only when there are columns.
+    {
+        auto compressed_buf = std::make_unique<CompressedWriteBuffer>(*out);
+        auto writer = std::make_unique<NativeWriter>(*compressed_buf, DBMS_TCP_PROTOCOL_VERSION, input.getSharedHeader());
+
+        Block block = input.getHeader().cloneWithColumns(chunk.getColumns());
+        /// Carry the remaining aggregation metadata in block.info, the same way partial-aggregation
+        /// results are transported for distributed/parallel reads.
+        if (agg_info)
+        {
+            block.info.bucket_num = agg_info->bucket_num;
+            block.info.is_overflows = agg_info->is_overflows;
+            block.info.out_of_order_buckets = agg_info->out_of_order_buckets;
+        }
+        writer->write(block);
+
+        writer->flush();
+        compressed_buf->finalize();
+    }
+
+    /// Fill the actual size in the header
+    {
+        /// `out` is a WriteBufferFromString, so we can rely on count() to get the current position in the buffer.
+        const ssize_t end_of_packet_offset = out->count();
+        const ssize_t packet_data_size = end_of_packet_offset - packet_header_offset - sizeof(StreamingExchangeProtocol::PacketHeader);
+
+        if (packet_data_size < 0)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid packet data size: {}", packet_data_size);
+
+        /// The receiver rejects Data packets above this limit; fail here with a clear, local error
+        /// instead of sending one the peer would reject. Splitting large chunks is not implemented yet.
+        if (static_cast<UInt64>(packet_data_size) > StreamingExchangeProtocol::MAX_DATA_PACKET_BODY_BYTES)
+            throw Exception(ErrorCodes::NOT_IMPLEMENTED,
+                "Exchange data packet of {} bytes exceeds the maximum {}; splitting large chunks is not implemented",
+                packet_data_size, StreamingExchangeProtocol::MAX_DATA_PACKET_BODY_BYTES);
+
+        /// Fill bytes_size field using memcpy because packet header address in the buffer might not be properly aligned.
+        char * packet_header_start = const_cast<char*>(out->stringView().data()) + packet_header_offset;
+        static_assert(sizeof(StreamingExchangeProtocol::PacketHeader::bytes_size) == sizeof(packet_data_size));
+        memcpy(packet_header_start + offsetof(StreamingExchangeProtocol::PacketHeader, bytes_size), &packet_data_size, sizeof(packet_data_size));
+
+        LOG_TEST(log, "Packet with {} bytes was added to exchange stream {}", packet_data_size, stream_name);
+    }
+
+    if (chunk.getNumRows() == 0)
+    {
+        /// Just in case, promote the buffer right away so that it gets sent without waiting for the flush threshold.
+        tryToSwitchSendBuffer();
+    }
+
+    sendToSocket();
+}
+
+void StreamingExchangeSink::onFinish()
+{
+    /// work() calls this only after all pending data has been sent, so only the totals are reported here.
+    LOG_TRACE(log, "Finished writing to exchange stream {}, total rows: {}, bytes: {}", stream_name, rows_written, total_bytes_sent);
+}
+
+bool StreamingExchangeSink::tryReadFromSocketNonBlocking(char * buffer, size_t buffer_size, size_t & position)
+{
+    /// A single receiveBytes call takes an int length, so larger requests are clamped to INT_MAX.
+    constexpr size_t max_single_read = std::numeric_limits<int>::max();
+    while (position < buffer_size)
+    {
+        const int to_read = static_cast<int>(std::min<size_t>(buffer_size - position, max_single_read));
+        const ssize_t got = socket->receiveBytes(buffer + position, to_read);
+        if (got > 0)
+        {
+            position += got;
+            continue;
+        }
+        if (got == 0)
+            return false; /// Peer half-closed.
+        const auto last_error = errno; /// Negative result: the reason is in errno; EINTR just retries the loop.
+        if (last_error == EAGAIN || last_error == EWOULDBLOCK)
+            return true; /// No data right now, try later.
+        if (last_error != EINTR)
+            throw Poco::Net::NetException(fmt::format(
+                "Failed to receive control packet on exchange stream {}, error {}", stream_name, last_error));
+    }
+    return true;
+}
+
+void StreamingExchangeSink::tryReceiveControlPacket()
+{
+    if (no_more_data_needed)
+        return; /// Already signalled; nothing more to read.
+    if (!socket)
+        return; /// Not connected yet.
+
+    const bool not_eof = tryReadFromSocketNonBlocking(
+        reinterpret_cast<char *>(&incoming_packet_type),
+        sizeof(incoming_packet_type),
+        incoming_packet_bytes_filled); /// Partial reads accumulate across calls in this member.
+
+    if (incoming_packet_bytes_filled == sizeof(incoming_packet_type))
+    {
+        if (incoming_packet_type != StreamingExchangeProtocol::PacketType::NoMoreDataNeeded)
+            throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT,
+                "Unexpected packet type {} from peer on exchange stream {}",
+                incoming_packet_type, stream_name);
+
+        LOG_TRACE(log, "Received NoMoreDataNeeded for exchange stream {}", stream_name);
+        markNoMoreDataNeeded();
+        return;
+    }
+
+    if (!not_eof) /// The peer half-closed before a complete control packet arrived.
+    {
+        if (incoming_packet_bytes_filled > 0)
+            throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT,
+                "Peer half-closed exchange stream {} after {} of {} control bytes; truncated control message",
+                stream_name, incoming_packet_bytes_filled, sizeof(incoming_packet_type));
+
+        /// Normal end-of-stream: after the sink sent the final empty chunk, the source consumes it
+        /// and closes without sending NoMoreDataNeeded. Treat EOF as benign only with nothing left to send.
+        const size_t unsent = current_send_buffer.size() - current_send_position_in_buffer;
+        if (!final_chunk_added || unsent > 0 || out->count() > 0)
+            throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT,
+                "Peer half-closed exchange stream {} without sending NoMoreDataNeeded "
+                "(final_chunk_added={}, unsent={}, out={})",
+                stream_name, final_chunk_added, unsent, out->count());
+
+        LOG_TRACE(log, "Peer closed exchange stream {} after consuming the final chunk", stream_name);
+        markNoMoreDataNeeded(); /// Makes further calls return early instead of reading from the closed socket.
+    }
+}
+
+void StreamingExchangeSink::markNoMoreDataNeeded()
+{
+    no_more_data_needed = true;
+    /// Drop all pending output: consume() drops new chunks as well, so no more sends will happen.
+    current_send_buffer.clear();
+    current_send_position_in_buffer = 0;
+    out = std::make_shared<WriteBufferFromOwnString>(); /// Keep `out` non-null: other methods call out->count() unconditionally.
+}
+
+}
+
+#endif
diff --git a/src/Server/DistributedQuery/StreamingExchangeSink.h b/src/Server/DistributedQuery/StreamingExchangeSink.h
new file mode 100644
index 000000000000..883b8acac403
--- /dev/null
+++ b/src/Server/DistributedQuery/StreamingExchangeSink.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#ifdef OS_LINUX
+
+#include <Common/Logger.h>
+#include <Core/Types.h>
+#include <Processors/ISink.h>
+#include <Processors/Port.h>
+#include <Poco/Net/StreamSocket.h>
+#include <IO/WriteBufferFromString.h>
+#include <Server/DistributedQuery/FutureConnection.h>
+
+namespace DB
+{
+
+/// Sink side of a streaming exchange: serializes incoming chunks into framed packets and writes them to the peer socket in non-blocking mode.
+class StreamingExchangeSink final : public ISink
+{
+public:
+    StreamingExchangeSink(SharedHeader header_, FutureConnectionPtr future_connection_, String stream_name_)
+        : ISink(std::move(header_))
+        , future_connection(std::move(future_connection_))
+        , stream_name(std::move(stream_name_))
+    {
+    }
+
+    ~StreamingExchangeSink() override;
+    String getName() const override { return "StreamingExchangeSink(" + stream_name + ")"; }
+
+    Status prepare() override;
+    std::pair<int, uint32_t> scheduleForEvent() override;
+
+private:
+    void consume(Chunk chunk) override;
+    void onFinish() override;
+    void work() override;
+
+    /// Drain any inbound NoMoreDataNeeded packet or peer half-close. Safe to call at any time.
+    void tryReceiveControlPacket();
+
+    /// Non-blocking read into `buffer[position .. buffer_size]`, advancing `position`.
+    /// Returns true on progress (including EAGAIN), false on peer half-close. Throws on hard errors.
+    bool tryReadFromSocketNonBlocking(char * buffer, size_t buffer_size, size_t & position);
+
+    /// Set `no_more_data_needed` and drop pending output buffers.
+    void markNoMoreDataNeeded();
+
+    /// Send data in current_send_buffer to socket in non-blocking mode.
+    void sendToSocket();
+
+    /// Returns true if the amount of pending (serialized but unsent) data is below the limit, so a new chunk can be added.
+    bool canAddChunk() const;
+
+    /// Move the out buffer to current_send_buffer and reset out. It is only possible if current_send_buffer has been fully sent to the socket.
+    /// Otherwise, need to wait on socket and then call this method again.
+    void tryToSwitchSendBuffer();
+
+    /// Extract socket from future connection
+    void extractSocket();
+
+    FutureConnectionPtr future_connection;
+    std::unique_ptr<Poco::Net::StreamSocket> socket;
+    const String stream_name;
+
+    /// In-memory buffer to which the chunks are serialized.
+    /// Once it becomes big enough we move it to current_send_buffer.
+    std::shared_ptr<WriteBufferFromOwnString> out;
+
+    /// This buffer is being written to socket
+    String current_send_buffer;
+    /// How many bytes were already written to socket
+    size_t current_send_position_in_buffer = 0;
+
+    size_t rows_written = 0;
+    size_t total_bytes_sent = 0;
+
+    static constexpr size_t FLUSH_BUFFER_TO_SOCKET_THRESHOLD = 128 * 1024; /// `out` is promoted to the send buffer once it holds this many bytes.
+    /// Cap on total unsent bytes (`current_send_buffer` + `out`); back-pressure trips here.
+    static constexpr size_t MAX_PENDING_BYTES = 16 * 1024 * 1024;
+    bool input_is_finished = false;     /// We have read all the data from input port.
+    bool final_chunk_added = false;     /// Final empty chunk was added to signal the exchange stream receiver that we are done.
+    bool no_more_data_needed = false;   /// Set to true when exchange stream receiver has sent us NoMoreDataNeeded.
+
+    /// Accumulator for the inbound NoMoreDataNeeded packet (single UInt64, no body).
+    UInt64 incoming_packet_type = 0;
+    size_t incoming_packet_bytes_filled = 0;
+
+    LoggerPtr log = getLogger("StreamingExchangeSink");
+};
+
+}
+
+#endif
diff --git a/src/Server/DistributedQuery/StreamingExchangeSource.cpp b/src/Server/DistributedQuery/StreamingExchangeSource.cpp
new file mode 100644
index 000000000000..73a047caf7b7
--- /dev/null
+++ b/src/Server/DistributedQuery/StreamingExchangeSource.cpp
@@ -0,0 +1,340 @@
+#include <memory>
+#include <Server/DistributedQuery/StreamingExchangeSource.h>
+#include <Server/DistributedQuery/StreamingExchangeProtocol.h>
+#include <Processors/Transforms/AggregatingTransform.h>
+#include <Compression/CompressedReadBuffer.h>
+#include <Formats/NativeReader.h>
+#include <Core/ProtocolDefines.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+#include <IO/WriteBufferFromPocoSocket.h>
+#include <Poco/Net/NetException.h>
+#include <Common/logger_useful.h>
+#include <Common/PODArray.h>
+#include <base/types.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int UNEXPECTED_PACKET_FROM_CLIENT;
+    extern const int PROTOCOL_VERSION_MISMATCH;
+}
+
+StreamingExchangeSource::~StreamingExchangeSource()
+{
+    if (auto * writer = out.get(); writer && !writer->isFinalized()) /// `out` is only created by sendNoMoreDataNeeded()
+        writer->cancel();
+}
+
+void StreamingExchangeSource::onStart()
+{
+    connect(); /// Creates `socket` and connects it to the sink.
+
+    /// The handshake runs synchronously on a blocking socket. Apply per-call
+    /// timeouts so a stalled peer cannot freeze the source startup indefinitely.
+    Poco::Timespan hello_timeout(StreamingExchangeProtocol::HELLO_TIMEOUT_SECONDS, 0);
+    socket->setReceiveTimeout(hello_timeout);
+    socket->setSendTimeout(hello_timeout);
+
+    sendHello();    /// Announce the protocol version, query id and stream name.
+    receiveHello(); /// Validate the sink's reply; throws on an unexpected packet or a version mismatch.
+
+    /// Set the socket to non-blocking mode now that the handshake is finished.
+    socket->setBlocking(false);
+    /// Initialize the packet receive state: the first thing expected is a packet header.
+    packet_receive_state = ReceivingHeader;
+    current_packet_header_bytes_filled = 0;
+}
+
+void StreamingExchangeSource::connect()
+{
+    LOG_TRACE(log, "Connecting to {}:{} for query id {} exchange stream {}", host, port, query_id, stream_name);
+    socket = std::make_unique<Poco::Net::StreamSocket>();
+    Poco::Net::SocketAddress address(host, port);
+    /// Apply a connect timeout so a blackholed or filtered peer cannot stall the worker
+    /// thread for the default kernel connect timeout (minutes) and ignore cancellation.
+    Poco::Timespan connect_timeout(StreamingExchangeProtocol::HELLO_TIMEOUT_SECONDS, 0); /// Same limit as for the handshake.
+    socket->connect(address, connect_timeout);
+    socket->setReceiveBufferSize(10 * 1024 * 1024); /// Request a 10 MiB receive buffer.
+}
+
+void StreamingExchangeSource::sendHello()
+{
+    WriteBufferFromOwnString body; /// The body is serialized first because the header carries its size.
+    StreamingExchangeProtocol::SourceHelloBody source_hello{
+        .source_version = StreamingExchangeProtocol::PROTOCOL_VERSION,
+        .query_id = query_id,
+        .stream_name = stream_name,
+    };
+    source_hello.write(body);
+    body.finalize();
+    const std::string & body_str = body.str();
+
+    StreamingExchangeProtocol::PacketHeader header{
+        .packet_type = StreamingExchangeProtocol::PacketType::SourceHello,
+        .bytes_size = body_str.size(),
+    };
+
+    WriteBufferFromPocoSocket hello_out(*socket); /// Temporary writer used only for the handshake packet.
+    hello_out.write(reinterpret_cast<const char *>(&header), sizeof(header)); /// The header is sent as its raw in-memory bytes.
+    if (!body_str.empty())
+        hello_out.write(body_str.data(), body_str.size());
+    hello_out.finalize();
+}
+
+void StreamingExchangeSource::receiveHello()
+{
+    StreamingExchangeProtocol::PacketHeader header{};
+    size_t position = 0;
+    readFromSocket(reinterpret_cast<char *>(&header), sizeof(header), position); /// Stops early if tryReceive reports no data.
+    if (position != sizeof(header))
+        throw Poco::Net::NetException(fmt::format(
+            "Failed to receive SinkHello header from socket for exchange stream {}, expected {} bytes but received {}",
+            stream_name, sizeof(header), position));
+
+    if (header.packet_type != StreamingExchangeProtocol::PacketType::SinkHello)
+        throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT,
+            "Unexpected packet type 0x{:x} (expected SinkHello 0x{:x}) for exchange stream {}",
+            header.packet_type, static_cast<UInt64>(StreamingExchangeProtocol::PacketType::SinkHello), stream_name);
+
+    if (header.bytes_size > StreamingExchangeProtocol::MAX_HELLO_BODY_BYTES) /// Bound the allocation below by a peer-controlled size.
+        throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT,
+            "SinkHello body size {} exceeds the limit {} for exchange stream {}",
+            header.bytes_size, StreamingExchangeProtocol::MAX_HELLO_BODY_BYTES, stream_name);
+
+    if (header.bytes_size < sizeof(UInt64)) /// The body must at least hold the UInt64 version field.
+        throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT,
+            "SinkHello body size {} is too small to contain the protocol version for exchange stream {}",
+            header.bytes_size, stream_name);
+
+    PODArray<char> body_buffer(header.bytes_size);
+    size_t body_position = 0;
+    readFromSocket(body_buffer.data(), body_buffer.size(), body_position);
+    if (body_position != body_buffer.size())
+        throw Poco::Net::NetException(fmt::format(
+            "Failed to receive SinkHello body from socket for exchange stream {}, expected {} bytes but received {}",
+            stream_name, body_buffer.size(), body_position));
+
+    ReadBufferFromMemory body_in(body_buffer.data(), body_buffer.size()); /// Parse the body from memory, not from the socket.
+    StreamingExchangeProtocol::SinkHelloBody sink_hello;
+    sink_hello.read(body_in);
+
+    if (sink_hello.sink_version != StreamingExchangeProtocol::PROTOCOL_VERSION)
+        throw Exception(ErrorCodes::PROTOCOL_VERSION_MISMATCH,
+            "Streaming exchange protocol version mismatch for stream {}: this node speaks version {}, sink at {}:{} speaks version {}",
+            stream_name, StreamingExchangeProtocol::PROTOCOL_VERSION, host, port, sink_hello.sink_version);
+}
+
+IProcessor::Status StreamingExchangeSource::prepare()
+{
+    LOG_TEST(log, "Prepare exchange source {}", stream_name);
+
+    if (finished_reading) /// Set after the final packet was parsed or after NoMoreDataNeeded was sent.
+    {
+        output.finish();
+        return Status::Finished;
+    }
+
+    /// Check whether the output still accepts data.
+    if (output.isFinished())
+    {
+        output_finished = true;
+        /// Return Status::Ready because we still need to send NoMoreDataNeeded packet to sink before closing the socket to let it know that this is not a disconnect.
+        return Status::Ready;
+    }
+
+    if (!output.canPush())
+        return Status::PortFull;
+
+    if (has_input)
+    {
+        output.pushData(std::move(current_chunk));
+        has_input = false;
+        return Status::PortFull;
+    }
+
+    /// TODO: handle cancelled state?
+
+    /// The first work() call performs the connect and the handshake (see tryGenerate).
+    if (!was_on_start_called)
+        return Status::Ready;
+
+    if (packet_in) /// A complete packet is buffered and can be parsed without touching the socket.
+        return Status::Ready;
+
+    return Status::Async;
+}
+
+int StreamingExchangeSource::schedule()
+{
+    const int fd = socket->sockfd(); /// Descriptor of the exchange socket, returned for asynchronous waiting
+    LOG_TEST(log, "Schedule exchange stream {}, fd: {}", stream_name, fd);
+    return fd;
+}
+
+void StreamingExchangeSource::sendNoMoreDataNeeded()
+{
+    if (!out) /// The socket writer is created lazily on first use.
+        out = std::make_unique<WriteBufferFromPocoSocket>(*socket);
+    writeIntBinary(StreamingExchangeProtocol::PacketType::NoMoreDataNeeded, *out); /// The control packet is the packet type alone, no body.
+    out->next(); /// Push the buffered bytes to the socket right away.
+}
+
+void StreamingExchangeSource::readFromSocket(char * buffer, size_t buffer_size, size_t & position)
+{
+    /// Fill `buffer` starting at `position` until it is complete or the socket has no more data for now.
+    /// `position` and `bytes_read` advance by exactly the number of bytes received.
+    while (position < buffer_size)
+    {
+        const size_t missing = buffer_size - position;
+        const ssize_t got = StreamingExchangeProtocol::tryReceive(*socket, buffer + position, missing, stream_name);
+        if (got == 0)
+            return; /// Socket is not ready for reading, wait for epoll event.
+
+        LOG_TEST(log, "Received {} bytes from exchange stream {}, fd: {}", got, stream_name, socket->sockfd());
+
+        position += got;
+        bytes_read += got;
+    }
+}
+
+void StreamingExchangeSource::tryReadHeader()
+{
+    /// Read the remaining bytes of the header; a partially received header is continued on the next call.
+    readFromSocket(reinterpret_cast<char*>(&current_packet_header) , sizeof(current_packet_header), current_packet_header_bytes_filled);
+    if (current_packet_header_bytes_filled == sizeof(current_packet_header))
+    {
+        if (current_packet_header.packet_type != StreamingExchangeProtocol::PacketType::Data) /// Only Data packets are expected after the handshake.
+            throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT, "Unexpected packet type {}", current_packet_header.packet_type);
+
+        if (current_packet_header.bytes_size > StreamingExchangeProtocol::MAX_DATA_PACKET_BODY_BYTES) /// Bound the resize below by a peer-controlled size.
+            throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT,
+                "Data packet body size {} exceeds limit {} on exchange stream {}",
+                current_packet_header.bytes_size, StreamingExchangeProtocol::MAX_DATA_PACKET_BODY_BYTES, stream_name);
+
+        current_packet_body.resize(current_packet_header.bytes_size);
+        current_packet_body_bytes_filled = 0;
+        packet_receive_state = ReceivingBody; /// The header is complete: the body comes next.
+
+        LOG_TEST(log, "Expecting packet with {} bytes from exchange stream {}, fd: {}", current_packet_header.bytes_size, stream_name, socket->sockfd());
+    }
+}
+
+void StreamingExchangeSource::tryReadBody()
+{
+    /// Keep filling the body buffer; nothing more to do while the packet is still incomplete.
+    readFromSocket(current_packet_body.data() , current_packet_body.size(), current_packet_body_bytes_filled);
+    if (current_packet_body_bytes_filled != current_packet_body.size())
+        return;
+    /// The whole body is in memory: expect a header next and expose the body for parsing.
+    packet_receive_state = ReceivingHeader;
+    current_packet_header_bytes_filled = 0;
+    packet_in = std::make_unique<ReadBufferFromMemory>(current_packet_body.data(), current_packet_body.size());
+}
+
+std::optional<Chunk> StreamingExchangeSource::tryGenerate()
+{
+    if (!was_on_start_called)
+    {
+        was_on_start_called = true;
+        onStart();
+        return Chunk(); /// Empty chunk means we need to be called again
+    }
+
+    if (output_finished)
+    {
+        LOG_TRACE(log, "NoMoreDataNeeded from exchange stream {}, total rows: {}, bytes: {}", stream_name, rows_read, bytes_read);
+        sendNoMoreDataNeeded();
+        finished_reading = true;
+        return {};
+    }
+
+    LOG_TEST(log, "Reading from exchange stream {}", stream_name);
+
+    if (packet_receive_state == ReceivingHeader)
+        tryReadHeader();
+
+    if (packet_receive_state == ReceivingBody)
+        tryReadBody();
+
+    /// If a whole packet has been read, we can parse it.
+    if (!packet_in)
+        return Chunk(); /// Empty chunk means that we currently have no data but we have not finished yet.
+
+    UInt64 flags = 0;
+    readVarUInt(flags, *packet_in);
+    const bool final_chunk = (flags & 1);
+    const bool has_aggregated_chunk_info = (flags & 2);
+    UInt64 num_rows = 0;
+    readVarUInt(num_rows, *packet_in);
+    UInt64 num_columns = 0;
+    readVarUInt(num_columns, *packet_in);
+    UInt64 chunk_num = 0;
+    if (has_aggregated_chunk_info)
+        readVarUInt(chunk_num, *packet_in);
+
+    /// The final packet is the empty end-of-stream marker. A final packet carrying rows would have
+    /// them dropped once finished_reading is set, so reject it as a protocol violation.
+    if (final_chunk && num_rows != 0)
+        throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT,
+            "Final data packet on exchange stream {} carries {} rows; it must be empty", stream_name, num_rows);
+
+    /// A data packet must carry exactly the header's columns, or values would be dropped while the
+    /// row count is kept. A header-less stream (e.g. SELECT count()) sends rows with zero columns.
+    const size_t expected_columns = output.getHeader().columns();
+    if (num_rows != 0 && num_columns != expected_columns)
+        throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT,
+            "Data packet on exchange stream {} carries {} rows with {} columns, but the stream header has {} columns",
+            stream_name, num_rows, num_columns, expected_columns);
+
+    std::optional<Chunk> result;
+    if (num_columns != 0)
+    {
+        auto compressed_buf = std::make_unique<CompressedReadBuffer>(*packet_in);
+        auto reader = std::make_unique<NativeReader>(*compressed_buf, output.getHeader(), DBMS_TCP_PROTOCOL_VERSION);
+        Block block = reader->read();
+
+        /// The decoded block must match the row count announced in the framing; reject a truncated or malformed body here as a protocol error.
+        if (block.rows() != num_rows)
+            throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT, "Data packet on exchange stream {} announces {} rows, but the decoded block has {} rows", stream_name, num_rows, block.rows());
+        result = Chunk(block.getColumns(), num_rows);
+        if (has_aggregated_chunk_info)
+        {
+            auto info = std::make_shared<AggregatedChunkInfo>();
+            info->bucket_num = block.info.bucket_num;
+            info->is_overflows = block.info.is_overflows;
+            info->out_of_order_buckets = block.info.out_of_order_buckets;
+            info->chunk_num = chunk_num;
+            result->getChunkInfos().add(std::move(info));
+        }
+        rows_read += num_rows;
+        LOG_TEST(log, "Received chunk with {} rows and {} columns from exchange stream {}", num_rows, num_columns, stream_name);
+    }
+    else if (num_rows == 0)
+    {
+        LOG_TEST(log, "Received empty chunk from exchange stream {}", stream_name);
+        result = Chunk(output.getHeader().cloneEmptyColumns(), 0);
+    }
+    else
+    {
+        LOG_TEST(log, "Received chunk with {} rows and no columns from exchange stream {}", num_rows, stream_name);
+        result = Chunk(Columns{}, num_rows);
+    }
+
+    if (final_chunk)
+    {
+        finished_reading = true;
+        LOG_TRACE(log, "Finished reading from exchange stream {}, total rows: {}, bytes: {}", stream_name, rows_read, bytes_read);
+    }
+
+    packet_in.reset();
+    packet_receive_state = ReceivingHeader;
+    current_packet_body.clear();
+    current_packet_body_bytes_filled = 0;
+
+    return result;
+}
+
+}
diff --git a/src/Server/DistributedQuery/StreamingExchangeSource.h b/src/Server/DistributedQuery/StreamingExchangeSource.h
new file mode 100644
index 000000000000..239090570ef9
--- /dev/null
+++ b/src/Server/DistributedQuery/StreamingExchangeSource.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include <memory>
+#include <Common/Logger.h>
+#include <IO/ReadBufferFromPocoSocket.h>
+#include <Processors/ISource.h>
+#include <Server/DistributedQuery/StreamingExchangeProtocol.h>
+#include <Poco/Net/StreamSocket.h>
+#include <IO/ReadBufferFromMemory.h>
+#include <IO/WriteBufferFromPocoSocket.h>
+
+namespace DB
+{
+
+class StreamingExchangeSource final : public ISource    /// Reads the chunks of a single exchange stream from a socket.
+{
+public:
+    explicit StreamingExchangeSource(SharedHeader header_, String query_id_, String stream_name_, String host_, UInt16 port_)
+        : ISource(std::move(header_))
+        , host(std::move(host_))
+        , port(port_)
+        , query_id(std::move(query_id_))
+        , stream_name(std::move(stream_name_))
+    {
+    }
+
+    ~StreamingExchangeSource() override;
+
+    String getName() const override { return "StreamingExchangeSource(" + stream_name + ")"; }  /// Includes the stream name.
+
+    Status prepare() override;
+    int schedule() override;
+
+private:
+    void onStart();
+    void connect();
+    void sendHello();
+    void receiveHello();
+
+    /// Read as many bytes as possible from the socket without blocking, advancing `position` by the amount read.
+    void readFromSocket(char * buffer, size_t buffer_size, size_t & position);
+
+    /// Keep reading the packet header until it is complete; only then is the full size known and body reading can start.
+    void tryReadHeader();
+    /// Keep reading the packet body until it is complete.
+    void tryReadBody();
+
+    /// Read whatever data is available on the socket and deserialize a chunk once enough data has been received.
+    std::optional<Chunk> tryGenerate() override;
+
+    /// Tell the sender that no more data is needed from it.
+    void sendNoMoreDataNeeded();
+
+    const String host;          /// NOTE(review): presumably the address of the sending side - confirm in connect().
+    const UInt16 port;
+    const String query_id;
+    const String stream_name;   /// Used in getName() and in log messages.
+
+    bool finished_reading = false;  /// All data has been read from socket.
+    bool output_finished = false;   /// Output port is finished, do not need to receive more data.
+    bool was_on_start_called = false;
+
+    enum PacketReceiveState     /// Which part of the current packet is being received.
+    {
+        ReceivingHeader,
+        ReceivingBody,
+    } packet_receive_state = ReceivingHeader;
+
+    StreamingExchangeProtocol::PacketHeader current_packet_header{};
+    size_t current_packet_header_bytes_filled = 0;
+
+    std::vector<char> current_packet_body;
+    size_t current_packet_body_bytes_filled = 0;
+
+    std::unique_ptr<Poco::Net::StreamSocket> socket;
+    std::unique_ptr<ReadBufferFromMemory> packet_in;    /// One full packet
+    std::unique_ptr<WriteBufferFromPocoSocket> out;
+    size_t rows_read = 0;       /// Total rows received so far; reported in the final trace message.
+    size_t bytes_read = 0;      /// Reported next to `rows_read` in the final trace message.
+    LoggerPtr log = getLogger("StreamingExchangeSource");
+};
+
+}
diff --git a/src/Server/DistributedQuery/tests/gtest_distributed_query.cpp b/src/Server/DistributedQuery/tests/gtest_distributed_query.cpp
new file mode 100644
index 000000000000..7f4071dc14c7
--- /dev/null
+++ b/src/Server/DistributedQuery/tests/gtest_distributed_query.cpp
@@ -0,0 +1,573 @@
+#include <cstddef>
+#include <memory>
+#include <boost/core/noncopyable.hpp>
+#include <gtest/gtest.h>
+#include <Common/CurrentThread.h>
+#include <Common/ThreadStatus.h>
+#include <Common/ThreadPool.h>
+#include <Common/tests/gtest_global_register.h>
+
+#include <Poco/ConsoleChannel.h>
+#include <Poco/FormattingChannel.h>
+#include <Poco/Logger.h>
+#include <Poco/AutoPtr.h>
+#include <Poco/PatternFormatter.h>
+#include <Common/tests/gtest_global_context.h>
+
+#include <Compression/CompressedWriteBuffer.h>
+#include <Core/Block.h>
+#include <Core/Field.h>
+#include <Core/Names.h>
+#include <Core/ProtocolDefines.h>
+#include <Core/UUID.h>
+#include <Disks/IStoragePolicy.h>
+#include <Disks/DiskObjectStorage/ObjectStorages/Local/LocalObjectStorage.h>
+#include <Disks/DiskObjectStorage/ObjectStorages/ObjectStorageFactory.h>
+#include <Disks/registerDisks.h>
+#include <Formats/NativeWriter.h>
+#include <IO/ReadBufferFromFile.h>
+#include <IO/ReadBufferFromString.h>
+#include <IO/WriteBufferFromFile.h>
+#include <IO/WriteHelpers.h>
+#include <Interpreters/JoinOperator.h>
+#include <Interpreters/Session.h>
+#include <Processors/Chunk.h>
+#include <Processors/Executors/PushingPipelineExecutor.h>
+#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
+#include <Processors/Formats/Impl/TabSeparatedRowOutputFormat.h>
+#include <Processors/IProcessor.h>
+#include <Processors/ISink.h>
+#include <Processors/Sinks/NativeCompressedSink.h>
+#include <Processors/Sources/NativeCompressedSource.h>
+#include <Processors/QueryPlan/JoinStepLogical.h>
+#include <Processors/QueryPlan/ISourceStep.h>
+#include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h>
+#include <Processors/QueryPlan/QueryPlanStepRegistry.h>
+#include <Processors/QueryPlan/Serialization.h>
+#include <QueryPipeline/QueryPipelineBuilder.h>
+#include <QueryPipeline/DistributedPlanExecutor.h>
+#include <base/defines.h>
+
+#include <Processors/Executors/PipelineExecutor.h>
+#include <Processors/Executors/PullingPipelineExecutor.h>
+#include <Processors/Sources/SourceFromChunks.h>
+#include <Processors/Sources/SourceFromSingleChunk.h>
+#include <Processors/QueryPlan/ShuffleExchangeStep.h>
+#include <Processors/QueryPlan/GatherExchangeStep.h>
+#include <Processors/QueryPlan/QueryPlan.h>
+#include <Processors/QueryPlan/IQueryPlanStep.h>
+#include <Processors/QueryPlan/IParameterLookup.h>
+
+#include <Processors/Executors/CompletedPipelineExecutor.h>
+
+#include <QueryPipeline/QueryPipeline.h>
+
+#include <Common/Config/ConfigProcessor.h>
+#include <Common/Config/ConfigHelper.h>
+#include <Core/ServerUUID.h>
+
+
+namespace DB
+{
+
+/// Test-only source step: produces the blocks stored in a file written in compressed Native format.
+class ReadFromFileStep : public ISourceStep
+{
+public:
+    ReadFromFileStep(SharedHeader header_, const String & file_name_)
+        : ISourceStep(std::move(header_)), file_name(file_name_)
+    {
+    }
+
+    String getName() const override { return "ReadFromFile"; }
+
+    void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings & /*settings*/) override
+    {
+        auto file_in = std::make_unique<ReadBufferFromFile>(file_name);
+        auto source = std::make_shared<NativeCompressedSource>(output_header, std::move(file_in), file_name);
+        pipeline.init(Pipe(std::move(source)));
+    }
+
+    /// Only the file name is written; `deserialize` below reads it back.
+    void serialize(Serialization & ctx) const override { writeStringBinary(file_name, ctx.out); }
+
+    /// Rebuilds the step from the serialized file name and the output header kept in the deserialization context.
+    static std::unique_ptr<IQueryPlanStep> deserialize(Deserialization & ctx)
+    {
+        String stored_file_name;
+        readStringBinary(stored_file_name, ctx.in);
+        return std::make_unique<ReadFromFileStep>(ctx.output_header, stored_file_name);
+    }
+
+private:
+    const String file_name;
+};
+
+static void registerReadFromFileStep(QueryPlanStepRegistry & registry)
+{
+    registry.registerStep("ReadFromFile", &ReadFromFileStep::deserialize);   /// Same name as returned by ReadFromFileStep::getName().
+}
+
+
+/// Test sink: writes every chunk it consumes to stdout in TabSeparated format.
+class PrintTSVSink : public ISink
+{
+public:
+    explicit PrintTSVSink(SharedHeader header_)
+        : ISink(std::move(header_))
+        , out("/dev/stdout")
+        , output_format(std::make_shared<TabSeparatedRowOutputFormat>(out, input.getSharedHeader(), false, false, false, DB::FormatSettings{}))
+    {}
+
+    String getName() const override { return "PrintTSVSink"; }
+
+protected:
+    /// The output format writes blocks, so the chunk's columns are put back into the input header first.
+    void consume(Chunk chunk) override
+    {
+        output_format->write(input.getHeader().cloneWithColumns(chunk.getColumns()));
+    }
+
+    void onFinish() override
+    {
+        output_format->finalize();
+        out.finalize();
+    }
+
+private:
+    WriteBufferFromFile out;            /// The format below writes into this buffer.
+    OutputFormatPtr output_format;
+};
+
+/// Terminal step of the test plan: prints the rows of its single input to stdout through `PrintTSVSink`.
+class PrintTSVStep : public IQueryPlanStep
+{
+public:
+    explicit PrintTSVStep(SharedHeader input_header_)
+    {
+        updateInputHeaders({input_header_});
+    }
+
+    String getName() const override { return "PrintTSV"; }
+
+    bool hasOutputStream() const { return false; }  /// NOTE(review): not marked `override` - confirm the base class still declares it.
+
+    QueryPipelineBuilderPtr updatePipeline(QueryPipelineBuilders pipelines, const BuildQueryPipelineSettings & /*settings*/) override
+    {
+        /// The step is constructed with exactly one input header, so only the first pipeline is used.
+        auto & pipeline = *pipelines.front();
+
+        /// Single sink to print to stdout
+        pipeline.resize(1);
+
+        pipeline.setSinks([&](const SharedHeader & header, Pipe::StreamType stream_type) -> ProcessorPtr
+        {
+            chassert(stream_type == Pipe::StreamType::Main);
+            return std::make_shared<PrintTSVSink>(header);
+        });
+
+        return std::move(pipelines.front());
+    }
+
+    void serialize(Serialization & /*ctx*/) const override {}   /// The step has no data members, so nothing is written.
+
+    static std::unique_ptr<IQueryPlanStep> deserialize(Deserialization & ctx)
+    {
+        return std::make_unique<PrintTSVStep>(ctx.input_headers.front());
+    }
+
+private:
+    void updateOutputHeader() override {}
+};
+
+static void registerPrintTSVStep(QueryPlanStepRegistry & registry)
+{
+    registry.registerStep("PrintTSV", &PrintTSVStep::deserialize);   /// Same name as returned by PrintTSVStep::getName().
+}
+
+/// Test helper: parses the TSV text in `data`, writes its rows to `file_name` via NativeCompressedSink and returns the header.
+static SharedHeader prepareSourceFileInNativeFormat(const String & file_name, const String & data, size_t input_replicate_count)
+{
+    ReadBufferFromString read_buffer(data);     /// Shared by the schema reader and the row reader below.
+
+    Block header;
+
+    {
+        TabSeparatedSchemaReader schema_reader(read_buffer, true, true, false, FormatSettings{});
+        auto schema = schema_reader.readSchema();
+
+        for (const auto & [name, type] : schema)
+            header.insert({type->createColumn(), type, name});     /// One column created from the type per (name, type) pair.
+    }
+
+    auto shared_header = std::make_shared<const Block>(std::move(header));
+
+    {   /// Scope bounds the lifetime of the file buffer, the sink and the row reader.
+        auto file_buffer = std::make_unique<WriteBufferFromFile>(file_name);
+        NativeCompressedSink sink(shared_header, *file_buffer, file_name);
+
+        auto reader = std::make_shared<TabSeparatedRowInputFormat>(
+            shared_header, read_buffer, IRowInputFormat::Params{}, true, true, false, FormatSettings{});
+
+        while (auto chunk = reader->read())
+        {
+            Block block = shared_header->cloneWithColumns(chunk.getColumns());
+
+            /// Write every block `input_replicate_count` times to make the file bigger.
+            for (size_t i = 0; i < input_replicate_count; ++i)
+                sink.consume(Chunk(block.getColumns(), block.rows()));
+        }
+        sink.onFinish();
+    }
+
+    return shared_header;
+}
+
+static QueryPlanStepPtr createSourceStepFromFileInNativeFormat(SharedHeader header, const String & file_name)
+{
+    /// The ReadFromFileStep pointer converts to the generic step pointer on return.
+    return std::make_unique<ReadFromFileStep>(std::move(header), file_name);
+}
+
+/// Builds the test plan: ReadFromFile(A) and ReadFromFile(B) -> inner JoinStepLogical on (c1, c2) -> PrintTSV.
+static QueryPlan createHashJoinQueryPlan(const String & data_a, const String & data_b)
+{
+    const String file_name_a = "/tmp/file_a";
+    const String file_name_b = "/tmp/file_b";
+//    const size_t num_shards_a = 3;
+//    const size_t num_shards_b = 2;
+    const size_t replicate_input_count = 2; // Each input block is written to its file this many times
+
+    SharedHeader header_a = prepareSourceFileInNativeFormat(file_name_a, data_a, replicate_input_count);
+    SharedHeader header_b = prepareSourceFileInNativeFormat(file_name_b, data_b, replicate_input_count);
+
+    /// Create source for table A
+    QueryPlan left_plan;
+    {
+        left_plan.addStep(createSourceStepFromFileInNativeFormat(header_a, file_name_a));
+    }
+
+    /// Create source for table B
+    QueryPlan right_plan;
+    {
+        right_plan.addStep(createSourceStepFromFileInNativeFormat(header_b, file_name_b));
+    }
+
+    /// Create join step
+    QueryPlan query_plan;
+    {
+        auto remove_column_pointers = [](const ColumnsWithTypeAndName & header) -> ColumnsWithTypeAndName
+        {
+            ColumnsWithTypeAndName result = header;
+            for (auto & element : result)
+                element.column = nullptr;   /// Only the column pointer is cleared; name and type are kept.
+            return result;
+        };
+
+        JoinExpressionActions join_expression_actions(
+            remove_column_pointers(header_a->getColumnsWithTypeAndName()),
+            remove_column_pointers(header_b->getColumnsWithTypeAndName()));
+
+        JoinOperator join_info(JoinKind::Inner);
+        /// Construct condition "t1.c1 == t2.c1 AND t1.c2 == t2.c2"
+        {
+            auto actions_dag = join_expression_actions.getActionsDAG();
+            actions_dag->getOutputs() = actions_dag->getInputs();   /// Lets the input columns be looked up with tryFindInOutputs below.
+
+            join_info.expression.push_back(JoinActionRef::transform({
+                JoinActionRef(actions_dag->tryFindInOutputs("t1.c1"), join_expression_actions),
+                JoinActionRef(actions_dag->tryFindInOutputs("t2.c1"), join_expression_actions),
+            }, JoinActionRef::AddFunction(JoinConditionOperator::Equals)));
+            join_info.expression.push_back(JoinActionRef::transform({
+                JoinActionRef(actions_dag->tryFindInOutputs("t1.c2"), join_expression_actions),
+                JoinActionRef(actions_dag->tryFindInOutputs("t2.c2"), join_expression_actions),
+            }, JoinActionRef::AddFunction(JoinConditionOperator::Equals)));
+            /// NOTE(review): the tryFindInOutputs results are used unchecked - presumably the columns always exist here; confirm.
+        }
+
+        NameSet required_output_columns = {"t1.c1", "t1.c2", "t1.va", "t2.vb"};
+        ContextPtr query_context = getContext().context;
+
+        auto join_settings = JoinSettings(query_context->getSettingsRef());
+        join_settings.enable_lazy_columns_replication = false;
+
+        auto join_step = std::make_unique<JoinStepLogical>(
+            header_a,
+            header_b,
+            std::move(join_info),
+            std::move(join_expression_actions),
+            std::move(required_output_columns),
+            std::unordered_map<String, const ActionsDAG::Node *>{},
+            false,
+            join_settings,
+            SortingStep::Settings(query_context->getSettingsRef()));
+
+        join_step->setStepDescription("Join");
+
+        std::vector<QueryPlanPtr> plans;
+        plans.emplace_back(std::make_unique<QueryPlan>(std::move(left_plan)));
+        plans.emplace_back(std::make_unique<QueryPlan>(std::move(right_plan)));
+
+        query_plan.unitePlans(std::move(join_step), {std::move(plans)});
+    }
+
+    /// Create sink
+    query_plan.addStep(std::make_unique<PrintTSVStep>(query_plan.getCurrentHeader()));
+
+    return query_plan;
+}
+
+struct DistributedQueryPlanSettings     /// NOTE(review): not referenced in the visible part of this test file - confirm it is still needed.
+{
+    size_t num_buckets_for_shuffle = 5;
+};
+
+String data_a =     /// Left join input as TSV text: first row - column names, second row - column types, then data rows.
+            "t1.c1\tt1.c2\tt1.va\n"
+            "String\tUInt64\tString\n"
+            "a\t1\t1ab\n"
+            "g\t2\t2ba\n"
+            "c\t1\t3abc\n"
+            "a\t1\t4bad\n"
+            "f\t1\t5abe\n"
+            "a\t1\t6baf\n"
+            "d\t1\t3abc\n"
+            "e\t1\t4bad\n"
+            "f\t1\t5abe\n"
+            "a\t1\t6baf\n"
+            "b\t2\t7bb\n"
+            "g\t1\t71bb\n"
+            "b\t2\t72bb\n"
+            "b\t2\t73bb\n"
+            "a\t2\t8bb\n"
+            "c\t3\t9cc\n";
+
+String data_b =     /// Right join input as TSV text: first row - column names, second row - column types, then data rows.
+            "t2.c1\tt2.c2\tt2.vb\n"
+            "String\tUInt64\tString\n"
+            "a\t2\t1baaa\n"
+            "c\t3\t2bddd\n"
+            "b\t2\t31bbbb\n"
+            "g\t2\t32bbbb\n"
+            "a\t1\t4baaa\n"
+            "c\t1\t5baab\n"
+            "a\t1\t6baac\n"
+            "d\t1\t5baab\n"
+            "e\t1\t6baac\n"
+            "f\t1\t5baab\n"
+            "g\t1\t6baac\n"
+            "a\t1\t7baad\n"
+            "c\t3\t8bccc\n";
+
+void registerObjectStorages();
+
+} // namespace DB
+
+using namespace DB;
+
+
+namespace
+{
+
+std::string getConfig()     /// Writes the XML config used by the test to a file and returns the path of that file.
+{
+    const std::string config_text = R"(
+<clickhouse>
+    <logger>
+        <level>trace</level>
+        <console>true</console>
+    </logger>
+
+    <distributed_query>
+        <temporary_files_storage>
+            <type>local</type>
+            <path>./local_object_storage/</path>
+            <endpoint_subpath>distributed_query_temp_files/</endpoint_subpath>
+        </temporary_files_storage>
+    </distributed_query>
+
+</clickhouse>
+)";
+    const std::string config_path = "./config_file_for_test.xml";   /// Relative path: resolved against the current working directory.
+    DB::WriteBufferFromFile config_file(config_path);
+    DB::writeText(config_text, config_file);
+    config_file.finalize();
+    return config_path;
+}
+
+}
+
+void registerPlanSteps();
+
+class DistributedQueryTest : public ::testing::Test    /// Fixture that prepares logging, server config and registries for the test.
+{
+public:
+    DistributedQueryTest()
+    {
+        previous_thread_status = current_thread;    /// Remembered so that the destructor can restore it.
+        current_thread = nullptr;
+    }
+
+    ~DistributedQueryTest() override
+    {
+        current_thread = previous_thread_status;
+    }
+
+    void SetUp() override
+    {
+        Poco::AutoPtr<Poco::ConsoleChannel> console_channel(new Poco::ConsoleChannel(std::cerr));
+        Poco::AutoPtr<Poco::PatternFormatter> formatter(new Poco::PatternFormatter("%Y-%m-%d %H:%M:%S.%i [ %I ] %T <%p> (%s) %t"));
+        Poco::AutoPtr<Poco::FormattingChannel> channel(new Poco::FormattingChannel(formatter, console_channel));
+        Poco::Logger::root().setChannel(channel);
+        if (const char * test_log_level = std::getenv("TEST_LOG_LEVEL")) // NOLINT(concurrency-mt-unsafe)
+            Poco::Logger::root().setLevel(test_log_level);
+        else
+            Poco::Logger::root().setLevel("none");  /// Level used when TEST_LOG_LEVEL is not set.
+
+        DB::ServerUUID::setRandomForUnitTests();
+
+        namespace fs = std::filesystem;
+        if (fs::exists("./config_file_for_test.xml"))
+            fs::remove_all("./config_file_for_test.xml");   /// Drop a config file that already exists at this path.
+
+        auto config_path = getConfig();     /// Writes the config file anew and returns its path.
+        DB::ConfigProcessor config_processor(config_path, true, true);
+        config = config_processor.loadConfig(false);
+        context_holder.context->setConfig(config.configuration);
+
+        tryRegisterFunctions();
+        registerObjectStorages();
+        registerPlanSteps();
+
+        GlobalThreadPool::instance();   /// NOTE(review): result unused - presumably called only to make sure the global pool exists; confirm.
+    }
+
+    void TearDown() override
+    {
+        DB::clearDiskRegistry();
+    }
+
+private:
+    const ContextHolder & context_holder = getContext();
+    DB::ConfigProcessor::LoadedConfig config;
+    ThreadStatus* previous_thread_status = nullptr;     /// Value of `current_thread` at fixture construction.
+};
+
+namespace DB
+{
+
+void registerShuffleSendStep(QueryPlanStepRegistry & registry);
+void registerShuffleReceiveStep(QueryPlanStepRegistry & registry);
+void registerGatherSendStep(QueryPlanStepRegistry & registry);
+void registerGatherReceiveStep(QueryPlanStepRegistry & registry);
+void registerJoinStep(QueryPlanStepRegistry & registry);
+void registerFilterStep(QueryPlanStepRegistry & registry);
+void registerBuildRuntimeFilterStep(QueryPlanStepRegistry & registry);
+
+}
+
+void registerPlanSteps()    /// Registers the test-local steps together with the exchange, join and filter steps.
+{
+    auto & step_registry = QueryPlanStepRegistry::instance();
+
+    registerReadFromFileStep(step_registry);
+    registerShuffleSendStep(step_registry);
+    registerShuffleReceiveStep(step_registry);
+    registerJoinStep(step_registry);
+    registerGatherSendStep(step_registry);
+    registerGatherReceiveStep(step_registry);
+    registerPrintTSVStep(step_registry);
+    registerFilterStep(step_registry);
+    registerBuildRuntimeFilterStep(step_registry);
+}
+
+
+namespace DB
+{
+namespace QueryPlanOptimizations
+{
+
+DistributedQueryPlan makeDistributedPlan(QueryPlan::Nodes nodes, QueryPlan::Node * root, const QueryPlanOptimizationSettings & optimization_settings);
+
+}
+}
+
+static void executeTestWithExchangeKind(const String & exchangeKind)   /// Builds, distributes and runs the join plan with the given forced exchange kind.
+try
+{
+    DistributedQueryPlan distributed_query_plan;
+
+    const char * env_val = std::getenv("DISTRIBUTED_PLAN_SINGLE_STAGE"); // NOLINT(concurrency-mt-unsafe)
+    bool distributed_plan_single_stage = env_val && std::string(env_val) != "0";    /// True when the variable is set and is not "0".
+
+    ThreadStatus thread_status;
+    auto session = std::make_unique<Session>(getContext().context, ClientInfo::Interface::TCP_INTERSERVER);
+    auto query_context = session->makeQueryContext();
+    auto thread_group = ThreadGroup::createForQuery(query_context);
+    CurrentThread::attachToGroup(thread_group);
+
+    query_context->setSetting("distributed_plan_force_exchange_kind", exchangeKind);
+
+    {
+        /// Create JOIN query plan
+        auto query_plan = createHashJoinQueryPlan(data_a, data_b);
+
+        /// Optimize query plan for distributed execution
+        QueryPlanOptimizationSettings optimization_settings(query_context);
+        optimization_settings.make_distributed_plan = true;
+        optimization_settings.distributed_plan_default_shuffle_join_bucket_count = 4;
+        optimization_settings.distributed_plan_single_stage = distributed_plan_single_stage;  /// For debugging
+        query_plan.optimize(optimization_settings);
+
+        auto * root = query_plan.getRootNode();     /// Taken before the plan is moved from on the next line.
+        auto plan_internals = QueryPlan::detachNodesAndResources(std::move(query_plan));
+
+        distributed_query_plan = QueryPlanOptimizations::makeDistributedPlan(std::move(plan_internals.first), root, optimization_settings);
+    }
+
+    Strings all_temporary_files_for_cleanup;    /// String ids of the output streams of all Persisted exchanges.
+    for (const auto & stage : distributed_query_plan.stages)
+    {
+        for (const auto & task : stage.second.tasks)
+        {
+            for (const auto & stream_id : task.output_exchange_streams)
+            {
+                if (distributed_query_plan.exchange_descriptions.at(stream_id.exchange_id).kind == ExchangeDescription::Kind::Persisted)
+                    all_temporary_files_for_cleanup.push_back(stream_id.toString());
+            }
+        }
+    }
+
+    const UUID query_uuid = UUIDHelpers::generateV4();
+    auto [object_storage, path] = getObjectStorageForTemporaryFiles(toString(query_uuid), query_context);
+    auto cleanup = makeTemporaryFilesCleaner(object_storage, path, all_temporary_files_for_cleanup);
+
+    query_context->setSetting("distributed_plan_execute_locally", 1);
+    auto cancellation_flag = std::make_shared<std::atomic<bool>>(false);
+
+    /// Just execute the distributed query plan without checking the result
+    auto executor = createDistributedQueryExecutor(query_uuid, distributed_query_plan, nullptr, query_context, cancellation_flag);
+
+    try
+    {
+        executor->start();
+        while (!executor->execute());   /// Repeat until execute() returns true.
+        executor->cleanup();
+    }
+    catch (...)
+    {
+        executor->cleanup();    /// Clean up on failure as well, then rethrow.
+        throw;
+    }
+}
+catch (Exception & e)
+{
+    std::cout << e.getStackTraceString() << std::endl;  /// Print the stack trace before the exception propagates to the caller.
+    throw;
+}
+
+TEST_F(DistributedQueryTest, ShuffleHashJoin)
+{
+    for (const char * exchange_kind : {"Persisted", "Streaming"})   /// Same plan with each exchange kind, in this order.
+        executeTestWithExchangeKind(exchange_kind);
+}
diff --git a/src/Server/DistributedQuery/tests/gtest_exchange_server_handshake.cpp b/src/Server/DistributedQuery/tests/gtest_exchange_server_handshake.cpp
new file mode 100644
index 000000000000..ab3a3dfb6c27
--- /dev/null
+++ b/src/Server/DistributedQuery/tests/gtest_exchange_server_handshake.cpp
@@ -0,0 +1,203 @@
+#ifdef OS_LINUX
+
+#include <cstring>
+#include <optional>
+#include <string>
+#include <thread>
+
+#include <gtest/gtest.h>
+
+#include <Poco/Net/ServerSocket.h>
+#include <Poco/Net/SocketAddress.h>
+#include <Poco/Net/StreamSocket.h>
+
+#include <Common/Exception.h>
+#include <Common/Logger.h>
+#include <Common/logger_useful.h>
+#include <Server/DistributedQuery/ExchangeConnections.h>
+#include <Server/DistributedQuery/ExchangeServer.h>
+#include <Server/DistributedQuery/StreamingExchangeProtocol.h>
+#include <base/types.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int PROTOCOL_VERSION_MISMATCH;
+}
+}
+
+using namespace DB;
+
+namespace
+{
+    /// Fill `buffer` with exactly `size` bytes from a blocking socket; a single receive may return fewer bytes than asked for.
+    void receiveExactly(Poco::Net::StreamSocket & socket, void * buffer, size_t size)
+    {
+        char * const out_bytes = static_cast<char *>(buffer);
+        for (size_t filled = 0; filled < size;)
+        {
+            const size_t remaining = size - filled;
+            ssize_t received = socket.receiveBytes(out_bytes + filled, static_cast<int>(remaining));
+            ASSERT_GT(received, 0) << "Peer closed before delivering " << size << " bytes (got " << filled << ")";
+            filled += received;
+        }
+    }
+}
+
+/// A peer that announces a wrong protocol version followed by a body whose
+/// remaining bytes would not parse as v1's `query_id`/`stream_name` strings must:
+///   1. Receive a `SinkHello` carrying this node's version (so the peer can
+///      produce a precise diagnostic on its side).
+///   2. Cause the sink to throw `PROTOCOL_VERSION_MISMATCH` — NOT a parse
+///      error from `readStringBinary` on the garbage tail.
+///   3. Not register the connection.
+TEST(ExchangeServerHandshake, MismatchedVersionRejectedBeforeParsingBody)
+{
+    using namespace StreamingExchangeProtocol;
+
+    auto connections = std::make_shared<ExchangeConnections>();
+    auto log = getLogger("ExchangeServerHandshakeTest");
+
+    Poco::Net::ServerSocket listener(Poco::Net::SocketAddress("127.0.0.1", 0));
+    Poco::Net::SocketAddress addr = listener.address();     /// The listener's address is read back for the client to connect to.
+    Poco::Net::StreamSocket client(addr);
+    Poco::Net::StreamSocket server_side = listener.acceptConnection();
+    listener.close();   /// Only the one accepted connection is used below.
+
+    /// Run the server-side handshake on a thread; the main thread drives the wire.
+    std::optional<int> caught_code;
+    std::string caught_message;
+    std::thread server_thread([&]
+    {
+        try
+        {
+            ExchangeServer::handleConnection(server_side, connections, log);
+        }
+        catch (const Exception & e)     /// NOTE(review): only `Exception` is caught; any other exception type would escape the thread.
+        {
+            caught_code = e.code();
+            caught_message = e.displayText();
+        }
+    });
+
+    /// Build a SourceHello whose body has:
+    ///   - 8 bytes wrong version
+    ///   - trailing bytes that are NOT a valid (varuint length + bytes) string
+    /// All-0xff bytes form a varuint with the continuation bit set indefinitely,
+    /// so `readStringBinary` over the buffer would either consume more than the
+    /// body holds (throwing CANNOT_READ_ALL_DATA) or read a wildly large length.
+    /// Either way, the throw code would NOT be PROTOCOL_VERSION_MISMATCH — that
+    /// is the regression we are guarding against.
+    const UInt64 wrong_version = 0xDEAD'BEEF'CAFE'BABEull;
+    std::string body;
+    body.append(reinterpret_cast<const char *>(&wrong_version), sizeof(wrong_version));
+    body.append(64, '\xff');
+
+    PacketHeader header{
+        .packet_type = PacketType::SourceHello,
+        .bytes_size  = body.size(),
+    };
+    client.sendBytes(&header, sizeof(header));
+    client.sendBytes(body.data(), static_cast<int>(body.size()));
+
+    /// SinkHello must arrive even on mismatch.
+    PacketHeader reply_header{};
+    receiveExactly(client, &reply_header, sizeof(reply_header));
+    EXPECT_EQ(reply_header.packet_type, static_cast<UInt64>(PacketType::SinkHello));
+    EXPECT_EQ(reply_header.bytes_size, sizeof(UInt64));
+
+    UInt64 sink_version = 0;
+    receiveExactly(client, &sink_version, sizeof(sink_version));
+    EXPECT_EQ(sink_version, PROTOCOL_VERSION);
+
+    server_thread.join();   /// After the join, `caught_code` and `caught_message` are safe to read.
+    client.close();
+
+    /// The error must be the version-mismatch one, not a parse error from the
+    /// would-be query_id/stream_name. This is the structural guarantee the test
+    /// is here to enforce.
+    ASSERT_TRUE(caught_code.has_value()) << "handleConnection did not throw";
+    EXPECT_EQ(*caught_code, ErrorCodes::PROTOCOL_VERSION_MISMATCH)
+        << "Expected PROTOCOL_VERSION_MISMATCH, got: " << caught_message;
+}
+
+namespace
+{
+    /// Returns the two ends (connecting end first, accepted end second) of one open loopback TCP connection.
+    std::pair<Poco::Net::StreamSocket, Poco::Net::StreamSocket> makeConnectedPair()
+    {
+        Poco::Net::ServerSocket acceptor(Poco::Net::SocketAddress("127.0.0.1", 0));
+        Poco::Net::StreamSocket connecting_end(acceptor.address());
+        Poco::Net::StreamSocket accepted_end = acceptor.acceptConnection();
+        acceptor.close();
+        return {std::move(connecting_end), std::move(accepted_end)};
+    }
+}
+
+/// A second producer connection for the same stream (a reconnect or duplicate `SourceHello`) must not
+/// drop the first socket: a consumer calling `getConnection` afterwards still gets a ready connection.
+TEST(ExchangeConnectionsRendezvous, DuplicateProducerKeepsFirstSocket)
+{
+    auto connections = std::make_shared<ExchangeConnections>();
+    auto [client_1, server_1] = makeConnectedPair();
+    auto [client_2, server_2] = makeConnectedPair();
+
+    connections->addConnection("query", "stream", server_1);
+    connections->addConnection("query", "stream", server_2);   /// Duplicate registration under the same (query, stream) key.
+
+    auto future = connections->getConnection("query", "stream");
+    EXPECT_TRUE(future->isReady());
+    EXPECT_NO_THROW(future->getSocket());
+}
+
+/// The rendezvous completes when the consumer arrives before the producer.
+TEST(ExchangeConnectionsRendezvous, ConsumerBeforeProducer)
+{
+    auto connections = std::make_shared<ExchangeConnections>();
+    auto [client, server] = makeConnectedPair();
+
+    auto future = connections->getConnection("query", "stream");
+    EXPECT_FALSE(future->isReady());    /// No producer connection has been added yet.
+
+    connections->addConnection("query", "stream", server);
+    EXPECT_TRUE(future->isReady());     /// The future obtained earlier becomes ready once the producer is added.
+    EXPECT_NO_THROW(future->getSocket());
+}
+
+/// A second consumer for the same stream (e.g. from a task started twice) must be rejected with a
+/// cancelled future, and must not disturb the first consumer's rendezvous.
+TEST(ExchangeConnectionsRendezvous, DuplicateConsumerRejected)
+{
+    auto connections = std::make_shared<ExchangeConnections>();
+    auto [client, server] = makeConnectedPair();
+
+    auto first = connections->getConnection("query", "stream");
+    auto second = connections->getConnection("query", "stream");   /// Same (query, stream) key as `first`.
+
+    EXPECT_TRUE(second->isReady());
+    EXPECT_ANY_THROW(second->getSocket());
+    EXPECT_FALSE(first->isReady());     /// The first consumer is still waiting for a producer.
+
+    connections->addConnection("query", "stream", server);
+    EXPECT_TRUE(first->isReady());
+    EXPECT_NO_THROW(first->getSocket());
+}
+
+/// A connection arriving after the owning task released the stream must be rejected, not stored in a
+/// fresh slot (a worker never runs cleanupQuery, so such a slot would leak its eventfd forever).
+TEST(ExchangeConnectionsRendezvous, ConnectionAfterReleaseRejected)
+{
+    auto connections = std::make_shared<ExchangeConnections>();
+    auto [client, server] = makeConnectedPair();
+
+    connections->removePendingStreams("query", {"stream"});    /// Release the stream before any connection for it arrives.
+    connections->addConnection("query", "stream", server);
+
+    /// The late socket was not stored, so a later consumer gets a cancelled future, not the socket.
+    auto future = connections->getConnection("query", "stream");
+    EXPECT_TRUE(future->isReady());
+    EXPECT_ANY_THROW(future->getSocket());
+}
+
+#endif
diff --git a/src/Server/StatelessWorker/StatelessTaskExecutor.cpp b/src/Server/StatelessWorker/StatelessTaskExecutor.cpp
new file mode 100644
index 000000000000..772e13565660
--- /dev/null
+++ b/src/Server/StatelessWorker/StatelessTaskExecutor.cpp
@@ -0,0 +1,209 @@
+#include <Server/StatelessWorker/StatelessTaskExecutor.h>
+#include <QueryPipeline/DistributedPlanExecutor.h>
+#include <Processors/QueryPlan/QueryPlan.h>
+#include <Interpreters/Context.h>
+#include <Interpreters/ProcessList.h>
+#include <Interpreters/ClientInfo.h>
+#include <Parsers/ASTSelectQuery.h>
+#include <Disks/DiskObjectStorage/ObjectStorages/ObjectStorageFactory.h>
+#include <Core/Block.h>
+#include <Common/Exception.h>
+#include <Common/SipHash.h>
+#include <Common/QueryScope.h>
+#include <Common/Stopwatch.h>
+#include <Common/logger_useful.h>
+#include <exception>
+#include <mutex>
+
+namespace CurrentMetrics
+{
+    extern const Metric StatelessWorkerThreads;
+    extern const Metric StatelessWorkerThreadsActive;
+    extern const Metric StatelessWorkerThreadsScheduled;
+}
+
+namespace DB
+{
+
+/// TODO: move this declaration into a shared header; it is only forward-declared here.
+std::pair<ObjectStoragePtr, String> getObjectStorageForTemporaryFiles(const String & unique_temp_file_path, ContextPtr context);
+
+/// Creates the executor together with the thread pool on which `startTask` schedules tasks.
+/// The pool is constructed with the three `StatelessWorkerThreads*` metrics declared above.
+StatelessTaskExecutor::StatelessTaskExecutor()
+    : thread_pool(
+        CurrentMetrics::StatelessWorkerThreads,
+        CurrentMetrics::StatelessWorkerThreadsActive,
+        CurrentMetrics::StatelessWorkerThreadsScheduled,
+        /// NOTE(review): presumably max_threads, max_free_threads and queue_size, in that order — confirm against `ThreadPool`.
+        1000, 100, 3000)
+{
+}
+
+/// Registers the task under `unique_task_id` and schedules it on the worker thread pool.
+///
+/// The task runs in a query context copied from the global context, marked as a secondary query of
+/// `task_description.initial_query_id`, with `task_description.settings_changes` applied on top.
+/// Its outcome is published through a promise: an empty string means success, anything else is the
+/// error text that `getStatus` reports as a failure.
+///
+/// Returns `Result::Ok` when the task was scheduled and also when a task with the same id is
+/// already registered (the duplicate start is ignored). Rethrows the scheduling exception if the
+/// thread pool refuses the task; in that case the task is unregistered again.
+StatelessTaskExecutor::Result StatelessTaskExecutor::startTask(const String & unique_task_id, const DistributedQueryTaskDescription & task_description, const String & unique_temp_file_path)
+{
+    /// `unique_task_id` is unique per task, so a repeated start (e.g. a coordinator retry) is the same
+    /// task. Running it twice would double-write exchanges and temp files and orphan the original from
+    /// cancel/forget, so treat a duplicate start as a no-op.
+    {
+        std::lock_guard lock(tasks_mutex);
+        if (tasks.contains(unique_task_id))
+        {
+            LOG_WARNING(getLogger("StatelessTaskExecutor"), "Ignoring duplicate start for already running task {}", unique_task_id);
+            return Result::Ok;
+        }
+    }
+
+    ContextPtr global_context = Context::getGlobalContextInstance();
+    ContextMutablePtr query_context = Context::createCopy(global_context);
+    query_context->makeQueryContext();
+    {
+        ClientInfo client_info;
+        client_info.current_query_id = unique_task_id;
+        client_info.query_kind = ClientInfo::QueryKind::SECONDARY_QUERY;
+        client_info.initial_query_id = task_description.initial_query_id;
+        query_context->setClientInfo(client_info);
+    }
+
+    /// Apply the initiator's settings so the worker honors query limits and execution-affecting
+    /// settings. Limits (e.g. max_rows_to_read, max_rows_in_join) are enforced per task, so each
+    /// fragment stays under the limit but the whole query may exceed it by up to the bucket count.
+    /// Force make_distributed_plan off: the worker runs an already-split local fragment.
+    query_context->applySettingsChanges(task_description.settings_changes);
+    query_context->setSetting("make_distributed_plan", false);
+
+    auto [object_storage, object_storage_path] = getObjectStorageForTemporaryFiles(unique_temp_file_path, query_context);
+
+    std::shared_ptr<std::promise<String>> task_promise = std::make_shared<std::promise<String>>();
+    auto task_state = std::make_shared<TaskState>();
+    task_state->completion_future = task_promise->get_future();
+
+    {
+        std::lock_guard lock(tasks_mutex);
+        /// If two starts of the same id race, keep the first; overwriting would orphan the running task.
+        if (!tasks.try_emplace(unique_task_id, task_state).second)
+        {
+            LOG_WARNING(getLogger("StatelessTaskExecutor"), "Ignoring duplicate start for already running task {}", unique_task_id);
+            return Result::Ok;
+        }
+    }
+
+    /// Callback for periodic cancellation check
+    auto is_task_cancelled = [cancelled = task_state->cancelled]() -> bool
+    {
+        return *cancelled;
+    };
+
+    /// Callback that accumulates progress; `getStatus` fetches and resets the accumulated value.
+    auto update_progress = [task_progress = task_state->progress](const Progress & progress)
+    {
+        task_progress->incrementPiecewiseAtomically(progress);
+    };
+
+    auto task_function = [task_description, object_storage, object_storage_path, distributed_query_id = unique_temp_file_path, query_context, task_promise, is_task_cancelled, update_progress]() mutable
+    {
+        try
+        {
+            /// QueryScope and process-list insertion can throw (e.g. the worker is at its query
+            /// limit); keep them inside the try so a failure completes the promise with the error
+            /// rather than leaving it unfulfilled (which would make get_status hang or throw).
+            auto query_scope = QueryScope::create(query_context);
+
+            Stopwatch start_watch(CLOCK_MONOTONIC);
+            ASTSelectQuery ast_stub; /// FIXME: this is only used to populate query_kind
+            auto query_plan_hash = sipHash64(task_description.serialized_query_plan);
+            auto process_list_entry = query_context->getProcessList().insert(task_description.task.task_id, query_plan_hash, &ast_stub, query_context, start_watch.getStart(), false);
+            query_context->setProcessListElement(process_list_entry->getQueryStatus());
+
+            doExecuteTask(task_description, object_storage, object_storage_path, distributed_query_id, query_context, is_task_cancelled, update_progress);
+            task_promise->set_value("");
+        }
+        catch (...)
+        {
+            tryLogCurrentException(getLogger("StatelessTaskExecutor"),
+                fmt::format("Task {} failed", task_description.task.task_id));
+
+            /// An empty promise value means success to `getStatus`, so a failure must never publish
+            /// an empty string. `what()` can be empty (e.g. an exception constructed with an empty
+            /// message), so format the error the same way as the scheduling-failure path below and
+            /// fall back to a fixed text if even that yields nothing.
+            String error_message = getCurrentExceptionMessage(/*with_stacktrace*/ false);
+            if (error_message.empty())
+                error_message = "unknown exception";
+            task_promise->set_value(std::move(error_message));
+        }
+    };
+
+    try
+    {
+        thread_pool.scheduleOrThrow(std::move(task_function));
+    }
+    catch (...)
+    {
+        /// The pool refused the task (e.g. saturated or shutting down). Drop the
+        /// half-published entry so `get_status` doesn't report "running" forever
+        /// for a task no thread is executing, and complete the promise with the
+        /// scheduling exception so future waiters fail fast.
+        task_promise->set_value(getCurrentExceptionMessage(/*with_stacktrace*/ false));
+        std::lock_guard lock(tasks_mutex);
+        tasks.erase(unique_task_id);
+        throw;
+    }
+
+    return Result::Ok;
+}
+
+/// Reports the state of task `task_id`, waiting up to `wait_milliseconds` for it to complete.
+///
+/// Returns `UnknownTaskId` if no such task is registered, `TaskRunnig` if it has not completed
+/// within the wait, `TaskFinished` if it completed with an empty error text and `TaskFailed` (with
+/// the error text in `message`) otherwise. Every answer for a known task carries the progress
+/// accumulated since the previous call; fetching it resets the accumulator.
+StatelessTaskExecutor::TaskStatus StatelessTaskExecutor::getStatus(const String & task_id, UInt64 wait_milliseconds)
+{
+    /// Upper bound for a single wait. The value comes from a request parameter; an unbounded
+    /// `UInt64` can turn negative or overflow once converted to a `std::chrono` duration and added
+    /// to the current time. A caller that wants to wait longer simply polls again.
+    static constexpr UInt64 max_wait_milliseconds = 60 * 60 * 1000;
+
+    /// Make a copy of task completion future to wait for it outside of the lock
+    std::shared_future<String> completion_future;
+    std::shared_ptr<Progress> progress;
+    {
+        std::lock_guard lock(tasks_mutex);
+        auto it = tasks.find(task_id);
+        if (it == tasks.end())
+            return TaskStatus{Result::UnknownTaskId, "", {}};
+        completion_future = it->second->completion_future;
+        progress = it->second->progress;
+    }
+
+    /// `startTask` always sets the future before publishing the task, so this is not expected to
+    /// happen; but calling `wait_for`/`get` on a future without shared state is undefined behavior,
+    /// so report it as a failure instead.
+    if (!completion_future.valid())
+        return TaskStatus{Result::TaskFailed, "Task has no completion state", progress->fetchAndResetPiecewiseAtomically()};
+
+    const auto wait_duration = std::chrono::milliseconds(
+        wait_milliseconds < max_wait_milliseconds ? wait_milliseconds : max_wait_milliseconds);
+    const bool still_running = completion_future.wait_for(wait_duration) == std::future_status::timeout;
+
+    Progress progress_delta = progress->fetchAndResetPiecewiseAtomically();
+    if (still_running)
+        return TaskStatus{Result::TaskRunnig, "", std::move(progress_delta)};
+
+    /// The promise value is the error text; empty means the task succeeded.
+    auto error_message = completion_future.get();
+    if (error_message.empty())
+        return TaskStatus{Result::TaskFinished, "", std::move(progress_delta)};
+    else
+        return TaskStatus{Result::TaskFailed, error_message, std::move(progress_delta)};
+}
+
+/// Requests cancellation of `task_id` by raising its shared `cancelled` flag, which the task polls
+/// through its cancellation callback. Returns `UnknownTaskId` when no such task is registered.
+StatelessTaskExecutor::Result StatelessTaskExecutor::cancelTask(const String & task_id)
+{
+    std::lock_guard lock(tasks_mutex);
+
+    const auto task_it = tasks.find(task_id);
+    if (task_it != tasks.end())
+    {
+        task_it->second->cancelled->store(true);
+        return Result::Ok;
+    }
+
+    return Result::UnknownTaskId;
+}
+
+/// Removes `task_id` from the registry. The task itself is not touched: only the bookkeeping entry
+/// is dropped. Returns `UnknownTaskId` when no such task is registered.
+StatelessTaskExecutor::Result StatelessTaskExecutor::forgetTask(const String & task_id)
+{
+    std::lock_guard lock(tasks_mutex);
+
+    const bool was_registered = tasks.erase(task_id) != 0;
+    return was_registered ? Result::Ok : Result::UnknownTaskId;
+}
+
+/// Raises the cancellation flag of every registered task and blocks until each of them has
+/// completed its promise.
+void StatelessTaskExecutor::shutdown()
+{
+    /// Snapshot the registered tasks and cancel them under the lock, but wait outside of it.
+    /// Waiting with `tasks_mutex` held would block every concurrent `getStatus`, `cancelTask`,
+    /// `forgetTask` and `startTask` call until the slowest task has finished. The snapshot shares
+    /// ownership of the task states, so they stay alive even if a task is forgotten meanwhile.
+    std::unordered_map<String, TaskStatePtr> tasks_to_wait;
+    {
+        std::lock_guard lock(tasks_mutex);
+        tasks_to_wait = tasks;
+        for (auto & [task_id, task_state] : tasks_to_wait)
+            *task_state->cancelled = true;
+    }
+
+    for (auto & [task_id, task_state] : tasks_to_wait)
+        task_state->completion_future.wait();
+}
+
+}
diff --git a/src/Server/StatelessWorker/StatelessTaskExecutor.h b/src/Server/StatelessWorker/StatelessTaskExecutor.h
new file mode 100644
index 000000000000..ed375fa2a63a
--- /dev/null
+++ b/src/Server/StatelessWorker/StatelessTaskExecutor.h
@@ -0,0 +1,68 @@
+#pragma once
+#include <QueryPipeline/DistributedPlanExecutor.h>
+#include <IO/Progress.h>
+#include <Common/ThreadPool.h>
+#include <base/types.h>
+#include <base/defines.h>
+
+#include <atomic>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+
+namespace DB
+{
+
+
+/// Runs distributed query tasks on a worker. Tasks are started asynchronously on an internal thread
+/// pool and tracked by id, so that their status can be polled and they can be cancelled or forgotten.
+class StatelessTaskExecutor
+{
+public:
+    StatelessTaskExecutor();
+    virtual ~StatelessTaskExecutor() = default;
+
+    /// Used both as the return code of the control operations and as the task state in `TaskStatus`.
+    enum Result
+    {
+        Ok = 0,
+        UnknownTaskId = 1,
+        TaskRunnig = 2, /// (sic) the misspelling is part of the existing name
+        TaskFinished = 3,
+        TaskCancelled = 4,
+        TaskFailed = 5,
+    };
+
+    /// Answer of `getStatus`.
+    struct TaskStatus
+    {
+        Result result;
+        /// Error text; filled for `TaskFailed`.
+        String message;
+        /// Progress accumulated since the previous `getStatus` call for the same task.
+        Progress progress;
+    };
+
+    /// Registers the task under `unique_task_id` and schedules it on the thread pool.
+    /// A start for an id that is already registered is ignored and still returns `Ok`.
+    Result startTask(const String & unique_task_id, const DistributedQueryTaskDescription & task, const String & unique_temp_file_path);
+    /// Returns the state of the task, waiting up to `wait_milliseconds` for it to complete.
+    TaskStatus getStatus(const String & task_id, UInt64 wait_milliseconds);
+    /// Raises the task's cancellation flag. Returns `UnknownTaskId` if the task is not registered.
+    Result cancelTask(const String & task_id);
+    /// Drops the task's registry entry. Returns `UnknownTaskId` if the task is not registered.
+    Result forgetTask(const String & task_id);
+
+    /// Raises the cancellation flag of all registered tasks and waits for them to complete.
+    void shutdown();
+
+private:
+    /// NOTE(review): no definition of this function is visible in StatelessTaskExecutor.cpp; it looks unused — confirm.
+    static void executeTask();
+
+//    std::atomic<bool> shutdown_called{false};
+
+    /// Per-task bookkeeping. The flag and the progress accumulator are held by shared pointers
+    /// because copies of them are captured by the callbacks handed to the running task.
+    struct TaskState
+    {
+        /// Becomes ready when the task ends; the value is empty on success, otherwise the error text.
+        std::shared_future<String> completion_future;
+        std::shared_ptr<std::atomic<bool>> cancelled = std::make_shared<std::atomic<bool>>(false);
+        std::shared_ptr<Progress> progress = std::make_shared<Progress>();
+    };
+
+    using TaskStatePtr = std::shared_ptr<TaskState>;
+
+    ThreadPool thread_pool;
+
+    /// Registered tasks by task id.
+    std::unordered_map<String, TaskStatePtr> tasks TSA_GUARDED_BY(tasks_mutex);
+    std::mutex tasks_mutex;
+};
+
+}
diff --git a/src/Server/StatelessWorker/StatelessWorkerClient.cpp b/src/Server/StatelessWorker/StatelessWorkerClient.cpp
new file mode 100644
index 000000000000..7e17a706faf3
--- /dev/null
+++ b/src/Server/StatelessWorker/StatelessWorkerClient.cpp
@@ -0,0 +1,211 @@
+#include <Server/StatelessWorker/StatelessWorkerClient.h>
+#include <Interpreters/Context.h>
+#include <Interpreters/InterserverCredentials.h>
+#include <Poco/Net/HTTPBasicCredentials.h>
+#include <QueryPipeline/DistributedPlanExecutor.h>
+#include <Processors/QueryPlan/QueryPlan.h>
+#include <IO/WriteBufferFromOStream.h>
+#include <IO/ReadWriteBufferFromHTTP.h>
+#include <Core/ProtocolDefines.h>
+#include <base/types.h>
+
+namespace DB
+{
+
+namespace
+{
+
+/// Sends a `start` request for `task_id` to the worker at `endpoint_uri` and returns the response body.
+/// The request body is produced by `task_serializer`; `unique_temp_file_path` is passed as `temp_path`.
+/// Interserver credentials from `context` are attached when a user name is configured.
+String doSendTask(const String & endpoint_uri, const String & task_id, std::function<void(WriteBuffer&)> task_serializer, const String & unique_temp_file_path, const ContextPtr & context)
+{
+    Poco::Net::HTTPBasicCredentials http_credentials{};
+    if (const auto interserver = context->getInterserverCredentials(); !interserver->getUser().empty())
+    {
+        http_credentials.setUsername(interserver->getUser());
+        http_credentials.setPassword(interserver->getPassword());
+    }
+
+    /// `Poco::Timespan` is constructed from microseconds: 100 ms to connect, 100 s to send/receive.
+    ConnectionTimeouts request_timeouts;
+    request_timeouts.connection_timeout = Poco::Timespan(100 * 1000);
+    request_timeouts.send_timeout = Poco::Timespan(100 * 1000 * 1000);
+    request_timeouts.receive_timeout = Poco::Timespan(100 * 1000 * 1000);
+
+    ReadSettings http_read_settings;
+    /// Not safe to retry: worker would schedule a duplicate task.
+    http_read_settings.http_settings.max_tries = 1;
+    http_read_settings.http_settings.retry_initial_backoff_ms = 500;
+    http_read_settings.http_settings.retry_max_backoff_ms = 1000;
+
+    Poco::URI request_uri(endpoint_uri);
+    request_uri.addQueryParameter("operation",   "start");
+    request_uri.addQueryParameter("compress",    "false");
+    request_uri.addQueryParameter("task_id",     task_id);
+    request_uri.addQueryParameter("temp_path",   unique_temp_file_path);
+
+    /// Streams the serialized task into the request body.
+    auto send_body = [&task_serializer] (std::ostream & request_stream)
+    {
+        WriteBufferFromOStream body_buffer(request_stream);
+        task_serializer(body_buffer);
+        body_buffer.finalize();
+    };
+
+    auto response = BuilderRWBufferFromHTTP(request_uri)
+        .withConnectionGroup(HTTPConnectionGroupType::HTTP)
+        .withMethod(Poco::Net::HTTPRequest::HTTP_POST)
+        .withTimeouts(request_timeouts)
+        .withSettings(http_read_settings)
+        .withOutCallback(send_body)
+        .withDelayInit(false)
+        .create(http_credentials);
+
+    String response_body;
+    readStringUntilEOF(response_body, *response);
+    return response_body;
+}
+
+}
+
+void serializeTask(const DistributedQueryTaskDescription & task_description, WriteBuffer & out);
+
+
+/// Serializes `task_description` and sends it to the worker at `endpoint_uri` as a `start` request
+/// for `unique_task_id`. Returns the worker's response body.
+String sendTask(const String & endpoint_uri, const String & unique_task_id, const DistributedQueryTaskDescription & task_description, const String & unique_temp_file_path, const ContextPtr & context)
+{
+    /// Capture by reference: the serializer is only invoked while `doSendTask` is running, and
+    /// `task_description` outlives that call. Capturing by value would copy the whole description,
+    /// including the serialized query plan, for no benefit.
+    auto task_serializer = [&task_description] (WriteBuffer & buf)
+    {
+        serializeTask(task_description, buf);
+    };
+
+    return doSendTask(endpoint_uri, unique_task_id, task_serializer, unique_temp_file_path, context);
+}
+
+/// Get task status by its id.
+/// If wait_for_ms is set, the worker waits up to that long for the task to finish before answering.
+/// With `for_cleanup` the request uses short timeouts and a single attempt; otherwise it is retried
+/// up to three times. Returns the status deserialized from the worker's response.
+DistributedQueryTaskStatus getTaskStatus(const String & endpoint_uri, const String & task_id, UInt32 wait_for_ms, const ContextPtr & context, bool for_cleanup)
+{
+    auto credentials = context->getInterserverCredentials();
+    Poco::Net::HTTPBasicCredentials creds{};
+    if (!credentials->getUser().empty())
+    {
+        creds.setUsername(credentials->getUser());
+        creds.setPassword(credentials->getPassword());
+    }
+
+    /// Microseconds needed to cover the worker's server-side wait plus a 2 s network margin.
+    /// Computed in 64 bits: in `UInt32` arithmetic the product wraps around for waits above ~71 min.
+    const Int64 wait_budget_us = (static_cast<Int64>(wait_for_ms) + 2000) * 1000;
+    const Int64 default_receive_timeout_us = 100 * 1000 * 1000;
+
+    ConnectionTimeouts timeouts;
+    timeouts.connection_timeout = Poco::Timespan(100 * 1000);
+    ReadSettings read_settings;
+    if (for_cleanup)
+    {
+        /// Bound a best-effort cleanup poll: the receive timeout only needs to cover the worker's
+        /// server-side wait plus a small network margin, and a failed poll must not retry.
+        timeouts.send_timeout = Poco::Timespan(2 * 1000 * 1000);
+        timeouts.receive_timeout = Poco::Timespan(wait_budget_us);
+        read_settings.http_settings.max_tries = 1;
+    }
+    else
+    {
+        timeouts.send_timeout = Poco::Timespan(100 * 1000 * 1000);
+        /// Never shorter than the default, but long enough to cover the requested server-side
+        /// wait; otherwise a long poll of a still running task would always time out.
+        timeouts.receive_timeout = Poco::Timespan(
+            wait_budget_us > default_receive_timeout_us ? wait_budget_us : default_receive_timeout_us);
+        /// Safe to retry: read-only.
+        read_settings.http_settings.max_tries = 3;
+        read_settings.http_settings.retry_initial_backoff_ms = 200;
+        read_settings.http_settings.retry_max_backoff_ms = 1000;
+    }
+
+    Poco::URI uri(endpoint_uri);
+    uri.addQueryParameter("operation",   "get_status");
+    uri.addQueryParameter("compress",    "false");
+    uri.addQueryParameter("task_id",     task_id);
+    uri.addQueryParameter("wait_for_ms", std::to_string(wait_for_ms));
+
+    auto in = BuilderRWBufferFromHTTP(uri)
+        .withConnectionGroup(HTTPConnectionGroupType::HTTP)
+        .withMethod(Poco::Net::HTTPRequest::HTTP_GET)
+        .withTimeouts(timeouts)
+        .withSettings(read_settings)
+        .withDelayInit(false)
+        .create(creds);
+
+    /// No `client_version` parameter is sent, so the worker writes the progress with this same
+    /// default protocol version.
+    DistributedQueryTaskStatus result;
+    result.read(*in, DBMS_MIN_PROTOCOL_VERSION_WITH_SERVER_QUERY_TIME_IN_PROGRESS);
+    in->eof();
+
+    return result;
+}
+
+/// Asks the worker at `endpoint_uri` to cancel task `task_id` (POST with `operation=cancel`).
+/// The response body is read to the end and discarded.
+void cancelTask(const String & endpoint_uri, const String & task_id, const ContextPtr & context)
+{
+    Poco::Net::HTTPBasicCredentials http_credentials{};
+    if (const auto interserver = context->getInterserverCredentials(); !interserver->getUser().empty())
+    {
+        http_credentials.setUsername(interserver->getUser());
+        http_credentials.setPassword(interserver->getPassword());
+    }
+
+    /// Short timeouts for cancel — it's best-effort. Workers will
+    /// eventually clean up orphaned tasks on their own.
+    ConnectionTimeouts request_timeouts;
+    request_timeouts.connection_timeout = Poco::Timespan(100 * 1000);
+    request_timeouts.send_timeout = Poco::Timespan(5 * 1000 * 1000);
+    request_timeouts.receive_timeout = Poco::Timespan(5 * 1000 * 1000);
+
+    /// Safe to retry: idempotent.
+    ReadSettings http_read_settings;
+    http_read_settings.http_settings.max_tries = 3;
+    http_read_settings.http_settings.retry_initial_backoff_ms = 200;
+    http_read_settings.http_settings.retry_max_backoff_ms = 1000;
+
+    Poco::URI request_uri(endpoint_uri);
+    request_uri.addQueryParameter("operation",   "cancel");
+    request_uri.addQueryParameter("compress",    "false");
+    request_uri.addQueryParameter("task_id",     task_id);
+
+    auto response = BuilderRWBufferFromHTTP(request_uri)
+        .withConnectionGroup(HTTPConnectionGroupType::HTTP)
+        .withMethod(Poco::Net::HTTPRequest::HTTP_POST)
+        .withTimeouts(request_timeouts)
+        .withSettings(http_read_settings)
+        .withDelayInit(false)
+        .create(http_credentials);
+
+    /// Drain the response; its content is not used.
+    String ignored_response;
+    readStringUntilEOF(ignored_response, *response);
+}
+
+/// Asks the worker at `endpoint_uri` to forget task `task_id` (POST with `operation=forget`).
+/// The response body is read to the end and discarded.
+void forgetTask(const String & endpoint_uri, const String & task_id, const ContextPtr & context)
+{
+    Poco::Net::HTTPBasicCredentials http_credentials{};
+    if (const auto interserver = context->getInterserverCredentials(); !interserver->getUser().empty())
+    {
+        http_credentials.setUsername(interserver->getUser());
+        http_credentials.setPassword(interserver->getPassword());
+    }
+
+    ConnectionTimeouts request_timeouts;
+    request_timeouts.connection_timeout = Poco::Timespan(100 * 1000);
+    request_timeouts.send_timeout = Poco::Timespan(100 * 1000 * 1000);
+    request_timeouts.receive_timeout = Poco::Timespan(100 * 1000 * 1000);
+
+    /// Safe to retry: idempotent.
+    ReadSettings http_read_settings;
+    http_read_settings.http_settings.max_tries = 3;
+    http_read_settings.http_settings.retry_initial_backoff_ms = 200;
+    http_read_settings.http_settings.retry_max_backoff_ms = 1000;
+
+    Poco::URI request_uri(endpoint_uri);
+    request_uri.addQueryParameter("operation",   "forget");
+    request_uri.addQueryParameter("compress",    "false");
+    request_uri.addQueryParameter("task_id",     task_id);
+
+    auto response = BuilderRWBufferFromHTTP(request_uri)
+        .withConnectionGroup(HTTPConnectionGroupType::HTTP)
+        .withMethod(Poco::Net::HTTPRequest::HTTP_POST)
+        .withTimeouts(request_timeouts)
+        .withSettings(http_read_settings)
+        .withDelayInit(false)
+        .create(http_credentials);
+
+    /// Drain the response; its content is not used.
+    String ignored_response;
+    readStringUntilEOF(ignored_response, *response);
+}
+
+}
diff --git a/src/Server/StatelessWorker/StatelessWorkerClient.h b/src/Server/StatelessWorker/StatelessWorkerClient.h
new file mode 100644
index 000000000000..dee2ffbe6d6a
--- /dev/null
+++ b/src/Server/StatelessWorker/StatelessWorkerClient.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <Interpreters/Context_fwd.h>
+#include <Server/StatelessWorker/StatelessWorkerProtocol.h>
+#include <base/types.h>
+
+namespace DB
+{
+
+struct DistributedQueryTaskDescription;
+
+/// Sends a serialized task to the worker's `start` operation and returns the response body.
+/// The request is attempted once only, because a repeated start could schedule a duplicate task.
+String sendTask(const String & endpoint_uri, const String & unique_task_id, const DistributedQueryTaskDescription & task_description, const String & unique_temp_file_path, const ContextPtr & context);
+
+/// Queries the worker's `get_status` operation; `wait_for_ms` is forwarded to the worker.
+/// `for_cleanup` uses short HTTP timeouts and no retries so best-effort cleanup cannot block for the
+/// normal multi-minute request budget when a worker is slow or unreachable.
+DistributedQueryTaskStatus getTaskStatus(const String & endpoint_uri, const String & task_id, UInt32 wait_for_ms, const ContextPtr & context, bool for_cleanup = false);
+
+/// Sends the `cancel` operation for `task_id`; the response body is discarded.
+void cancelTask(const String & endpoint_uri, const String & task_id, const ContextPtr & context);
+
+/// Sends the `forget` operation for `task_id`; the response body is discarded.
+void forgetTask(const String & endpoint_uri, const String & task_id, const ContextPtr & context);
+
+}
diff --git a/src/Server/StatelessWorker/StatelessWorkerEndpoint.cpp b/src/Server/StatelessWorker/StatelessWorkerEndpoint.cpp
new file mode 100644
index 000000000000..64f9c87a4cbb
--- /dev/null
+++ b/src/Server/StatelessWorker/StatelessWorkerEndpoint.cpp
@@ -0,0 +1,335 @@
+#include <Server/StatelessWorker/StatelessWorkerEndpoint.h>
+#include <Server/StatelessWorker/StatelessTaskExecutor.h>
+#include <Server/StatelessWorker/StatelessWorkerProtocol.h>
+#include <Server/HTTP/HTMLForm.h>
+#include <Server/HTTP/HTTPServerResponse.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ReadHelpers.h>
+#include <Poco/Net/HTTPResponse.h>
+#include <Common/logger_useful.h>
+#include <Processors/QueryPlan/QueryPlan.h>
+#include <QueryPipeline/DistributedPlanExecutor.h>
+#include <Core/ProtocolDefines.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int NOT_IMPLEMENTED;
+}
+
+/// Creates the endpoint with the id prefix "stateless_worker/", its own logger and a fresh
+/// `StatelessTaskExecutor` that runs the tasks received through `processQuery`.
+StatelessWorkerEndpoint::StatelessWorkerEndpoint()
+    : endpoint_name("stateless_worker/")
+    , log(Poco::Logger::getShared("StatelessWorkerEndpoint"))
+    , task_runner(std::make_shared<StatelessTaskExecutor>())
+{
+}
+
+/// Shuts the task executor down before the endpoint is destroyed.
+StatelessWorkerEndpoint::~StatelessWorkerEndpoint()
+{
+    shutdown();
+}
+
+/// Returns the endpoint id for `path`: the endpoint name prefix followed by `path`.
+std::string StatelessWorkerEndpoint::getId(const std::string & path) const
+{
+    std::string endpoint_id = endpoint_name;
+    endpoint_id += path;
+    return endpoint_id;
+}
+
+void serializeTask(const DistributedQueryTaskDescription & task_description, WriteBuffer & out);
+
+/// Writes `task_description` to `out` in the binary format read back by `deserializeTask`.
+/// Layout: serialization version, initial query id, task id, serialized query plan, task
+/// parameters, input and output exchange streams, exchange descriptions, stream-to-host mapping
+/// and finally the settings changes. Every list is prefixed with its element count.
+void serializeTask(const DistributedQueryTaskDescription & task_description, WriteBuffer & out)
+{
+    /// Both stream lists share the same element layout.
+    const auto write_stream_ids = [&out](const auto & stream_ids)
+    {
+        writeVarUInt(stream_ids.size(), out);
+        for (const auto & id : stream_ids)
+        {
+            writeStringBinary(id.exchange_id, out);
+            writeStringBinary(id.source_bucket, out);
+            writeStringBinary(id.destination_bucket, out);
+        }
+    };
+
+    writeVarUInt(DBMS_DISTRIBUTED_TASK_SERIALIZATION_VERSION, out);
+
+    writeStringBinary(task_description.initial_query_id, out);
+
+    const auto & query_task = task_description.task;
+
+    writeStringBinary(query_task.task_id, out);
+    writeStringBinary(task_description.serialized_query_plan, out);
+
+    const auto & task_parameters = query_task.parameters.parameters;
+    writeVarUInt(task_parameters.size(), out);
+    for (const auto & [parameter_name, parameter_value] : task_parameters)
+    {
+        writeStringBinary(parameter_name, out);
+        writeFieldBinary(parameter_value, out);
+    }
+
+    write_stream_ids(query_task.input_exchange_streams);
+    write_stream_ids(query_task.output_exchange_streams);
+
+    writeVarUInt(task_description.exchanges.size(), out);
+    for (const auto & [exchange_key, exchange_description] : task_description.exchanges)
+    {
+        /// The map key and the name stored in the description are expected to agree.
+        chassert(exchange_key == exchange_description.name);
+        writeStringBinary(exchange_description.name, out);
+        writeVarUInt(static_cast<size_t>(exchange_description.kind), out);
+        writeVarUInt(exchange_description.source_bucket_count, out);
+        writeVarUInt(exchange_description.destination_bucket_count, out);
+    }
+
+    const auto & hosts_by_stream = task_description.exchange_stream_sources.stream_hosts;
+    writeVarUInt(hosts_by_stream.size(), out);
+    for (const auto & [stream_name, host_name] : hosts_by_stream)
+    {
+        writeStringBinary(stream_name, out);
+        writeStringBinary(host_name, out);
+    }
+
+    writeVarUInt(task_description.settings_changes.size(), out);
+    for (const auto & setting_change : task_description.settings_changes)
+    {
+        writeStringBinary(setting_change.name, out);
+        writeFieldBinary(setting_change.value, out);
+    }
+}
+
+namespace
+{
+
+/// Reads a task description written by `serializeTask` from `in` into `task_description`.
+/// Throws NOT_IMPLEMENTED if the data was written with a newer serialization version than this
+/// server supports. Settings changes are only read for version 1 and later.
+void deserializeTask(DistributedQueryTaskDescription & task_description, ReadBuffer & in)
+{
+    UInt64 version = 0;
+    readVarUInt(version, in);
+    if (version > DBMS_DISTRIBUTED_TASK_SERIALIZATION_VERSION)
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED,
+            "Distributed task serialization version {} is not supported. The last supported version is {}",
+            version, DBMS_DISTRIBUTED_TASK_SERIALIZATION_VERSION);
+
+    /// Reads a count-prefixed list of exchange stream ids. Elements are appended one by one
+    /// instead of resizing to the announced count up front, so memory grows only with the data
+    /// that is actually read and a corrupt count cannot trigger a huge allocation.
+    const auto read_stream_ids = [&in](auto & stream_ids)
+    {
+        size_t stream_count = 0;
+        readVarUInt(stream_count, in);
+        stream_ids.clear();
+        for (size_t i = 0; i < stream_count; ++i)
+        {
+            auto & stream_id = stream_ids.emplace_back();
+            readStringBinary(stream_id.exchange_id, in);
+            readStringBinary(stream_id.source_bucket, in);
+            readStringBinary(stream_id.destination_bucket, in);
+        }
+    };
+
+    readStringBinary(task_description.initial_query_id, in);
+
+    auto & task = task_description.task;
+
+    readStringBinary(task.task_id, in);
+    readStringBinary(task_description.serialized_query_plan, in);
+
+    size_t parameters_size = 0;
+    readVarUInt(parameters_size, in);
+    for (size_t i = 0; i < parameters_size; ++i)
+    {
+        String name;
+        readStringBinary(name, in);
+        Field field = readFieldBinary(in);
+        task.parameters.parameters[name] = field;
+    }
+
+    read_stream_ids(task.input_exchange_streams);
+    read_stream_ids(task.output_exchange_streams);
+
+    size_t exchanges_size = 0;
+    readVarUInt(exchanges_size, in);
+    for (size_t i = 0; i < exchanges_size; ++i)
+    {
+        String name;
+        readStringBinary(name, in);
+        ExchangeDescription exchange;
+        /// Restore the name as well: `serializeTask` relies on the map key and `exchange.name`
+        /// being equal, and without this the deserialized description would carry an empty name.
+        exchange.name = name;
+        UInt64 kind = 0;
+        readVarUInt(kind, in);
+        exchange.kind = static_cast<ExchangeDescription::Kind>(kind);
+        readVarUInt(exchange.source_bucket_count, in);
+        readVarUInt(exchange.destination_bucket_count, in);
+        task_description.exchanges[name] = exchange;
+    }
+
+    size_t exchange_stream_sources_size = 0;
+    readVarUInt(exchange_stream_sources_size, in);
+    for (size_t i = 0; i < exchange_stream_sources_size; ++i)
+    {
+        String stream;
+        readStringBinary(stream, in);
+        String host;
+        readStringBinary(host, in);
+        task_description.exchange_stream_sources.stream_hosts[stream] = host;
+    }
+
+    if (version >= 1)
+    {
+        /// No `reserve` here: the count comes from the wire and has not been validated yet.
+        size_t settings_changes_size = 0;
+        readVarUInt(settings_changes_size, in);
+        for (size_t i = 0; i < settings_changes_size; ++i)
+        {
+            String name;
+            readStringBinary(name, in);
+            Field value = readFieldBinary(in);
+            task_description.settings_changes.emplace_back(name, value);
+        }
+    }
+}
+
+}
+
+/// Dispatches one interserver request to the task executor according to the `operation` parameter:
+///  - `start`: deserializes a task description from the body and starts it under `task_id`;
+///  - `get_status`: writes a serialized `DistributedQueryTaskStatus`, optionally waiting `wait_for_ms`;
+///  - `cancel` / `forget`: cancels or drops the task; an unknown task id is answered with HTTP 200;
+///  - anything else: answers with HTTP 400.
+/// Both `operation` and `task_id` are read unconditionally at the start of every request.
+void StatelessWorkerEndpoint::processQuery(const HTMLForm & params, ReadBufferPtr body, WriteBuffer & out, HTTPServerResponse & response)
+{
+    const auto operation = params.get("operation");
+    const auto task_id = params.get("task_id");
+
+    /// Finishes reading the request body and releases the buffer.
+    const auto release_body = [&body]
+    {
+        body->eof();
+        body.reset();
+    };
+
+    if (operation == "start")
+    {
+        const auto unique_temp_file_path = params.get("temp_path");
+
+        /// Deserialize task fields from the request body
+        DistributedQueryTaskDescription task_description;
+        deserializeTask(task_description, *body);
+        release_body();
+
+        /// Pass it to the runner to start execution
+        task_runner->startTask(task_id, task_description, unique_temp_file_path);
+        return;
+    }
+
+    if (operation == "get_status")
+    {
+        const UInt64 wait_milliseconds = params.has("wait_for_ms") ? parse<UInt64>(params.get("wait_for_ms")) : 0;
+        const UInt64 client_version = params.has("client_version")
+            ? parse<UInt64>(params.get("client_version"))
+            : DBMS_MIN_PROTOCOL_VERSION_WITH_SERVER_QUERY_TIME_IN_PROGRESS;
+
+        release_body();
+
+        auto executor_status = task_runner->getStatus(task_id, wait_milliseconds);
+        DistributedQueryTaskStatus task_status;
+        task_status.progress = std::move(executor_status.progress);
+
+        /// Every known state is a successful answer; only an unexpected result code is an error.
+        auto http_status = Poco::Net::HTTPResponse::HTTP_OK;
+        switch (executor_status.result)
+        {
+            case StatelessTaskExecutor::TaskRunnig:
+                task_status.status = "Running";
+                break;
+            case StatelessTaskExecutor::TaskFinished:
+                task_status.status = "Finished";
+                break;
+            case StatelessTaskExecutor::TaskCancelled:
+                task_status.status = "Cancelled";
+                break;
+            case StatelessTaskExecutor::TaskFailed:
+                task_status.status = "Failed";
+                task_status.error_message = executor_status.message;
+                break;
+            case StatelessTaskExecutor::UnknownTaskId:
+                /// A gone task is a normal status answer, not a transport error: it has already
+                /// finished and been reclaimed, or was never started. Report it as a successful status
+                /// query carrying an "Unknown task" state so the coordinator treats it as terminal.
+                task_status.status = "Unknown task";
+                break;
+            default:
+                http_status = Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR;
+                task_status.status = fmt::format("Unexpected task status: {}", static_cast<int>(executor_status.result));
+                break;
+        }
+
+        response.setStatus(http_status);
+        task_status.write(out, client_version);
+        return;
+    }
+
+    if (operation == "cancel")
+    {
+        release_body();
+
+        const auto cancel_result = task_runner->cancelTask(task_id);
+        if (cancel_result == StatelessTaskExecutor::Ok || cancel_result == StatelessTaskExecutor::TaskCancelled)
+        {
+            response.setStatus(Poco::Net::HTTPResponse::HTTP_OK);
+            writeString("Cancelled\n", out);
+        }
+        else if (cancel_result == StatelessTaskExecutor::UnknownTaskId)
+        {
+            /// For idempotency: a gone task is already not running, so cancellation succeeded.
+            response.setStatus(Poco::Net::HTTPResponse::HTTP_OK);
+            writeString("Unknown task\n", out);
+        }
+        else
+        {
+            response.setStatus(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR);
+            writeString(fmt::format("Unexpected task status: {}\n", static_cast<int>(cancel_result)), out);
+        }
+        return;
+    }
+
+    if (operation == "forget")
+    {
+        release_body();
+
+        const auto forget_result = task_runner->forgetTask(task_id);
+        /// An unknown task counts as success, for idempotency.
+        if (forget_result == StatelessTaskExecutor::Ok || forget_result == StatelessTaskExecutor::UnknownTaskId)
+        {
+            response.setStatus(Poco::Net::HTTPResponse::HTTP_OK);
+        }
+        else
+        {
+            response.setStatus(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR);
+            writeString(fmt::format("Unexpected task status: {}\n", static_cast<int>(forget_result)), out);
+        }
+        return;
+    }
+
+    release_body();
+    LOG_WARNING(log, "Unsupported operation '{}'", operation);
+    response.setStatus(Poco::Net::HTTPResponse::HTTP_BAD_REQUEST);
+    writeString("Unknown operation type\n", out);
+}
+
+/// Forwards to the task executor's `shutdown`. Also called from the destructor.
+void StatelessWorkerEndpoint::shutdown()
+{
+    task_runner->shutdown();
+}
+
+}
diff --git a/src/Server/StatelessWorker/StatelessWorkerEndpoint.h b/src/Server/StatelessWorker/StatelessWorkerEndpoint.h
new file mode 100644
index 000000000000..d1100f0a8d28
--- /dev/null
+++ b/src/Server/StatelessWorker/StatelessWorkerEndpoint.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <Interpreters/InterserverIOHandler.h>
+#include <boost/noncopyable.hpp>
+
+namespace DB
+{
+
+class StatelessTaskExecutor;
+
+/// Interserver endpoint for stateless worker task requests; operations handled in `processQuery` (e.g. `forget`) are delegated to a `StatelessTaskExecutor`.
+class StatelessWorkerEndpoint final : public InterserverIOEndpoint, private boost::noncopyable
+{
+public:
+    StatelessWorkerEndpoint();
+    ~StatelessWorkerEndpoint() override;
+
+    std::string getId(const std::string & path) const override;
+    void processQuery(const HTMLForm & params, ReadBufferPtr body, WriteBuffer & out, HTTPServerResponse & response) override;
+    void shutdown(); /// Forwards to `task_runner->shutdown()`.
+
+private:
+    const std::string endpoint_name;
+    LoggerPtr log;
+    std::shared_ptr<StatelessTaskExecutor> task_runner; /// Executor that `processQuery` and `shutdown` delegate to.
+};
+
+}
diff --git a/src/Server/StatelessWorker/StatelessWorkerProtocol.cpp b/src/Server/StatelessWorker/StatelessWorkerProtocol.cpp
new file mode 100644
index 000000000000..949a3f221759
--- /dev/null
+++ b/src/Server/StatelessWorker/StatelessWorkerProtocol.cpp
@@ -0,0 +1,24 @@
+#include <Server/StatelessWorker/StatelessWorkerProtocol.h>
+#include <IO/WriteBuffer.h>
+#include <IO/ReadBuffer.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ReadHelpers.h>
+
+namespace DB
+{
+
+void DistributedQueryTaskStatus::write(WriteBuffer & out, UInt64 version) const
+{
+    writeStringBinary(status, out); /// Field order (status, error_message, progress) is the serialized layout; keep it in sync with read().
+    writeStringBinary(error_message, out);
+    progress.write(out, version); /// `version` is only forwarded to Progress; the two strings are written the same way for every version.
+}
+
+void DistributedQueryTaskStatus::read(ReadBuffer & in, UInt64 version)
+{
+    readStringBinary(status, in); /// Fields are read in the same order write() emits them: status, error_message, progress.
+    readStringBinary(error_message, in);
+    progress.read(in, version); /// `version` is only forwarded to Progress; the two strings are read the same way for every version.
+}
+
+}
diff --git a/src/Server/StatelessWorker/StatelessWorkerProtocol.h b/src/Server/StatelessWorker/StatelessWorkerProtocol.h
new file mode 100644
index 000000000000..b94025e6d85f
--- /dev/null
+++ b/src/Server/StatelessWorker/StatelessWorkerProtocol.h
@@ -0,0 +1,21 @@
+#pragma once
+#include <IO/Progress.h>
+#include <base/types.h>
+
+namespace DB
+{
+
+class WriteBuffer;
+class ReadBuffer;
+
+struct DistributedQueryTaskStatus
+{
+    String status;
+    String error_message;
+    Progress progress;
+
+    void write(WriteBuffer & out, UInt64 version) const; /// Writes status, error_message, progress in that order; `version` is passed to Progress::write.
+    void read(ReadBuffer & in, UInt64 version); /// Reads the same three fields in the same order; `version` is passed to Progress::read.
+};
+
+}
diff --git a/src/Storages/SelectQueryInfo.cpp b/src/Storages/SelectQueryInfo.cpp
index 84254910ac62..88812aa52b2b 100644
--- a/src/Storages/SelectQueryInfo.cpp
+++ b/src/Storages/SelectQueryInfo.cpp
@@ -55,6 +55,7 @@ void PrewhereInfo::serialize(IQueryPlanStep::Serialization & ctx) const
     prewhere_actions.serialize(ctx.out, ctx.registry);
     writeStringBinary(prewhere_column_name, ctx.out);
     writeBinary(remove_prewhere_column, ctx.out);
+    writeBinary(need_filter, ctx.out);
 }
 
 PrewhereInfo PrewhereInfo::deserialize(IQueryPlanStep::Deserialization & ctx)
@@ -64,7 +65,7 @@ PrewhereInfo PrewhereInfo::deserialize(IQueryPlanStep::Deserialization & ctx)
     prewhere_info.prewhere_actions = ActionsDAG::deserialize(ctx.in, ctx.registry, ctx.context);
     readStringBinary(prewhere_info.prewhere_column_name, ctx.in);
     readBinary(prewhere_info.remove_prewhere_column, ctx.in);
-    prewhere_info.need_filter = true;
+    readBinary(prewhere_info.need_filter, ctx.in);
 
     return prewhere_info;
 }
diff --git a/tests/config/config.d/distributed_query.xml b/tests/config/config.d/distributed_query.xml
new file mode 100644
index 000000000000..f386624641be
--- /dev/null
+++ b/tests/config/config.d/distributed_query.xml
@@ -0,0 +1,39 @@
+<clickhouse>
+
+    <!-- Enable Stateless Worker endpoint -->
+    <stateless_worker_server>
+        <enabled>1</enabled>
+        <endpoint>localhost</endpoint>
+    </stateless_worker_server>
+
+    <stateless_worker_client>
+        <enabled>1</enabled>
+        <!-- Distributed reads require a single-shard worker cluster (replicas hold the same data). -->
+        <cluster>test_cluster_one_shard_two_replicas</cluster>
+    </stateless_worker_client>
+
+    <distributed_query>
+        <streaming_exchange_port>9223</streaming_exchange_port>
+        <streaming_exchange_listen_host>0.0.0.0</streaming_exchange_listen_host>
+        <streaming_exchange_listen_host>::</streaming_exchange_listen_host>
+        <temporary_files_storage>
+            <!-- Using local object storage  -->
+            <type>local</type>
+            <path>./local_object_storage/</path>
+            <endpoint_subpath>tmp_sub_path/</endpoint_subpath>
+
+            <!-- Using S3 object storage -->
+<!--
+                <type>object_storage</type>
+                <object_storage_type>s3</object_storage_type>
+                <path>s3_disk/</path>
+                <endpoint>http://localhost:11111/test/test/</endpoint>
+                <endpoint_subpath>tmp_sub_path/</endpoint_subpath>
+                <access_key_id>clickhouse</access_key_id>
+                <secret_access_key>clickhouse</secret_access_key>
+                <request_timeout_ms>20000</request_timeout_ms>
+ -->
+        </temporary_files_storage>
+    </distributed_query>
+
+</clickhouse>
diff --git a/tests/config/install.sh b/tests/config/install.sh
index f5b5e383ed0c..f08faa656e57 100755
--- a/tests/config/install.sh
+++ b/tests/config/install.sh
@@ -158,6 +158,12 @@ cp $SRC_PATH/config.d/storage_conf_backups.xml $DEST_SERVER_PATH/config.d/
 cp $SRC_PATH/config.d/backups.xml $DEST_SERVER_PATH/config.d/
 cp $SRC_PATH/config.d/filesystem_caches_path.xml $DEST_SERVER_PATH/config.d/
 ln -sf $SRC_PATH/config.d/validate_tcp_client_information.xml $DEST_SERVER_PATH/config.d/
+# distributed_query.xml sets distributed_query.streaming_exchange_port, which the server rejects on
+# non-Linux builds; only install it where the streaming exchange is supported.
+if [ "$(uname -s)" = "Linux" ]; then
+    ln -sf $SRC_PATH/config.d/distributed_query.xml $DEST_SERVER_PATH/config.d/
+fi
+
 ln -sf $SRC_PATH/config.d/zero_copy_destructive_operations.xml $DEST_SERVER_PATH/config.d/
 ln -sf $SRC_PATH/config.d/handlers.yaml $DEST_SERVER_PATH/config.d/
 ln -sf $SRC_PATH/config.d/threadpool_writer_pool_size.yaml $DEST_SERVER_PATH/config.d/
diff --git a/tests/integration/test_distributed_plan_cancel/__init__.py b/tests/integration/test_distributed_plan_cancel/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/integration/test_distributed_plan_cancel/configs/config.d/stateless_worker.xml b/tests/integration/test_distributed_plan_cancel/configs/config.d/stateless_worker.xml
new file mode 100644
index 000000000000..922c3fd2fbec
--- /dev/null
+++ b/tests/integration/test_distributed_plan_cancel/configs/config.d/stateless_worker.xml
@@ -0,0 +1,36 @@
+<clickhouse>
+    <stateless_worker_server>
+        <enabled>1</enabled>
+        <endpoint>localhost</endpoint>
+    </stateless_worker_server>
+
+    <stateless_worker_client>
+        <enabled>1</enabled>
+        <cluster>default</cluster>
+    </stateless_worker_client>
+
+    <distributed_query>
+        <streaming_exchange_port>9223</streaming_exchange_port>
+        <streaming_exchange_listen_host>0.0.0.0</streaming_exchange_listen_host>
+        <temporary_files_storage>
+            <type>local</type>
+            <path>./local_object_storage/</path>
+            <endpoint_subpath>tmp_sub_path/</endpoint_subpath>
+        </temporary_files_storage>
+    </distributed_query>
+
+    <remote_servers>
+        <default>
+            <shard>
+                <replica>
+                    <host>node1</host>
+                    <port>9000</port>
+                </replica>
+                <replica>
+                    <host>node2</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </default>
+    </remote_servers>
+</clickhouse>
diff --git a/tests/integration/test_distributed_plan_cancel/test.py b/tests/integration/test_distributed_plan_cancel/test.py
new file mode 100644
index 000000000000..64d442baacf2
--- /dev/null
+++ b/tests/integration/test_distributed_plan_cancel/test.py
@@ -0,0 +1,87 @@
+"""
+Test that worker task failures in distributed plan queries are propagated
+to the initiator with the correct error message instead of QUERY_WAS_CANCELLED.
+
+Reproduces https://github.com/ClickHouse/clickhouse-private/issues/40546
+(error-propagation part).
+"""
+
+import logging
+import pytest
+
+from helpers.cluster import ClickHouseCluster
+from helpers.client import QueryRuntimeException
+
+cluster = ClickHouseCluster(__file__)
+
+node1 = cluster.add_instance(
+    "node1",
+    main_configs=["configs/config.d/stateless_worker.xml"],
+    stay_alive=True,
+)
+node2 = cluster.add_instance(
+    "node2",
+    main_configs=["configs/config.d/stateless_worker.xml"],
+    stay_alive=True,
+)
+
+
+@pytest.fixture(scope="module")
+def started_cluster():
+    try:
+        cluster.start()
+        yield cluster
+    finally:
+        cluster.shutdown()
+
+
+DISTRIBUTED_SETTINGS = (
+    "make_distributed_plan = 1, "
+    "enable_parallel_replicas = 0, "
+    "distributed_plan_default_shuffle_join_bucket_count = 2, "
+    "distributed_plan_default_reader_bucket_count = 2, "
+    "distributed_plan_max_rows_to_broadcast = 0"
+)
+
+
+def test_worker_error_not_masked(started_cluster):
+    """A worker task error must surface to the initiator with the real error, not QUERY_WAS_CANCELLED.
+
+    The initiator sets max_rows_to_read = 5, which is propagated to the workers; each worker reads
+    more than 5 rows and fails with TOO_MANY_ROWS, and the initiator must report that. This also
+    covers settings propagation: without it the workers would not enforce the limit and the query
+    would silently succeed.
+    """
+    for node in [node1, node2]:
+        node.query(
+            "CREATE TABLE IF NOT EXISTS test_err_mask (id UInt64) "
+            "ENGINE = MergeTree() ORDER BY id"
+        )
+        node.query("INSERT INTO test_err_mask SELECT number FROM numbers(10000)")
+
+    with pytest.raises(QueryRuntimeException) as exc_info:
+        node1.query(
+            f"""
+            SELECT sum(id)
+            FROM test_err_mask
+            SETTINGS {DISTRIBUTED_SETTINGS},
+                     max_rows_to_read = 5
+            """,
+            timeout=30,
+        )
+
+    error_msg = str(exc_info.value)
+    logging.info(f"Query failed with: {error_msg}")
+
+    # The error should be the worker's limit failure, not QUERY_WAS_CANCELLED.
+    assert "QUERY_WAS_CANCELLED" not in error_msg, (
+        f"Initiator masked the worker error with QUERY_WAS_CANCELLED: {error_msg}"
+    )
+    assert (
+        "TOO_MANY_ROWS" in error_msg
+        or "max_rows_to_read" in error_msg
+        or "Limit for rows" in error_msg
+    ), f"Expected the propagated max_rows_to_read limit to be enforced on the worker, got: {error_msg}"
+
+    for node in [node1, node2]:
+        node.query("DROP TABLE IF EXISTS test_err_mask")
diff --git a/tests/integration/test_distributed_plan_replicated_merge_tree/__init__.py b/tests/integration/test_distributed_plan_replicated_merge_tree/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/integration/test_distributed_plan_replicated_merge_tree/configs/config.d/stateless_worker.xml b/tests/integration/test_distributed_plan_replicated_merge_tree/configs/config.d/stateless_worker.xml
new file mode 100644
index 000000000000..5fb07caa730f
--- /dev/null
+++ b/tests/integration/test_distributed_plan_replicated_merge_tree/configs/config.d/stateless_worker.xml
@@ -0,0 +1,46 @@
+<clickhouse>
+    <stateless_worker_server>
+        <enabled>1</enabled>
+        <endpoint>localhost</endpoint>
+    </stateless_worker_server>
+
+    <stateless_worker_client>
+        <enabled>1</enabled>
+        <cluster>default</cluster>
+    </stateless_worker_client>
+
+    <distributed_query>
+        <streaming_exchange_port>9223</streaming_exchange_port>
+        <streaming_exchange_listen_host>0.0.0.0</streaming_exchange_listen_host>
+        <!-- Persisted exchange needs storage that all workers can read; use the
+             MinIO instance provided by `with_minio=True`. The Streaming kind
+             does not touch this section. -->
+        <temporary_files_storage>
+            <type>object_storage</type>
+            <object_storage_type>s3</object_storage_type>
+            <endpoint>http://minio1:9001/root/distributed_query_tmp/</endpoint>
+            <access_key_id>minio</access_key_id>
+            <secret_access_key>ClickHouse_Minio_P@ssw0rd</secret_access_key>
+            <endpoint_subpath>tmp_sub_path/</endpoint_subpath>
+        </temporary_files_storage>
+    </distributed_query>
+
+    <remote_servers>
+        <default>
+            <shard>
+                <replica>
+                    <host>node1</host>
+                    <port>9000</port>
+                </replica>
+                <replica>
+                    <host>node2</host>
+                    <port>9000</port>
+                </replica>
+                <replica>
+                    <host>node3</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </default>
+    </remote_servers>
+</clickhouse>
diff --git a/tests/integration/test_distributed_plan_replicated_merge_tree/test.py b/tests/integration/test_distributed_plan_replicated_merge_tree/test.py
new file mode 100644
index 000000000000..328a361f9044
--- /dev/null
+++ b/tests/integration/test_distributed_plan_replicated_merge_tree/test.py
@@ -0,0 +1,444 @@
+"""
+Distributed query execution against ReplicatedMergeTree tables.
+
+Spins up a 3-node cluster, creates several ReplicatedMergeTree tables (filled
+with multiple parts on each replica), and exercises:
+  - parallel read of an RMT table,
+  - shuffle hash join,
+  - broadcast join,
+  - shuffle aggregation,
+  - distributed sort,
+  - aggregation feeding a join.
+
+Each distributed-plan query is compared against the same query without
+make_distributed_plan to confirm the distributed path produces matching results.
+The EXPLAIN PLAN of the distributed query is also compared against a baked-in
+reference; if optimizer changes reshape a plan the test fails with a diff so
+the reference can be updated deliberately.
+"""
+
+import logging
+import textwrap
+from typing import Optional
+
+import pytest
+
+from helpers.client import QueryRuntimeException
+from helpers.cluster import ClickHouseCluster
+
+pytestmark = pytest.mark.timeout(300)
+
+cluster = ClickHouseCluster(__file__)
+
+node1 = cluster.add_instance(
+    "node1",
+    main_configs=["configs/config.d/stateless_worker.xml"],
+    with_zookeeper=True,
+    with_minio=True,
+    stay_alive=True,
+    macros={"shard": 1, "replica": 1},
+)
+node2 = cluster.add_instance(
+    "node2",
+    main_configs=["configs/config.d/stateless_worker.xml"],
+    with_zookeeper=True,
+    with_minio=True,
+    stay_alive=True,
+    macros={"shard": 1, "replica": 2},
+)
+node3 = cluster.add_instance(
+    "node3",
+    main_configs=["configs/config.d/stateless_worker.xml"],
+    with_zookeeper=True,
+    with_minio=True,
+    stay_alive=True,
+    macros={"shard": 1, "replica": 3},
+)
+
+NODES = [node1, node2, node3]
+INITIATOR = node1
+
+# Settings common to all distributed-plan queries in this test. The 3 buckets
+# match the 3-node cluster size. distributed_plan_max_rows_to_broadcast = 0
+# prevents the optimizer from broadcasting the join's right side by default;
+# individual tests override it when broadcast is the path under test.
+#
+# Several settings are pinned to keep the EXPLAIN snapshot stable against
+# unrelated default-value changes elsewhere in the codebase: anything that
+# can swap join sides, change join-side conversion, or fuse/split exchange
+# steps would otherwise produce a different plan shape from one ClickHouse
+# version to the next without indicating a real regression in this feature.
+DISTRIBUTED_SETTINGS = ", ".join([
+    "make_distributed_plan = 1",
+    "enable_parallel_replicas = 0",
+    "distributed_plan_default_shuffle_join_bucket_count = 3",
+    "distributed_plan_default_reader_bucket_count = 3",
+    "distributed_plan_max_rows_to_broadcast = 0",
+    "distributed_plan_optimize_exchanges = 1",
+    "query_plan_join_swap_table = 'false'",
+    "query_plan_optimize_join_order_limit = 0",
+    "query_plan_use_new_logical_join_step = 1",
+    "query_plan_convert_join_to_in = 0",
+    "query_plan_convert_outer_join_to_inner_join = 0",
+    "query_plan_convert_any_join_to_semi_or_anti_join = 0",
+    # Runtime filters are not yet implemented for distributed queries.
+    "enable_join_runtime_filters = 0",
+])
+
+
+@pytest.fixture(scope="module")
+def started_cluster():
+    try:
+        cluster.start()
+        _create_tables_and_load_data()
+        yield cluster
+    finally:
+        cluster.shutdown()
+
+
+def _create_tables_and_load_data():
+    """Create RMT tables on every node and insert data in multiple batches so
+    each replica ends up with several data parts."""
+    for node in NODES:
+        node.query(
+            """
+            CREATE TABLE big (id UInt64, group_key UInt32, payload String)
+            ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/big', '{replica}')
+            ORDER BY id
+            """
+        )
+        node.query(
+            """
+            CREATE TABLE small (id UInt64, label String)
+            ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/small', '{replica}')
+            ORDER BY id
+            """
+        )
+
+    # Stop background merges so the inserted parts are not collapsed before the
+    # sanity check below — that check expects each replica to see > 1 part.
+    for node in NODES:
+        node.query("SYSTEM STOP MERGES big")
+        node.query("SYSTEM STOP MERGES small")
+
+    # Load data on a single replica; replication propagates it to the others.
+    # Multiple inserts produce multiple data parts so the parallel read path
+    # has more than one part to split per worker bucket.
+    for batch in range(4):
+        offset = batch * 25_000
+        INITIATOR.query(
+            f"""
+            INSERT INTO big
+            SELECT
+                number + {offset} AS id,
+                (number + {offset}) % 100 AS group_key,
+                concat('p_', toString(number + {offset})) AS payload
+            FROM numbers(25000)
+            """
+        )
+
+    INITIATOR.query(
+        """
+        INSERT INTO small
+        SELECT number AS id, concat('lbl_', toString(number)) AS label
+        FROM numbers(500)
+        """
+    )
+
+    # Wait for replication so all replicas see the full data set.
+    for node in NODES:
+        node.query("SYSTEM SYNC REPLICA big")
+        node.query("SYSTEM SYNC REPLICA small")
+
+    # Sanity-check: every replica sees all rows and more than one part on big.
+    for node in NODES:
+        assert int(node.query("SELECT count() FROM big").strip()) == 100_000
+        assert int(node.query("SELECT count() FROM small").strip()) == 500
+        parts_count = int(
+            node.query(
+                "SELECT count() FROM system.parts "
+                "WHERE table = 'big' AND active = 1"
+            ).strip()
+        )
+        assert parts_count >= 2, (
+            f"expected >= 2 active parts on {node.name}, got {parts_count}"
+        )
+
+
+def _explain_and_check(query: str, settings: str, expected_plan: str):
+    """Run EXPLAIN PLAN on the query and compare its output to the expected
+    plan, line-by-line after stripping the common indentation. On mismatch
+    the failure message includes both plans so the test author can update
+    the reference.
+    """
+    actual = INITIATOR.query(f"EXPLAIN PLAN {query} SETTINGS {settings}").strip("\n")
+    expected = textwrap.dedent(expected_plan).strip("\n")
+    if actual != expected:
+        pytest.fail(
+            "distributed plan does not match the expected reference\n"
+            "--- ACTUAL PLAN ---\n" + actual
+            + "\n--- EXPECTED PLAN ---\n" + expected
+        )
+    return actual
+
+
+def _run_both_ways(
+    query: str,
+    settings_override: str = "",
+    expected_plan: Optional[str] = None,
+):
+    """Run the query in distributed and non-distributed modes and return both
+    results for the caller to compare. If expected_plan is provided, also
+    EXPLAIN PLAN the distributed query and assert the dumped plan matches.
+    """
+    settings = DISTRIBUTED_SETTINGS
+    if settings_override:
+        settings = settings + ", " + settings_override
+    if expected_plan is not None:
+        _explain_and_check(query, settings, expected_plan)
+    distributed = INITIATOR.query(f"{query} SETTINGS {settings}")
+    baseline = INITIATOR.query(f"{query} SETTINGS make_distributed_plan = 0")
+    logging.info("distributed result:\n%s", distributed)
+    logging.info("baseline result:\n%s", baseline)
+    return distributed, baseline
+
+
+EXCHANGE_KINDS = pytest.mark.parametrize("exchange_kind", ["Streaming", "Persisted"])
+
+
+def _override(exchange_kind: str, *extra: str) -> str:
+    parts = [f"distributed_plan_force_exchange_kind = '{exchange_kind}'"]
+    parts.extend(extra)
+    return ", ".join(parts)
+
+
+@EXCHANGE_KINDS
+def test_parallel_read(started_cluster, exchange_kind):
+    """A scan of the RMT table goes through the distributed-read path with
+    one task per reader bucket and a GatherExchange feeding the initiator."""
+    distributed, baseline = _run_both_ways(
+        "SELECT count(), sum(id), sum(group_key) FROM big",
+        settings_override=_override(exchange_kind),
+        expected_plan="""\
+            Expression ((Project names + Projection))
+              MergingAggregated (merge)
+                GatherExchange
+                  Aggregating (partial)
+                    Expression ((Before GROUP BY + Change column names to column identifiers))
+                      ReadFromMergeTree (default.big)
+        """,
+    )
+    assert distributed == baseline
+
+
+def test_parallel_read_missing_part_on_worker_errors(started_cluster):
+    """A distributed read buckets over the parts the coordinator selected. If a
+    worker replica is missing one of those parts (replication lag), the read must
+    fail cleanly instead of silently returning a divergent slice of the data.
+
+    Stop fetches on node2/node3 and add a new part only on the coordinator
+    (node1); the read assigns a bucket to a lagging replica, which cannot find
+    the coordinator-selected part and raises NO_SUCH_DATA_PART."""
+    table = "big_lagging"
+    for node in NODES:
+        node.query(f"DROP TABLE IF EXISTS {table} SYNC")
+        node.query(
+            f"""
+            CREATE TABLE {table} (id UInt64, group_key UInt32, payload String)
+            ENGINE = ReplicatedMergeTree('/clickhouse/tables/{{shard}}/{table}', '{{replica}}')
+            ORDER BY id
+            """
+        )
+        node.query(f"SYSTEM STOP MERGES {table}")
+
+    # Initial data, replicated to every node (multiple parts).
+    for batch in range(2):
+        offset = batch * 25_000
+        INITIATOR.query(
+            f"INSERT INTO {table} SELECT number + {offset}, (number + {offset}) % 100, "
+            f"concat('p_', toString(number + {offset})) FROM numbers(25000)"
+        )
+    for node in NODES:
+        node.query(f"SYSTEM SYNC REPLICA {table}")
+
+    node2.query(f"SYSTEM STOP FETCHES {table}")
+    node3.query(f"SYSTEM STOP FETCHES {table}")
+    try:
+        # New part lands only on the coordinator; node2/node3 stay behind.
+        INITIATOR.query(
+            f"INSERT INTO {table} SELECT number + 50000, (number + 50000) % 100, "
+            f"concat('p_', toString(number + 50000)) FROM numbers(25000)"
+        )
+        assert int(INITIATOR.query(f"SELECT count() FROM {table}").strip()) == 75_000
+        assert int(node2.query(f"SELECT count() FROM {table}").strip()) == 50_000
+        assert int(node3.query(f"SELECT count() FROM {table}").strip()) == 50_000
+
+        settings = (
+            DISTRIBUTED_SETTINGS
+            + ", distributed_plan_force_exchange_kind = 'Persisted'"
+            + ", distributed_plan_prefer_replicas_over_workers = 1"
+        )
+        with pytest.raises(QueryRuntimeException) as exc:
+            INITIATOR.query(f"SELECT count(), sum(id) FROM {table} SETTINGS {settings}")
+        assert "is not available on this replica" in str(exc.value)
+    finally:
+        node2.query(f"SYSTEM START FETCHES {table}")
+        node3.query(f"SYSTEM START FETCHES {table}")
+        for node in NODES:
+            node.query(f"SYSTEM SYNC REPLICA {table}")
+            node.query(f"DROP TABLE IF EXISTS {table} SYNC")
+
+
+@EXCHANGE_KINDS
+def test_shuffle_hash_join(started_cluster, exchange_kind):
+    """Self-join on a non-key expression so the optimizer cannot reuse the
+    primary-key sort order and must shuffle both inputs by the join key."""
+    distributed, baseline = _run_both_ways(
+        """
+        SELECT count()
+        FROM big AS a
+        INNER JOIN big AS b ON a.id = b.id + 1
+        WHERE a.group_key < 5
+        """,
+        settings_override=_override(exchange_kind),
+        expected_plan="""\
+            Expression ((Project names + Projection))
+              MergingAggregated (merge)
+                GatherExchange
+                  Aggregating (partial)
+                    Expression ((Before GROUP BY + ))
+                      JoinLogical
+                        ShuffleExchange (by hash([__table1.id]))
+                          Expression ((WHERE + Change column names to column identifiers))
+                            ReadFromMergeTree (default.big)
+                        ShuffleExchange (by hash([plus(__table2.id, 1_UInt8)]))
+                          Expression (Calculate right join keys)
+                            Expression (Change column names to column identifiers)
+                              ReadFromMergeTree (default.big)
+        """,
+    )
+    assert distributed == baseline
+
+
+@EXCHANGE_KINDS
+def test_broadcast_join(started_cluster, exchange_kind):
+    """Join a large left side with a small right side; the small side is
+    broadcast to every worker when the estimator returns its row count."""
+    distributed, baseline = _run_both_ways(
+        """
+        SELECT count(), sum(b.id)
+        FROM big AS b
+        INNER JOIN small AS s ON b.id = s.id
+        """,
+        # Raise the broadcast threshold high enough to include `small`.
+        settings_override=_override(
+            exchange_kind,
+            "distributed_plan_max_rows_to_broadcast = 10000",
+        ),
+        expected_plan="""\
+            Expression ((Project names + Projection))
+              MergingAggregated (merge)
+                GatherExchange
+                  Aggregating (partial)
+                    Expression (Before GROUP BY)
+                      JoinLogical
+                        Expression (Change column names to column identifiers)
+                          ReadFromMergeTree (default.big)
+                        BroadcastExchange
+                          Expression (Change column names to column identifiers)
+                            ReadFromMergeTree (default.small)
+        """,
+    )
+    assert distributed == baseline
+
+
+@EXCHANGE_KINDS
+def test_shuffle_aggregation(started_cluster, exchange_kind):
+    """GROUP BY with a moderate number of groups so the optimizer picks the
+    one-stage shuffle path (scatter by hash → aggregate → gather)."""
+    distributed, baseline = _run_both_ways(
+        """
+        SELECT group_key, count(), sum(id)
+        FROM big
+        GROUP BY group_key
+        ORDER BY group_key
+        """,
+        settings_override=_override(
+            exchange_kind,
+            "distributed_plan_force_shuffle_aggregation = 1",
+        ),
+        expected_plan="""\
+            Expression (Project names)
+              GatherExchange (sorted by (__table1.group_key ASC))
+                Sorting (Sorting for ORDER BY)
+                  Expression ((Before ORDER BY + Projection))
+                    Aggregating
+                      ShuffleExchange (by hash([__table1.group_key]))
+                        Expression ((Before GROUP BY + Change column names to column identifiers))
+                          ReadFromMergeTree (default.big)
+        """,
+    )
+    assert distributed == baseline
+
+
+@EXCHANGE_KINDS
+def test_distributed_sort(started_cluster, exchange_kind):
+    """ORDER BY ... LIMIT exercises the distributed sort path: each worker
+    performs a partial sort, results are gathered and merged on the initiator."""
+    distributed, baseline = _run_both_ways(
+        """
+        SELECT id, group_key
+        FROM big
+        WHERE group_key > 10
+        ORDER BY id DESC, group_key ASC
+        LIMIT 50
+        """,
+        settings_override=_override(exchange_kind),
+        expected_plan="""\
+            Expression (Project names)
+              Limit (preliminary LIMIT)
+                GatherExchange (sorted by (__table1.id DESC, __table1.group_key ASC))
+                  Sorting (Sorting for ORDER BY)
+                    Expression ((Before ORDER BY + Projection))
+                      Expression ((WHERE + Change column names to column identifiers))
+                        ReadFromMergeTree (default.big)
+        """,
+    )
+    assert distributed == baseline
+
+
+@EXCHANGE_KINDS
+def test_join_with_aggregation(started_cluster, exchange_kind):
+    """Combined shuffle: aggregate one side, broadcast the small side, then
+    join. Verifies that exchanges around aggregation and join compose."""
+    distributed, baseline = _run_both_ways(
+        """
+        SELECT g.group_key, g.cnt, s.label
+        FROM (
+            SELECT group_key, count() AS cnt FROM big GROUP BY group_key
+        ) AS g
+        INNER JOIN small AS s ON g.group_key = s.id
+        ORDER BY g.group_key
+        """,
+        settings_override=_override(
+            exchange_kind,
+            "distributed_plan_max_rows_to_broadcast = 10000",
+            "distributed_plan_force_shuffle_aggregation = 1",
+        ),
+        expected_plan="""\
+            Expression (Project names)
+              GatherExchange (sorted by (__table1.group_key ASC))
+                Sorting (Sorting for ORDER BY)
+                  Expression ((Before ORDER BY + Projection))
+                    JoinLogical
+                      Expression ((Change column names to column identifiers + (Project names + Projection)))
+                        Aggregating
+                          ShuffleExchange (by hash([__table2.group_key]))
+                            Expression ((Before GROUP BY + Change column names to column identifiers))
+                              ReadFromMergeTree (default.big)
+                      BroadcastExchange
+                        Expression (Change column names to column identifiers)
+                          ReadFromMergeTree (default.small)
+        """,
+    )
+    assert distributed == baseline
diff --git a/tests/queries/0_stateless/03394_distributed_broadcast_join.reference b/tests/queries/0_stateless/03394_distributed_broadcast_join.reference
new file mode 100644
index 000000000000..c6c027e85739
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_broadcast_join.reference
@@ -0,0 +1,57 @@
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            Expression (Change column names to column identifiers)
+              ReadFromMergeTree (default.big)
+            BroadcastExchange
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.small)
+20000
+------------
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        ScatterExchange (any)
+          Expression (Before GROUP BY)
+            GatherExchange
+              JoinLogical
+                ScatterExchange (any scatter)
+                  Expression (Change column names to column identifiers)
+                    GatherExchange
+                      ReadFromMergeTree (default.big)
+                BroadcastExchange
+                  Expression (Change column names to column identifiers)
+                    ReadFromMergeTree (default.small)
+------------
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            ScatterExchange (by hash([__table1.sid]))
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.small)
+            ShuffleExchange (by hash([modulo(plus(__table2.bid, 1_UInt8), 5000_UInt16)]))
+              Expression (Calculate right join keys)
+                Expression (Change column names to column identifiers)
+                  ReadFromMergeTree (default.big)
+20000
+------------
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            ShuffleExchange (any scatter)
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.big)
+            BroadcastExchange
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.small)
+20000
diff --git a/tests/queries/0_stateless/03394_distributed_broadcast_join.sql b/tests/queries/0_stateless/03394_distributed_broadcast_join.sql
new file mode 100644
index 000000000000..80e4cc56f05c
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_broadcast_join.sql
@@ -0,0 +1,57 @@
+-- Tags: no-fasttest, no-old-analyzer
+-- no-fasttest: requires object storage
+
+CREATE TABLE small(sid UInt64, s Array(Int64)) ENGINE = MergeTree ORDER BY sid;
+CREATE TABLE big(bid UInt64, b Array(Int64)) ENGINE = MergeTree ORDER BY bid;
+
+insert into small select number, [number] from numbers(0, 1000);
+insert into big select number, [number] from numbers(0, 100000);
+
+SET query_plan_join_swap_table = 0;
+-- Distributed aggregation cannot enforce a global max_rows_to_group_by, so pin it to 0.
+SET max_rows_to_group_by = 0;
+
+SET
+    make_distributed_plan=1,
+    enable_parallel_replicas=0,
+    enable_join_runtime_filters=0,
+    use_statistics=1,
+    distributed_plan_optimize_exchanges=1;
+
+EXPLAIN SELECT count()
+FROM big, small
+WHERE (small.sid = (big.bid + 1) % 5000);
+
+SELECT count()
+FROM big, small
+WHERE (small.sid = (big.bid + 1) % 5000);
+
+SELECT '------------';
+
+EXPLAIN SELECT count()
+FROM big, small
+WHERE (small.sid = (big.bid + 1) % 5000)
+SETTINGS distributed_plan_optimize_exchanges=0; -- per-query override of the session value (1) set above; only this EXPLAIN runs with it
+
+SELECT '------------';
+
+EXPLAIN SELECT count()
+FROM small, big
+WHERE (small.sid = (big.bid + 1) % 5000);
+
+SELECT count()
+FROM small, big
+WHERE (small.sid = (big.bid + 1) % 5000);
+
+SELECT '------------';
+
+-- Check the case where the big table's reader bucket count (2) does not match the join bucket count (5), see SETTINGS below
+EXPLAIN SELECT count()
+FROM big, small
+WHERE (small.sid = (big.bid + 1) % 5000)
+SETTINGS distributed_plan_default_shuffle_join_bucket_count=5, distributed_plan_default_reader_bucket_count=2;
+
+SELECT count()
+FROM big, small
+WHERE (small.sid = (big.bid + 1) % 5000)
+SETTINGS distributed_plan_default_shuffle_join_bucket_count=5, distributed_plan_default_reader_bucket_count=2;
diff --git a/tests/queries/0_stateless/03394_distributed_shuffle_join.reference b/tests/queries/0_stateless/03394_distributed_shuffle_join.reference
new file mode 100644
index 000000000000..d189ba3e0870
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_shuffle_join.reference
@@ -0,0 +1,3 @@
+809128
+809128
+809128
diff --git a/tests/queries/0_stateless/03394_distributed_shuffle_join.sql b/tests/queries/0_stateless/03394_distributed_shuffle_join.sql
new file mode 100644
index 000000000000..66f295c4b4cd
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_shuffle_join.sql
@@ -0,0 +1,27 @@
+-- Tags: long, no-fasttest, no-old-analyzer
+-- no-fasttest: requires object storage
+
+DROP TABLE IF EXISTS test_3;
+CREATE TABLE test_3(id UInt64, a Array(Int64)) ENGINE = MergeTree ORDER BY id;
+
+insert into test_3 select number, [number] from numbers(0, 100000);
+
+-- Distributed aggregation cannot enforce a global max_rows_to_group_by, so pin it to 0.
+SET max_rows_to_group_by = 0;
+
+SELECT count() -- first run: no per-query SETTINGS; the reference expects all three runs to print the same count
+FROM test_3 AS a, test_3 AS b, test_3 AS c, test_3 AS d
+WHERE (a.id = (b.id + 1)) AND (b.id = (c.id + 100)) AND ((c.id % 11111) = ((d.id % 12345) + 17));
+
+
+SELECT count() -- same query with a distributed plan: 'Persisted' exchanges, 5 shuffle join buckets
+FROM test_3 AS a, test_3 AS b, test_3 AS c, test_3 AS d
+WHERE (a.id = (b.id + 1)) AND (b.id = (c.id + 100)) AND ((c.id % 11111) = ((d.id % 12345) + 17))
+SETTINGS make_distributed_plan = 1, enable_parallel_replicas = 0, distributed_plan_default_shuffle_join_bucket_count = 5,
+    query_plan_use_new_logical_join_step=1, distributed_plan_force_exchange_kind='Persisted';
+
+SELECT count() -- same query with a distributed plan: 'Streaming' exchanges, 3 shuffle join buckets
+FROM test_3 AS a, test_3 AS b, test_3 AS c, test_3 AS d
+WHERE (a.id = (b.id + 1)) AND (b.id = (c.id + 100)) AND ((c.id % 11111) = ((d.id % 12345) + 17))
+SETTINGS make_distributed_plan = 1, enable_parallel_replicas = 0, distributed_plan_default_shuffle_join_bucket_count = 3,
+    query_plan_use_new_logical_join_step=1, distributed_plan_force_exchange_kind='Streaming';
diff --git a/tests/queries/0_stateless/03394_distributed_shuffle_join_early_close_sink.reference b/tests/queries/0_stateless/03394_distributed_shuffle_join_early_close_sink.reference
new file mode 100644
index 000000000000..f40d4e97d177
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_shuffle_join_early_close_sink.reference
@@ -0,0 +1 @@
+50000000
diff --git a/tests/queries/0_stateless/03394_distributed_shuffle_join_early_close_sink.sql b/tests/queries/0_stateless/03394_distributed_shuffle_join_early_close_sink.sql
new file mode 100644
index 000000000000..11e262385a74
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_shuffle_join_early_close_sink.sql
@@ -0,0 +1,17 @@
+-- Tags: long, no-fasttest, no-old-analyzer
+
+SET query_plan_join_swap_table = 0;
+-- Distributed aggregation cannot enforce a global max_rows_to_group_by, so pin it to 0.
+SET max_rows_to_group_by = 0;
+
+CREATE TABLE test(id UInt64, data String) ENGINE=MergeTree() ORDER BY id SETTINGS index_granularity=10000;
+
+INSERT INTO test SELECT number, '' FROM numbers(10000000);
+
+-- Joining on a.id%1 (which is always 0) so that only one bucket is not empty.
+-- After building the hash table for an empty bucket, the pipeline stops reading from the probing input and closes the Exchange stream before reading all the data from it.
+SELECT count()
+FROM test AS b JOIN test AS a ON a.id%1 = b.id%2
+WHERE a.id < 10 AND b.id < 10000000 AND NOT sleepEachRow(0.000001)
+SETTINGS make_distributed_plan=1, enable_parallel_replicas=0, distributed_plan_default_shuffle_join_bucket_count=4, distributed_plan_default_reader_bucket_count=5, distributed_plan_max_rows_to_broadcast=0;
+
diff --git a/tests/queries/0_stateless/03394_distributed_shuffle_join_with_aggregation.reference b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_aggregation.reference
new file mode 100644
index 000000000000..5ccb82979bb8
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_aggregation.reference
@@ -0,0 +1,54 @@
+----------
+Expression (Project names)
+  GatherExchange (sorted by (__table1.path ASC, __table1.hits ASC, __table3.path ASC, __table3.hits ASC))
+    Sorting (Sorting for ORDER BY)
+      ScatterExchange (any scatter)
+        Expression ((Before ORDER BY + Projection))
+          GatherExchange
+            JoinLogical
+              ScatterExchange (by hash([__table1.path]))
+                Expression
+                  GatherExchange
+                    Aggregating
+                      ScatterExchange (by hash([__table2.path]))
+                        Expression
+                          GatherExchange
+                            ReadFromMergeTree (default.test)
+              ScatterExchange (by hash([__table3.path]))
+                BuildRuntimeFilter (Build runtime join filter on __table3.path)
+                  Expression ((Change column names to column identifiers + (Project names + Projection)))
+                    GatherExchange
+                      Aggregating
+                        ScatterExchange (by hash([__table4.path]))
+                          Expression (Before GROUP BY)
+                            Expression ((WHERE + Change column names to column identifiers))
+                              GatherExchange
+                                ReadFromMergeTree (default.test)
+----------
+Expression (Project names)
+  GatherExchange (sorted by (__table1.path ASC, __table1.hits ASC, __table3.path ASC, __table3.hits ASC))
+    Sorting (Sorting for ORDER BY)
+      Expression ((Before ORDER BY + Projection))
+        JoinLogical
+          ShuffleExchange (by hash([__table1.path]))
+            Expression
+              Aggregating
+                ShuffleExchange (by hash([__table2.path]))
+                  Expression
+                    ReadFromMergeTree (default.test)
+          ShuffleExchange (by hash([__table3.path]))
+            BuildRuntimeFilter (Build runtime join filter on __table3.path)
+              Expression ((Change column names to column identifiers + (Project names + Projection)))
+                Aggregating
+                  ShuffleExchange (by hash([__table4.path]))
+                    Expression (Before GROUP BY)
+                      Expression ((WHERE + Change column names to column identifiers))
+                        ReadFromMergeTree (default.test)
+----------
+path_0	0	path_0	12
+path_1	2	path_1	8
+path_2	4	path_2	6
+----------
+path_0	16	0	24
+path_1	12	12	16
+path_2	12	24	12
diff --git a/tests/queries/0_stateless/03394_distributed_shuffle_join_with_aggregation.sql b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_aggregation.sql
new file mode 100644
index 000000000000..cdf868a409fc
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_aggregation.sql
@@ -0,0 +1,68 @@
+-- Tags: no-old-analyzer
+
+-- Reset the global max_rows_to_group_by; distributed aggregation rejects a nonzero limit.
+SET max_rows_to_group_by = 0;
+SET distributed_plan_optimize_exchanges = 1;
+
+DROP TABLE IF EXISTS test;
+CREATE TABLE test(path String, lang String, hits UInt64) ENGINE MergeTree() ORDER BY tuple();
+
+INSERT INTO test SELECT 'path_' || number::String, 'en', number FROM numbers(5);
+INSERT INTO test SELECT 'path_' || (number%3)::String, 'de', number%4 FROM numbers(10);
+
+INSERT INTO test SELECT 'path_' || number::String, 'en', number FROM numbers(5);
+INSERT INTO test SELECT 'path_' || (number%3)::String, 'de', number%4 FROM numbers(10);
+
+SET query_plan_join_swap_table = 0;
+
+SET
+    optimize_move_to_prewhere = 1,
+    query_plan_optimize_prewhere = 1,
+    make_distributed_plan = 1,
+    enable_parallel_replicas = 0,
+    enable_join_runtime_filters=1,
+    distributed_plan_default_shuffle_join_bucket_count=3,
+    distributed_plan_default_reader_bucket_count=3,
+    distributed_plan_force_exchange_kind='Streaming',
+    distributed_plan_max_rows_to_broadcast=0;
+
+SELECT '----------';
+
+EXPLAIN SELECT * -- plan with distributed_plan_optimize_exchanges=0 (per-query SETTINGS at the end of this statement)
+FROM
+   (SELECT path, sum(hits) as hits FROM test WHERE lang = 'en' GROUP BY path) AS en,
+   (SELECT path, sum(hits) as hits FROM test WHERE lang = 'de' GROUP BY path) AS de
+WHERE (en.path = de.path)
+ORDER BY ALL
+SETTINGS distributed_plan_optimize_exchanges=0;
+
+
+SELECT '----------';
+
+EXPLAIN SELECT * -- same query under the session-level distributed_plan_optimize_exchanges=1 set at the top
+FROM
+   (SELECT path, sum(hits) as hits FROM test WHERE lang = 'en' GROUP BY path) AS en,
+   (SELECT path, sum(hits) as hits FROM test WHERE lang = 'de' GROUP BY path) AS de
+WHERE (en.path = de.path)
+ORDER BY ALL;
+
+
+SELECT '----------';
+
+SELECT * -- executes the query whose plans are printed above (aggregate each side, then join)
+FROM
+   (SELECT path, sum(hits) as hits FROM test WHERE lang = 'en' GROUP BY path) AS en,
+   (SELECT path, sum(hits) as hits FROM test WHERE lang = 'de' GROUP BY path) AS de
+WHERE (en.path = de.path)
+ORDER BY ALL;
+
+
+SELECT '----------';
+
+SELECT en.path, count(), sum(en.hits), sum(de.hits) -- opposite order: join the unaggregated rows first, GROUP BY after the join
+FROM
+   (SELECT * FROM test WHERE lang = 'en') AS en,
+   (SELECT * FROM test WHERE lang = 'de') AS de
+WHERE (en.path = de.path)
+GROUP BY en.path
+ORDER BY ALL;
diff --git a/tests/queries/0_stateless/03394_distributed_shuffle_join_with_filter.reference b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_filter.reference
new file mode 100644
index 000000000000..4bf486e52820
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_filter.reference
@@ -0,0 +1,36 @@
+-------------------------
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        ScatterExchange (any)
+          Expression ((Before GROUP BY + ))
+            GatherExchange
+              JoinLogical
+                ScatterExchange (by hash([__table1.src_ip]))
+                  Expression
+                    GatherExchange
+                      ReadFromMergeTree (default.test)
+                ScatterExchange (by hash([__table2.dst_ip]))
+                  BuildRuntimeFilter (Build runtime join filter on __table2.dst_ip)
+                    Filter ((WHERE + Change column names to column identifiers))
+                      GatherExchange
+                        ReadFromMergeTree (default.test)
+-------------------------
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression ((Before GROUP BY + ))
+          JoinLogical
+            ShuffleExchange (by hash([__table1.src_ip]))
+              Expression
+                ReadFromMergeTree (default.test)
+            ShuffleExchange (by hash([__table2.dst_ip]))
+              BuildRuntimeFilter (Build runtime join filter on __table2.dst_ip)
+                Filter ((WHERE + Change column names to column identifiers))
+                  ReadFromMergeTree (default.test)
+-------------------------
+1019
+1019
+1019
diff --git a/tests/queries/0_stateless/03394_distributed_shuffle_join_with_filter.sql b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_filter.sql
new file mode 100644
index 000000000000..41738ea60884
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_filter.sql
@@ -0,0 +1,40 @@
+-- Tags: no-old-analyzer
+
+-- Reset the global max_rows_to_group_by; distributed aggregation rejects a nonzero limit.
+SET max_rows_to_group_by = 0;
+SET distributed_plan_optimize_exchanges = 1;
+SET optimize_move_to_prewhere = 1;
+SET query_plan_optimize_prewhere = 1;
+SET query_plan_remove_unused_columns = 1;
+CREATE TABLE test(src_ip UInt32, dst_ip UInt32, bytes UInt64) ENGINE MergeTree() ORDER BY src_ip settings auto_statistics_types='';
+
+INSERT INTO test SELECT number%30, (number+10)%30, number%50 FROM numbers(100);
+INSERT INTO test SELECT number%30, (number+10)%30, number%50 FROM numbers(100, 100);
+
+-- The t1.src_ip != 0 condition is not moved to PREWHERE because src_ip is in the primary key.
+
+SET query_plan_join_swap_table = 0;
+SET enable_join_runtime_filters = 1;
+
+SELECT '-------------------------';
+
+EXPLAIN
+SELECT count() FROM test AS t1 JOIN test AS t2 ON t1.src_ip = t2.dst_ip WHERE t1.src_ip != 0 AND t1.bytes > 10
+SETTINGS make_distributed_plan=1, enable_parallel_replicas=0, distributed_plan_optimize_exchanges=0, distributed_plan_max_rows_to_broadcast=0;
+
+SELECT '-------------------------';
+
+EXPLAIN
+SELECT count() FROM test AS t1 JOIN test AS t2 ON t1.src_ip = t2.dst_ip WHERE t1.src_ip != 0 AND t1.bytes > 10
+SETTINGS make_distributed_plan=1, enable_parallel_replicas=0, distributed_plan_max_rows_to_broadcast=0;
+
+SELECT '-------------------------';
+
+SELECT count() FROM test AS t1 JOIN test AS t2 ON t1.src_ip = t2.dst_ip WHERE t1.src_ip != 0 AND t1.bytes > 10
+SETTINGS make_distributed_plan=1, enable_parallel_replicas=0, distributed_plan_optimize_exchanges=0, distributed_plan_default_shuffle_join_bucket_count = 3, distributed_plan_default_reader_bucket_count = 3;
+
+SELECT count() FROM test AS t1 JOIN test AS t2 ON t1.src_ip = t2.dst_ip WHERE t1.src_ip != 0 AND t1.bytes > 10
+SETTINGS make_distributed_plan=1, enable_parallel_replicas=0, distributed_plan_default_shuffle_join_bucket_count = 3, distributed_plan_default_reader_bucket_count = 3;
+
+SELECT count() FROM test AS t1 JOIN test AS t2 ON t1.src_ip = t2.dst_ip WHERE t1.src_ip != 0 AND t1.bytes > 10
+SETTINGS make_distributed_plan=0; -- non-distributed baseline; the reference expects the same count from all three SELECTs
diff --git a/tests/queries/0_stateless/03394_distributed_shuffle_join_with_in.reference b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_in.reference
new file mode 100644
index 000000000000..e2c93545e130
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_in.reference
@@ -0,0 +1,24 @@
+----------
+Expression (Project names)
+  GatherExchange (sorted by (__table1.path ASC, __table1.hits ASC, __table3.path ASC, __table3.hits ASC))
+    Sorting (Sorting for ORDER BY)
+      Expression ((Before ORDER BY + Projection))
+        JoinLogical
+          ShuffleExchange (by hash([__table1.path]))
+            Expression
+              Aggregating
+                ShuffleExchange (by hash([__table2.path]))
+                  Expression
+                    ReadFromMergeTree (default.test)
+          ShuffleExchange (by hash([__table3.path]))
+            BuildRuntimeFilter (Build runtime join filter on __table3.path)
+              Expression ((Change column names to column identifiers + (Project names + Projection)))
+                Aggregating
+                  ShuffleExchange (by hash([__table4.path]))
+                    Expression (Before GROUP BY)
+                      Expression ((WHERE + Change column names to column identifiers))
+                        ReadFromMergeTree (default.test)
+----------
+path_0	12	path_0	12
+path_1	10	path_1	8
+path_2	10	path_2	6
diff --git a/tests/queries/0_stateless/03394_distributed_shuffle_join_with_in.sql b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_in.sql
new file mode 100644
index 000000000000..6e7680024dc9
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_in.sql
@@ -0,0 +1,51 @@
+-- Tags: no-old-analyzer
+
+-- Reset the global max_rows_to_group_by; distributed aggregation rejects a nonzero limit.
+SET max_rows_to_group_by = 0;
+
+DROP TABLE IF EXISTS test;
+
+CREATE TABLE test(path String, lang String, hits UInt64) ENGINE MergeTree() ORDER BY tuple();
+
+INSERT INTO test SELECT 'path_' || number::String, 'en', number FROM numbers(5);
+INSERT INTO test SELECT 'path_' || (number%3)::String, 'de', number%4 FROM numbers(10);
+
+INSERT INTO test SELECT 'path_' || number::String, 'en', number FROM numbers(5);
+INSERT INTO test SELECT 'path_' || (number%3)::String, 'de', number%4 FROM numbers(10);
+
+SET query_plan_join_swap_table = 0;
+
+
+SET
+    optimize_move_to_prewhere = 1,
+    query_plan_optimize_prewhere = 1,
+    make_distributed_plan = 1,
+    enable_parallel_replicas = 0,
+    enable_join_runtime_filters = 1,
+    distributed_plan_default_shuffle_join_bucket_count=3,
+    distributed_plan_default_reader_bucket_count=3,
+    distributed_plan_force_exchange_kind='Streaming',
+    distributed_plan_optimize_exchanges = 1,
+    distributed_plan_max_rows_to_broadcast=0;
+
+SELECT '----------';
+
+-- Query with a constant list in the filter: col IN (val1, val2, ...).
+-- The set corresponding to the IN condition is passed as a ColumnSet.
+EXPLAIN SELECT *
+FROM
+   (SELECT path, sum(hits) as hits FROM test WHERE lang IN ('en', 'de') GROUP BY path) AS en,
+   (SELECT path, sum(hits) as hits FROM test WHERE lang = 'de' GROUP BY path) AS de
+WHERE (en.path = de.path)
+ORDER BY ALL;
+
+SELECT '----------';
+
+SELECT *
+FROM
+   (SELECT path, sum(hits) as hits FROM test WHERE lang IN ('en', 'de') GROUP BY path) AS en,
+   (SELECT path, sum(hits) as hits FROM test WHERE lang = 'de' GROUP BY path) AS de
+WHERE (en.path = de.path)
+ORDER BY ALL;
+
+DROP TABLE test;
diff --git a/tests/queries/0_stateless/03394_distributed_shuffle_join_with_prewhere.reference b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_prewhere.reference
new file mode 100644
index 000000000000..31ba7f564164
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_prewhere.reference
@@ -0,0 +1,12 @@
+5
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+          JoinLogical
+          Join: en[N] ⋈ de[N]
+            ShuffleExchange (by hash([__table1.path]))
+                ReadFromMergeTree (default.test)
+                  Prewhere filter column: and(equals(__table1.lang, \'en\'_String), __applyFilter(_runtime_filter_UNIQ_ID_0, __table1.path)) (removed)
+            ShuffleExchange (by hash([__table2.path]))
+                  ReadFromMergeTree (default.test)
+                    Prewhere filter column: equals(__table2.lang, \'de\'_String) (removed)
diff --git a/tests/queries/0_stateless/03394_distributed_shuffle_join_with_prewhere.sql b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_prewhere.sql
new file mode 100644
index 000000000000..f5ad87493bb1
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_shuffle_join_with_prewhere.sql
@@ -0,0 +1,40 @@
+-- Tags: no-old-analyzer
+
+-- Reset the global max_rows_to_group_by; distributed aggregation rejects a nonzero limit.
+SET max_rows_to_group_by = 0;
+SET distributed_plan_optimize_exchanges = 1;
+
+CREATE TABLE test(path String, lang String, hits UInt64) ENGINE MergeTree()
+ORDER BY tuple()
+SETTINGS auto_statistics_types = 'tdigest,uniq,minmax';
+
+SET materialize_statistics_on_insert = 1;
+
+INSERT INTO test SELECT 'path' || number::String, 'en', number FROM numbers(5);
+INSERT INTO test SELECT 'path' || number::String, 'de', number FROM numbers(10);
+INSERT INTO test SELECT 'path' || number::String, 'ua', number FROM numbers(15);
+INSERT INTO test SELECT 'path' || number::String, 'jp', number FROM numbers(20);
+
+SET query_plan_join_swap_table = 0;
+
+SET
+    make_distributed_plan = 1,
+    enable_parallel_replicas = 0,
+    distributed_plan_default_shuffle_join_bucket_count=3,
+    distributed_plan_default_reader_bucket_count=3,
+    distributed_plan_force_exchange_kind='Streaming',
+    distributed_plan_max_rows_to_broadcast=0;
+
+SET enable_join_runtime_filters=1;
+SET query_plan_optimize_prewhere = 1;
+SET optimize_move_to_prewhere = 1;
+SET query_plan_optimize_join_order_limit = 10;
+SET use_statistics = 1, use_statistics_cache = 1;
+
+SELECT count() FROM test AS en, test AS de WHERE (en.path = de.path) AND (en.lang = 'en') AND (de.lang = 'de'); -- the reference expects 5
+
+SELECT REGEXP_REPLACE(REGEXP_REPLACE(explain, '_runtime_filter_\\d+', '_runtime_filter_UNIQ_ID'), '\\[\\d+\\]', '[N]') AS explain FROM ( -- rewrites runtime filter ids to _runtime_filter_UNIQ_ID and bracketed numbers to [N]; presumably because they vary between runs
+    EXPLAIN actions = 1 SELECT count() FROM test AS en, test AS de WHERE (en.path = de.path) AND (en.lang = 'en') AND (de.lang = 'de')
+) WHERE -- keep only the EXPLAIN lines that match one of the patterns below
+    explain LIKE '%Join%' OR explain LIKE '%ReadFrom%' OR explain LIKE '%Aggregating%' OR explain LIKE '%Merging%' OR explain LIKE '%filter column%'
+    OR explain LIKE '%Shuffle%' OR explain LIKE '%Broadcast%' OR explain LIKE '%Scatter%' OR explain LIKE '%Gather%';
diff --git a/tests/queries/0_stateless/03394_distributed_sort.reference b/tests/queries/0_stateless/03394_distributed_sort.reference
new file mode 100644
index 000000000000..aea91a2d6337
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_sort.reference
@@ -0,0 +1,19 @@
+0	3	18
+1	3	13
+1	4	19
+2	4	14
+Expression (Project names)
+  GatherExchange (sorted by (__table1.dst_ip ASC, __table1.src_ip ASC, __table1.bytes ASC))
+    Sorting (Sorting for ORDER BY)
+      ScatterExchange (any scatter)
+        Expression ((Before ORDER BY + Projection))
+          Expression ((WHERE + Change column names to column identifiers))
+            ReadFromMergeTree (default.test)
+------------------
+Expression (Project names)
+  GatherExchange (sorted by (__table1.dst_ip ASC, __table1.src_ip ASC, __table1.bytes ASC))
+    Sorting (Sorting for ORDER BY)
+      ScatterExchange (any scatter)
+        Expression ((Before ORDER BY + Projection))
+          Expression ((WHERE + Change column names to column identifiers))
+            ReadFromMergeTree (default.test)
diff --git a/tests/queries/0_stateless/03394_distributed_sort.sh b/tests/queries/0_stateless/03394_distributed_sort.sh
new file mode 100755
index 000000000000..71c55a1122f5
--- /dev/null
+++ b/tests/queries/0_stateless/03394_distributed_sort.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# Tags: no-old-analyzer
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+$CLICKHOUSE_CLIENT -q "CREATE TABLE test(src_ip UInt32, dst_ip UInt32, bytes UInt64) ENGINE MergeTree() ORDER BY src_ip"
+
+$CLICKHOUSE_CLIENT -q "INSERT INTO test SELECT number%3, number%4, number FROM numbers(10)"
+$CLICKHOUSE_CLIENT -q "INSERT INTO test SELECT number%5, number%3, number FROM numbers(10, 10)"
+
+$CLICKHOUSE_CLIENT -q "
+SELECT dst_ip, src_ip, bytes
+FROM test
+WHERE bytes > 5 AND src_ip > 2
+ORDER BY dst_ip, src_ip, bytes
+SETTINGS make_distributed_plan=1, enable_parallel_replicas=0"
+
+# The WHERE step may appear as either Expression or Filter depending on optimizer settings.
+# Normalize to Expression so the test is deterministic with randomized settings.
+$CLICKHOUSE_CLIENT -q "
+EXPLAIN SELECT dst_ip, src_ip, bytes
+FROM test
+WHERE bytes > 5 AND src_ip > 2
+ORDER BY dst_ip, src_ip, bytes
+SETTINGS make_distributed_plan=1, enable_parallel_replicas=0, distributed_plan_optimize_exchanges=0" | sed 's/Filter ((WHERE/Expression ((WHERE/'
+
+echo '------------------'
+
+$CLICKHOUSE_CLIENT -q "
+EXPLAIN SELECT dst_ip, src_ip, bytes
+FROM test
+WHERE bytes > 5 AND src_ip > 2
+ORDER BY dst_ip, src_ip, bytes
+SETTINGS make_distributed_plan=1, enable_parallel_replicas=0, distributed_plan_optimize_exchanges=1" | sed 's/Filter ((WHERE/Expression ((WHERE/'
diff --git a/tests/queries/0_stateless/04097_distributed_join_kinds.reference b/tests/queries/0_stateless/04097_distributed_join_kinds.reference
new file mode 100644
index 000000000000..6d490df27963
--- /dev/null
+++ b/tests/queries/0_stateless/04097_distributed_join_kinds.reference
@@ -0,0 +1,195 @@
+-- INNER JOIN (broadcast)
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            ScatterExchange (any scatter)
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_orders)
+            BroadcastExchange
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_items)
+350	91612.5
+350	91612.5
+-- LEFT JOIN (broadcast)
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            ScatterExchange (any scatter)
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_orders)
+            BroadcastExchange
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_items)
+400
+400
+-- RIGHT JOIN (shuffle)
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            ScatterExchange (by hash([__table1.order_id]))
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_orders)
+            ScatterExchange (by hash([__table2.order_id]))
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_items)
+400
+400
+-- FULL JOIN (shuffle)
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            ScatterExchange (by hash([__table1.order_id]))
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_orders)
+            ScatterExchange (by hash([__table2.order_id]))
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_items)
+450
+450
+-- LEFT SEMI JOIN (broadcast)
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            ScatterExchange (any scatter)
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_orders)
+            BroadcastExchange
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_items)
+350
+350
+-- LEFT ANTI JOIN (broadcast)
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            ScatterExchange (any scatter)
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_orders)
+            BroadcastExchange
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_items)
+50
+50
+-- RIGHT SEMI JOIN (shuffle)
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            ScatterExchange (by hash([__table1.order_id]))
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_orders)
+            ScatterExchange (by hash([__table2.order_id]))
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_items)
+350
+350
+-- RIGHT ANTI JOIN (shuffle)
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            ScatterExchange (by hash([__table1.order_id]))
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_orders)
+            ScatterExchange (by hash([__table2.order_id]))
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_items)
+50
+50
+-- ANY INNER JOIN (broadcast)
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            ScatterExchange (any scatter)
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_orders)
+            BroadcastExchange
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_items)
+350
+350
+-- ANY LEFT JOIN (broadcast)
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            ScatterExchange (any scatter)
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_orders)
+            BroadcastExchange
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.dist_items)
+400
+400
+-- ASOF JOIN (broadcast)
+Expression (Project names)
+  GatherExchange (sorted by (__table1.symbol ASC))
+    Sorting (Sorting for ORDER BY)
+      Expression ((Before ORDER BY + Projection))
+        Aggregating
+          ShuffleExchange (by hash([__table1.symbol]))
+            Expression (Before GROUP BY)
+              JoinLogical
+                ScatterExchange (any scatter)
+                  Expression (Change column names to column identifiers)
+                    ReadFromMergeTree (default.dist_trades)
+                BroadcastExchange
+                  Expression (Change column names to column identifiers)
+                    ReadFromMergeTree (default.dist_quotes)
+S0	40	4390	4370
+S1	40	4394	4382
+S2	40	4398	4384
+S3	40	4402	4386
+S4	40	4406	4388
+-- ASOF JOIN (shuffle)
+Expression (Project names)
+  GatherExchange (sorted by (__table1.symbol ASC))
+    Sorting (Sorting for ORDER BY)
+      Expression ((Before ORDER BY + Projection))
+        Aggregating
+          ShuffleExchange (by hash([__table1.symbol]))
+            Expression (Before GROUP BY)
+              JoinLogical
+                ShuffleExchange (by hash([__table1.symbol]))
+                  Expression (Change column names to column identifiers)
+                    ReadFromMergeTree (default.dist_trades)
+                ShuffleExchange (by hash([__table2.symbol]))
+                  Expression (Change column names to column identifiers)
+                    ReadFromMergeTree (default.dist_quotes)
+S0	40	4390	4370
+S1	40	4394	4382
+S2	40	4398	4384
+S3	40	4402	4386
+S4	40	4406	4388
+S0	40	4390	4370
+S1	40	4394	4382
+S2	40	4398	4384
+S3	40	4402	4386
+S4	40	4406	4388
diff --git a/tests/queries/0_stateless/04097_distributed_join_kinds.sql b/tests/queries/0_stateless/04097_distributed_join_kinds.sql
new file mode 100644
index 000000000000..8d0b53168666
--- /dev/null
+++ b/tests/queries/0_stateless/04097_distributed_join_kinds.sql
@@ -0,0 +1,242 @@
+-- Test that the old heuristic-based distributed join (tryMakeDistributedJoin)
+-- supports all join kinds and picks the correct distribution strategy.
+--
+-- dist_items (right side) has 400 rows, well below the broadcast threshold
+-- (20000), so broadcast is chosen when safe.  For RIGHT and FULL joins,
+-- broadcast is blocked because the right side can produce unmatched output
+-- rows that would be duplicated across workers — shuffle is used instead.
+--
+-- Each join kind is tested with:
+--   1. EXPLAIN to verify the chosen strategy (Broadcast vs Shuffle)
+--   2. Distributed execution result
+--   3. Single-node baseline for correctness comparison
+
+SET enable_analyzer = 1;
+SET make_distributed_plan = 1;
+SET enable_parallel_replicas = 0;
+SET query_plan_use_new_logical_join_step = 1;
+SET distributed_plan_default_shuffle_join_bucket_count = 4;
+SET distributed_plan_force_exchange_kind = 'Persisted';
+-- Pin settings that affect plan shape to make EXPLAIN output stable.
+-- Reset the global max_rows_to_group_by; distributed aggregation rejects a nonzero limit.
+SET max_rows_to_group_by = 0;
+SET enable_join_runtime_filters = 0;
+SET optimize_move_to_prewhere = 0;
+SET query_plan_convert_outer_join_to_inner_join = 0;
+
+DROP TABLE IF EXISTS dist_orders;
+DROP TABLE IF EXISTS dist_items;
+
+CREATE TABLE dist_orders (
+    order_id UInt64,
+    customer String
+) ENGINE = MergeTree ORDER BY order_id
+  SETTINGS index_granularity = 8192, auto_statistics_types = '';
+
+CREATE TABLE dist_items (
+    item_id UInt64,
+    order_id UInt64,
+    amount Decimal(10, 2)
+) ENGINE = MergeTree ORDER BY item_id
+  SETTINGS index_granularity = 8192, auto_statistics_types = '';
+
+-- 4 parts for dist_orders: orders 0..399
+SYSTEM STOP MERGES dist_orders;
+INSERT INTO dist_orders SELECT number, 'C' || toString(number % 10) FROM numbers(100);
+INSERT INTO dist_orders SELECT number + 100, 'C' || toString((number + 100) % 10) FROM numbers(100);
+INSERT INTO dist_orders SELECT number + 200, 'C' || toString((number + 200) % 10) FROM numbers(100);
+INSERT INTO dist_orders SELECT number + 300, 'C' || toString((number + 300) % 10) FROM numbers(100);
+
+-- 4 parts for dist_items: items for orders 50..449 (overlap: 50..399 match, 400..449 don't)
+SYSTEM STOP MERGES dist_items;
+INSERT INTO dist_items SELECT number, number + 50, toDecimal64(number * 1.5, 2) FROM numbers(100);
+INSERT INTO dist_items SELECT number + 100, number + 150, toDecimal64((number + 100) * 1.5, 2) FROM numbers(100);
+INSERT INTO dist_items SELECT number + 200, number + 250, toDecimal64((number + 200) * 1.5, 2) FROM numbers(100);
+INSERT INTO dist_items SELECT number + 300, number + 350, toDecimal64((number + 300) * 1.5, 2) FROM numbers(100);
+
+
+-- INNER JOIN: broadcast (safe, right side small)
+SELECT '-- INNER JOIN (broadcast)';
+EXPLAIN PLAN SELECT count(), sum(amount)
+FROM dist_orders INNER JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count(), sum(amount)
+FROM dist_orders INNER JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count(), sum(amount)
+FROM dist_orders INNER JOIN dist_items ON dist_orders.order_id = dist_items.order_id
+SETTINGS make_distributed_plan = 0;
+
+
+-- LEFT JOIN: broadcast (safe, right side small)
+SELECT '-- LEFT JOIN (broadcast)';
+EXPLAIN PLAN SELECT count()
+FROM dist_orders LEFT JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders LEFT JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders LEFT JOIN dist_items ON dist_orders.order_id = dist_items.order_id
+SETTINGS make_distributed_plan = 0;
+
+
+-- RIGHT JOIN: shuffle (broadcast blocked — right side produces unmatched rows)
+SELECT '-- RIGHT JOIN (shuffle)';
+EXPLAIN PLAN SELECT count()
+FROM dist_orders RIGHT JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders RIGHT JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders RIGHT JOIN dist_items ON dist_orders.order_id = dist_items.order_id
+SETTINGS make_distributed_plan = 0;
+
+
+-- FULL JOIN: shuffle (broadcast blocked — both sides produce unmatched rows)
+SELECT '-- FULL JOIN (shuffle)';
+EXPLAIN PLAN SELECT count()
+FROM dist_orders FULL JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders FULL JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders FULL JOIN dist_items ON dist_orders.order_id = dist_items.order_id
+SETTINGS make_distributed_plan = 0;
+
+
+-- LEFT SEMI JOIN: broadcast (safe, right side small)
+SELECT '-- LEFT SEMI JOIN (broadcast)';
+EXPLAIN PLAN SELECT count()
+FROM dist_orders LEFT SEMI JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders LEFT SEMI JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders LEFT SEMI JOIN dist_items ON dist_orders.order_id = dist_items.order_id
+SETTINGS make_distributed_plan = 0;
+
+
+-- LEFT ANTI JOIN: broadcast (safe, right side small)
+SELECT '-- LEFT ANTI JOIN (broadcast)';
+EXPLAIN PLAN SELECT count()
+FROM dist_orders LEFT ANTI JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders LEFT ANTI JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders LEFT ANTI JOIN dist_items ON dist_orders.order_id = dist_items.order_id
+SETTINGS make_distributed_plan = 0;
+
+
+-- RIGHT SEMI JOIN: shuffle (broadcast blocked — kind is RIGHT)
+SELECT '-- RIGHT SEMI JOIN (shuffle)';
+EXPLAIN PLAN SELECT count()
+FROM dist_orders RIGHT SEMI JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders RIGHT SEMI JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders RIGHT SEMI JOIN dist_items ON dist_orders.order_id = dist_items.order_id
+SETTINGS make_distributed_plan = 0;
+
+
+-- RIGHT ANTI JOIN: shuffle (broadcast blocked — kind is RIGHT)
+SELECT '-- RIGHT ANTI JOIN (shuffle)';
+EXPLAIN PLAN SELECT count()
+FROM dist_orders RIGHT ANTI JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders RIGHT ANTI JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders RIGHT ANTI JOIN dist_items ON dist_orders.order_id = dist_items.order_id
+SETTINGS make_distributed_plan = 0;
+
+
+-- ANY INNER JOIN: broadcast (safe, right side small)
+SELECT '-- ANY INNER JOIN (broadcast)';
+EXPLAIN PLAN SELECT count()
+FROM dist_orders ANY INNER JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders ANY INNER JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders ANY INNER JOIN dist_items ON dist_orders.order_id = dist_items.order_id
+SETTINGS make_distributed_plan = 0;
+
+
+-- ANY LEFT JOIN: broadcast (safe, right side small)
+SELECT '-- ANY LEFT JOIN (broadcast)';
+EXPLAIN PLAN SELECT count()
+FROM dist_orders ANY LEFT JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders ANY LEFT JOIN dist_items ON dist_orders.order_id = dist_items.order_id;
+
+SELECT count()
+FROM dist_orders ANY LEFT JOIN dist_items ON dist_orders.order_id = dist_items.order_id
+SETTINGS make_distributed_plan = 0;
+
+
+-- ASOF JOIN: broadcast by default, shuffle when forced (the equality predicate is sufficient
+-- for shuffle partitioning; the HashJoin ASOF implementation sorts the right side internally
+-- per equality-key bucket, so input order after shuffle does not matter)
+DROP TABLE IF EXISTS dist_trades;
+DROP TABLE IF EXISTS dist_quotes;
+
+CREATE TABLE dist_trades (symbol String, ts DateTime, price Decimal(10, 2))
+ENGINE = MergeTree ORDER BY (symbol, ts)
+SETTINGS index_granularity = 8192, auto_statistics_types = '';
+
+CREATE TABLE dist_quotes (symbol String, ts DateTime, bid Decimal(10, 2))
+ENGINE = MergeTree ORDER BY (symbol, ts)
+SETTINGS index_granularity = 8192, auto_statistics_types = '';
+
+-- Multiple symbols across multiple parts to exercise shuffle partitioning.
+SYSTEM STOP MERGES dist_trades;
+INSERT INTO dist_trades SELECT 'S' || toString(number % 5), toDateTime('2024-01-01 10:00:00') + number * 60, toDecimal64(100 + number * 0.1, 2) FROM numbers(100);
+INSERT INTO dist_trades SELECT 'S' || toString(number % 5), toDateTime('2024-01-01 10:00:00') + (number + 100) * 60, toDecimal64(100 + (number + 100) * 0.1, 2) FROM numbers(100);
+
+SYSTEM STOP MERGES dist_quotes;
+INSERT INTO dist_quotes SELECT 'S' || toString(number % 5), toDateTime('2024-01-01 09:58:00') + number * 30, toDecimal64(99.5 + number * 0.05, 2) FROM numbers(200);
+INSERT INTO dist_quotes SELECT 'S' || toString(number % 5), toDateTime('2024-01-01 09:58:00') + (number + 200) * 30, toDecimal64(99.5 + (number + 200) * 0.05, 2) FROM numbers(200);
+
+SELECT '-- ASOF JOIN (broadcast)';
+EXPLAIN PLAN SELECT symbol, count(), sum(price), sum(bid)
+FROM dist_trades ASOF LEFT JOIN dist_quotes ON dist_trades.symbol = dist_quotes.symbol AND dist_trades.ts >= dist_quotes.ts
+GROUP BY symbol ORDER BY symbol;
+
+SELECT symbol, count(), sum(price), sum(bid)
+FROM dist_trades ASOF LEFT JOIN dist_quotes ON dist_trades.symbol = dist_quotes.symbol AND dist_trades.ts >= dist_quotes.ts
+GROUP BY symbol ORDER BY symbol;
+
+-- Force shuffle by setting broadcast threshold to 0.
+SELECT '-- ASOF JOIN (shuffle)';
+EXPLAIN PLAN SELECT symbol, count(), sum(price), sum(bid)
+FROM dist_trades ASOF LEFT JOIN dist_quotes ON dist_trades.symbol = dist_quotes.symbol AND dist_trades.ts >= dist_quotes.ts
+GROUP BY symbol ORDER BY symbol
+SETTINGS distributed_plan_max_rows_to_broadcast = 0;
+
+SELECT symbol, count(), sum(price), sum(bid)
+FROM dist_trades ASOF LEFT JOIN dist_quotes ON dist_trades.symbol = dist_quotes.symbol AND dist_trades.ts >= dist_quotes.ts
+GROUP BY symbol ORDER BY symbol
+SETTINGS distributed_plan_max_rows_to_broadcast = 0;
+
+-- Single-node baseline.
+SELECT symbol, count(), sum(price), sum(bid)
+FROM dist_trades ASOF LEFT JOIN dist_quotes ON dist_trades.symbol = dist_quotes.symbol AND dist_trades.ts >= dist_quotes.ts
+GROUP BY symbol ORDER BY symbol
+SETTINGS make_distributed_plan = 0;
+
+DROP TABLE dist_trades;
+DROP TABLE dist_quotes;
+
+DROP TABLE dist_orders;
+DROP TABLE dist_items;
diff --git a/tests/queries/0_stateless/04105_distributed_final_replacing.reference b/tests/queries/0_stateless/04105_distributed_final_replacing.reference
new file mode 100644
index 000000000000..51f104a032a8
--- /dev/null
+++ b/tests/queries/0_stateless/04105_distributed_final_replacing.reference
@@ -0,0 +1,4 @@
+-- Local
+100000	200000	100000
+-- Distributed
+100000	200000	100000
diff --git a/tests/queries/0_stateless/04105_distributed_final_replacing.sql b/tests/queries/0_stateless/04105_distributed_final_replacing.sql
new file mode 100644
index 000000000000..8e45f42a33da
--- /dev/null
+++ b/tests/queries/0_stateless/04105_distributed_final_replacing.sql
@@ -0,0 +1,24 @@
+-- Tags: no-old-analyzer
+-- Regression test: distributed query plan on SELECT FINAL from engines with specialized merging
+-- (Replacing, Collapsing, ...) must not reroute same-sort-key rows to different buckets, or
+-- deduplication is broken.
+
+DROP TABLE IF EXISTS t_replacing_final_correctness;
+
+CREATE TABLE t_replacing_final_correctness(pk UInt64, version UInt64, val String) ENGINE = ReplacingMergeTree(version) ORDER BY pk;
+
+SYSTEM STOP MERGES t_replacing_final_correctness;
+
+-- Two parts with full PK overlap and different versions.
+INSERT INTO t_replacing_final_correctness SELECT number, 1, 'old' FROM numbers(100000);
+INSERT INTO t_replacing_final_correctness SELECT number, 2, 'new' FROM numbers(100000);
+
+SELECT '-- Local';
+SELECT count(), sum(version), uniqExact(pk) FROM t_replacing_final_correctness FINAL;
+
+SELECT '-- Distributed';
+-- Reset the global max_rows_to_group_by; distributed aggregation rejects a nonzero limit.
+SELECT count(), sum(version), uniqExact(pk) FROM t_replacing_final_correctness FINAL
+SETTINGS make_distributed_plan = 1, enable_parallel_replicas = 0, max_rows_to_group_by = 0;
+
+DROP TABLE t_replacing_final_correctness;
diff --git a/tests/queries/0_stateless/04305_distributed_shuffle_join_type_mixed.reference b/tests/queries/0_stateless/04305_distributed_shuffle_join_type_mixed.reference
new file mode 100644
index 000000000000..f2b3ff1fe936
--- /dev/null
+++ b/tests/queries/0_stateless/04305_distributed_shuffle_join_type_mixed.reference
@@ -0,0 +1,17 @@
+-- Local
+1000000
+-- Distributed plan
+Expression ((Project names + Projection))
+  MergingAggregated (merge)
+    GatherExchange
+      Aggregating (partial)
+        Expression (Before GROUP BY)
+          JoinLogical
+            ShuffleExchange (by hash([__table1.k]))
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.t_shuffle_join_left)
+            ShuffleExchange (by hash([__table2.k]))
+              Expression (Change column names to column identifiers)
+                ReadFromMergeTree (default.t_shuffle_join_right)
+-- Distributed
+1000000
diff --git a/tests/queries/0_stateless/04305_distributed_shuffle_join_type_mixed.sql b/tests/queries/0_stateless/04305_distributed_shuffle_join_type_mixed.sql
new file mode 100644
index 000000000000..cf4ed6dc0364
--- /dev/null
+++ b/tests/queries/0_stateless/04305_distributed_shuffle_join_type_mixed.sql
@@ -0,0 +1,43 @@
+-- Tags: no-old-analyzer
+-- no-old-analyzer: make_distributed_plan requires the analyzer.
+
+-- Regression test: shuffle-join key types differ on left and right side.
+-- Without casting to a common supertype, the scatter step on each side uses different
+-- hashing because of different physical types, so matching rows are routed to different
+-- buckets and the join silently drops them.
+
+-- Reset the global max_rows_to_group_by; distributed aggregation rejects a nonzero limit.
+SET max_rows_to_group_by = 0;
+
+DROP TABLE IF EXISTS t_shuffle_join_left;
+DROP TABLE IF EXISTS t_shuffle_join_right;
+
+CREATE TABLE t_shuffle_join_left  (k Int8,  v UInt32) ENGINE = MergeTree ORDER BY k;
+CREATE TABLE t_shuffle_join_right (k Int64, v UInt32) ENGINE = MergeTree ORDER BY k;
+
+INSERT INTO t_shuffle_join_left  SELECT (number % 100) - 50, number FROM numbers(10000);
+INSERT INTO t_shuffle_join_right SELECT (number % 100) - 50, number * 2 FROM numbers(10000);
+
+SELECT '-- Local';
+SELECT count() FROM t_shuffle_join_left AS l JOIN t_shuffle_join_right AS r ON l.k = r.k;
+
+SELECT '-- Distributed plan';
+EXPLAIN SELECT count() FROM t_shuffle_join_left AS l JOIN t_shuffle_join_right AS r ON l.k = r.k
+SETTINGS
+    make_distributed_plan = 1,
+    enable_parallel_replicas = 0,
+    distributed_plan_execute_locally = 1,
+    distributed_plan_max_rows_to_broadcast = 0,
+    enable_join_runtime_filters = 0;
+
+SELECT '-- Distributed';
+SELECT count() FROM t_shuffle_join_left AS l JOIN t_shuffle_join_right AS r ON l.k = r.k
+SETTINGS
+    make_distributed_plan = 1,
+    enable_parallel_replicas = 0,
+    distributed_plan_execute_locally = 1,
+    distributed_plan_max_rows_to_broadcast = 0,
+    enable_join_runtime_filters = 0;
+
+DROP TABLE t_shuffle_join_left;
+DROP TABLE t_shuffle_join_right;
diff --git a/tests/queries/0_stateless/04306_distributed_aggregation_correctness_guards.reference b/tests/queries/0_stateless/04306_distributed_aggregation_correctness_guards.reference
new file mode 100644
index 000000000000..01dd3c633a29
--- /dev/null
+++ b/tests/queries/0_stateless/04306_distributed_aggregation_correctness_guards.reference
@@ -0,0 +1,2 @@
+-- GROUPING SETS rejected
+-- max_rows_to_group_by rejected
diff --git a/tests/queries/0_stateless/04306_distributed_aggregation_correctness_guards.sql b/tests/queries/0_stateless/04306_distributed_aggregation_correctness_guards.sql
new file mode 100644
index 000000000000..e536ffa5b122
--- /dev/null
+++ b/tests/queries/0_stateless/04306_distributed_aggregation_correctness_guards.sql
@@ -0,0 +1,25 @@
+-- Tags: no-old-analyzer
+-- no-old-analyzer: make_distributed_plan requires the analyzer.
+
+-- Regression test: make_distributed_plan rejects aggregations it cannot distribute correctly,
+-- rather than silently running them single-node.
+--   * GROUPING SETS: shuffle scatters by the full key set, so subtotals (over key subsets) would be
+--     produced independently in several buckets and duplicated.
+--   * A global GROUP BY limit (max_rows_to_group_by) cannot be enforced once split per bucket.
+
+DROP TABLE IF EXISTS t_agg_guard;
+
+CREATE TABLE t_agg_guard (a UInt32, b UInt32, v UInt32) ENGINE = MergeTree ORDER BY (a, b);
+INSERT INTO t_agg_guard SELECT number % 10, number % 7, number FROM numbers(100000);
+
+SET make_distributed_plan = 1, enable_parallel_replicas = 0, distributed_plan_execute_locally = 1,
+    distributed_plan_max_rows_to_broadcast = 1000000000, enable_join_runtime_filters = 0;
+
+SELECT '-- GROUPING SETS rejected';
+SELECT a, b, sum(v) AS s FROM t_agg_guard GROUP BY GROUPING SETS ((a), (b), ()); -- { serverError SUPPORT_IS_DISABLED }
+
+SELECT '-- max_rows_to_group_by rejected';
+SELECT a, sum(v) FROM t_agg_guard GROUP BY a
+SETTINGS max_rows_to_group_by = 5; -- { serverError SUPPORT_IS_DISABLED }
+
+DROP TABLE t_agg_guard;
diff --git a/tests/queries/0_stateless/04307_distributed_read_error_terminates.reference b/tests/queries/0_stateless/04307_distributed_read_error_terminates.reference
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/queries/0_stateless/04307_distributed_read_error_terminates.sql b/tests/queries/0_stateless/04307_distributed_read_error_terminates.sql
new file mode 100644
index 000000000000..458134772560
--- /dev/null
+++ b/tests/queries/0_stateless/04307_distributed_read_error_terminates.sql
@@ -0,0 +1,20 @@
+-- Tags: no-old-analyzer
+-- no-old-analyzer: make_distributed_plan requires the analyzer.
+
+-- Regression test: when a distributed read errors mid-flight, the query must terminate with that
+-- error rather than hang. The local in-memory exchanges are cancelled on teardown so tasks waiting
+-- for input do not block forever. max_rows_to_read trips during the distributed read while it is
+-- still feeding the downstream aggregation.
+
+DROP TABLE IF EXISTS t_distr_read_error;
+
+CREATE TABLE t_distr_read_error (a UInt32, v UInt32) ENGINE = MergeTree ORDER BY a;
+INSERT INTO t_distr_read_error SELECT number % 10, number FROM numbers(100000);
+
+SELECT a, sum(v) FROM t_distr_read_error GROUP BY a
+SETTINGS make_distributed_plan = 1, enable_parallel_replicas = 0, distributed_plan_execute_locally = 1,
+    distributed_plan_max_rows_to_broadcast = 0, distributed_plan_default_reader_bucket_count = 4,
+    enable_join_runtime_filters = 0, max_rows_to_group_by = 0,
+    max_rows_to_read = 10, read_overflow_mode = 'throw'; -- { serverError TOO_MANY_ROWS }
+
+DROP TABLE t_distr_read_error;
diff --git a/tests/queries/0_stateless/04308_distributed_totals_rollup_cube_not_distributed.reference b/tests/queries/0_stateless/04308_distributed_totals_rollup_cube_not_distributed.reference
new file mode 100644
index 000000000000..bf7b5aaba123
--- /dev/null
+++ b/tests/queries/0_stateless/04308_distributed_totals_rollup_cube_not_distributed.reference
@@ -0,0 +1,3 @@
+-- WITH TOTALS
+-- ROLLUP
+-- CUBE
diff --git a/tests/queries/0_stateless/04308_distributed_totals_rollup_cube_not_distributed.sql b/tests/queries/0_stateless/04308_distributed_totals_rollup_cube_not_distributed.sql
new file mode 100644
index 000000000000..7fff5bd8f102
--- /dev/null
+++ b/tests/queries/0_stateless/04308_distributed_totals_rollup_cube_not_distributed.sql
@@ -0,0 +1,25 @@
+-- Tags: no-old-analyzer
+-- no-old-analyzer: make_distributed_plan requires the analyzer.
+
+-- Regression test: WITH TOTALS / ROLLUP / CUBE produce extra streams (a totals stream, or subtotal
+-- rows from a Rollup/Cube step) that the distributed exchange protocol does not carry. make_distributed_plan
+-- rejects such plans rather than silently running them single-node.
+
+DROP TABLE IF EXISTS t_totals_guard;
+
+CREATE TABLE t_totals_guard (a UInt32, v UInt32) ENGINE = MergeTree ORDER BY a;
+INSERT INTO t_totals_guard SELECT number % 10, number FROM numbers(100000);
+
+SET make_distributed_plan = 1, enable_parallel_replicas = 0, distributed_plan_execute_locally = 1,
+    distributed_plan_max_rows_to_broadcast = 0, enable_join_runtime_filters = 0, max_rows_to_group_by = 0;
+
+SELECT '-- WITH TOTALS';
+SELECT a, sum(v) FROM t_totals_guard GROUP BY a WITH TOTALS ORDER BY a; -- { serverError SUPPORT_IS_DISABLED }
+
+SELECT '-- ROLLUP';
+SELECT a, sum(v) FROM t_totals_guard GROUP BY a WITH ROLLUP ORDER BY a; -- { serverError SUPPORT_IS_DISABLED }
+
+SELECT '-- CUBE';
+SELECT a, sum(v) FROM t_totals_guard GROUP BY a WITH CUBE ORDER BY a; -- { serverError SUPPORT_IS_DISABLED }
+
+DROP TABLE t_totals_guard;
diff --git a/tests/queries/0_stateless/04309_distributed_aggregation_persisted_exchange.reference b/tests/queries/0_stateless/04309_distributed_aggregation_persisted_exchange.reference
new file mode 100644
index 000000000000..cf0981de79a3
--- /dev/null
+++ b/tests/queries/0_stateless/04309_distributed_aggregation_persisted_exchange.reference
@@ -0,0 +1,5 @@
+0	100
+1	100
+2	100
+3	100
+4	100
diff --git a/tests/queries/0_stateless/04309_distributed_aggregation_persisted_exchange.sql b/tests/queries/0_stateless/04309_distributed_aggregation_persisted_exchange.sql
new file mode 100644
index 000000000000..f83f65843e78
--- /dev/null
+++ b/tests/queries/0_stateless/04309_distributed_aggregation_persisted_exchange.sql
@@ -0,0 +1,18 @@
+-- Tags: no-old-analyzer
+
+-- Regression test: distributed partial aggregation delivered through a persisted exchange must not
+-- deadlock. The result reader drains final_result after the driver (the executor) has finished, so
+-- the query's in-memory exchanges must outlive the executor rather than be removed on its completion.
+-- A high distributed_plan_max_rows_to_broadcast keeps the aggregation on the partial+merge path.
+
+DROP TABLE IF EXISTS t_agg_persisted;
+
+CREATE TABLE t_agg_persisted (k UInt32, v UInt32) ENGINE = MergeTree ORDER BY k;
+INSERT INTO t_agg_persisted SELECT number % 500, number FROM numbers(50000);
+
+SELECT k, count() AS c FROM t_agg_persisted GROUP BY k ORDER BY k LIMIT 5
+SETTINGS make_distributed_plan = 1, enable_parallel_replicas = 0, distributed_plan_execute_locally = 1,
+    distributed_plan_max_rows_to_broadcast = 1000000000, distributed_plan_force_exchange_kind = 'Persisted',
+    enable_join_runtime_filters = 0, max_rows_to_group_by = 0;
+
+DROP TABLE t_agg_persisted;
diff --git a/tests/queries/0_stateless/04310_distributed_unserializable_step_rejected.reference b/tests/queries/0_stateless/04310_distributed_unserializable_step_rejected.reference
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/queries/0_stateless/04310_distributed_unserializable_step_rejected.sql b/tests/queries/0_stateless/04310_distributed_unserializable_step_rejected.sql
new file mode 100644
index 000000000000..b15249382c68
--- /dev/null
+++ b/tests/queries/0_stateless/04310_distributed_unserializable_step_rejected.sql
@@ -0,0 +1,17 @@
+-- Tags: no-old-analyzer
+
+-- Regression test: make_distributed_plan fails early and explicitly when a stage fragment contains a
+-- step that cannot be serialized for remote execution (here a window function), instead of failing
+-- late mid-execution with a generic "Method serialize is not implemented" error.
+
+DROP TABLE IF EXISTS t_unserializable;
+
+CREATE TABLE t_unserializable (a UInt32, v UInt32) ENGINE = MergeTree ORDER BY a;
+INSERT INTO t_unserializable SELECT number % 10, number FROM numbers(100000);
+
+SELECT a, sum(v) OVER (PARTITION BY a) AS w FROM t_unserializable
+SETTINGS make_distributed_plan = 1, enable_parallel_replicas = 0, distributed_plan_execute_locally = 1,
+    distributed_plan_max_rows_to_broadcast = 0, distributed_plan_default_reader_bucket_count = 4,
+    enable_join_runtime_filters = 0, max_rows_to_group_by = 0; -- { serverError SUPPORT_IS_DISABLED }
+
+DROP TABLE t_unserializable;
diff --git a/tests/queries/0_stateless/04319_distributed_plan_set_operation_const_columns.reference b/tests/queries/0_stateless/04319_distributed_plan_set_operation_const_columns.reference
new file mode 100644
index 000000000000..3a48788e7543
--- /dev/null
+++ b/tests/queries/0_stateless/04319_distributed_plan_set_operation_const_columns.reference
@@ -0,0 +1 @@
+\N	\N
diff --git a/tests/queries/0_stateless/04319_distributed_plan_set_operation_const_columns.sql b/tests/queries/0_stateless/04319_distributed_plan_set_operation_const_columns.sql
new file mode 100644
index 000000000000..da3c074ec357
--- /dev/null
+++ b/tests/queries/0_stateless/04319_distributed_plan_set_operation_const_columns.sql
@@ -0,0 +1,44 @@
+-- Tags: no-old-analyzer
+-- no-old-analyzer: make_distributed_plan requires the analyzer.
+
+-- A distributed set operation (UNION / INTERSECT / EXCEPT) can produce a column as a constant in one
+-- branch and as a full column (aliased from an exchange) in another. Plan serialization re-derives
+-- constness per step, so the branches used to mismatch at the strict set-operation header check.
+-- Constants are now materialized on every branch so they agree.
+
+DROP TABLE IF EXISTS t_union_const;
+CREATE TABLE t_union_const (x UInt64) ENGINE = MergeTree ORDER BY tuple();
+INSERT INTO t_union_const SELECT number FROM numbers(1000000);
+
+-- Distributed aggregation cannot enforce a global max_rows_to_group_by, so pin it to 0.
+SET max_rows_to_group_by = 0;
+
+SET make_distributed_plan = 1, enable_parallel_replicas = 0, distributed_plan_execute_locally = 1;
+
+-- UNION: a constant in one branch vs a full column aliased from an exchange in the other.
+SELECT DISTINCT toFixedString(NULL, 'null'), minus(NULL, (SELECT NULL))
+FROM t_union_const WHERE x < 1 GROUP BY 1, NULL
+UNION DISTINCT
+SELECT DISTINCT toFixedString(NULL, 'null'), minus(NULL, (SELECT NULL))
+FROM t_union_const WHERE x < 9 GROUP BY 1, NULL;
+
+DROP TABLE t_union_const;
+
+-- INTERSECT / EXCEPT with a UNION of constants inside each input. The set operation must be the root
+-- step to reproduce (a parent step would be distributed and reject the unserializable set operation),
+-- so the result cannot be ordered; use FORMAT Null.
+SET distributed_plan_max_rows_to_broadcast = 0;
+
+WITH cte AS (SELECT DISTINCT -9223372036854775808 UNION ALL SELECT DISTINCT NULL LIMIT 1025)
+SELECT DISTINCT toNullable(NULL), * FROM cte LIMIT 65536
+INTERSECT DISTINCT
+WITH cte AS (SELECT DISTINCT -9223372036854775808 UNION ALL SELECT DISTINCT NULL LIMIT 1025)
+SELECT DISTINCT NULL, * FROM cte LIMIT 65536
+FORMAT Null;
+
+WITH cte AS (SELECT DISTINCT -9223372036854775808 UNION ALL SELECT DISTINCT NULL LIMIT 1025)
+SELECT DISTINCT toNullable(NULL), * FROM cte LIMIT 65536
+EXCEPT DISTINCT
+WITH cte AS (SELECT DISTINCT 0 UNION ALL SELECT DISTINCT NULL LIMIT 1025)
+SELECT DISTINCT NULL, * FROM cte LIMIT 65536
+FORMAT Null;
diff --git a/tests/queries/0_stateless/04320_distributed_plan_read_rejects.reference b/tests/queries/0_stateless/04320_distributed_plan_read_rejects.reference
new file mode 100644
index 000000000000..e613ab682c66
--- /dev/null
+++ b/tests/queries/0_stateless/04320_distributed_plan_read_rejects.reference
@@ -0,0 +1,2 @@
+19999900000
+19999900000
diff --git a/tests/queries/0_stateless/04320_distributed_plan_read_rejects.sql b/tests/queries/0_stateless/04320_distributed_plan_read_rejects.sql
new file mode 100644
index 000000000000..85c740037e9c
--- /dev/null
+++ b/tests/queries/0_stateless/04320_distributed_plan_read_rejects.sql
@@ -0,0 +1,26 @@
+-- Tags: no-old-analyzer
+-- no-old-analyzer: make_distributed_plan requires the analyzer.
+
+DROP TABLE IF EXISTS t_read_rejects;
+CREATE TABLE t_read_rejects (x UInt64) ENGINE = MergeTree ORDER BY tuple();
+INSERT INTO t_read_rejects SELECT number FROM numbers(200000);
+
+-- Distributed aggregation cannot enforce a global max_rows_to_group_by, so pin it to 0 (randomized
+-- settings may set it nonzero, which would make make_distributed_plan reject the count/sum below).
+SET max_rows_to_group_by = 0;
+SET make_distributed_plan = 1, enable_parallel_replicas = 0, distributed_plan_execute_locally = 1,
+    distributed_plan_max_rows_to_broadcast = 0;
+
+-- A distributed read cannot reproduce the coordinator's part ordering, so the part-order virtual
+-- columns are rejected at planning time (rather than silently returning worker-local values).
+SELECT _part_index FROM t_read_rejects; -- { serverError SUPPORT_IS_DISABLED }
+SELECT _part_starting_offset FROM t_read_rejects; -- { serverError SUPPORT_IS_DISABLED }
+
+-- _part_offset alone is per-part and order-independent, so it stays supported.
+SELECT sum(_part_offset) FROM t_read_rejects;
+
+-- A per-block function must keep its global numbering: the GatherExchange is not pushed below it,
+-- so the row numbers stay a single 0..N-1 sequence (sum is order-independent).
+SELECT sum(rn) FROM (SELECT rowNumberInAllBlocks() AS rn FROM t_read_rejects);
+
+DROP TABLE t_read_rejects;
diff --git a/tests/queries/0_stateless/04321_distributed_plan_count_implicit_projection.reference b/tests/queries/0_stateless/04321_distributed_plan_count_implicit_projection.reference
new file mode 100644
index 000000000000..fc7bed0c3a6c
--- /dev/null
+++ b/tests/queries/0_stateless/04321_distributed_plan_count_implicit_projection.reference
@@ -0,0 +1,3 @@
+200000
+99999
+99999
diff --git a/tests/queries/0_stateless/04321_distributed_plan_count_implicit_projection.sql b/tests/queries/0_stateless/04321_distributed_plan_count_implicit_projection.sql
new file mode 100644
index 000000000000..c603a5f4e2e6
--- /dev/null
+++ b/tests/queries/0_stateless/04321_distributed_plan_count_implicit_projection.sql
@@ -0,0 +1,32 @@
+-- Tags: no-old-analyzer
+-- no-old-analyzer: make_distributed_plan requires the analyzer.
+
+-- The implicit count/minmax projection counts rows from part metadata. A distributed read buckets the
+-- part across workers; if the projection is left enabled it is replicated to every bucket and counts
+-- the whole part each time, so the result is multiplied by the bucket count. These distributed counts
+-- must match a single-node read.
+
+DROP TABLE IF EXISTS t_one;
+DROP TABLE IF EXISTS t_cnt;
+CREATE TABLE t_one (x UInt64) ENGINE = MergeTree ORDER BY tuple();
+CREATE TABLE t_cnt (k UInt64, v UInt64) ENGINE = MergeTree ORDER BY k;
+INSERT INTO t_one SELECT number FROM numbers(200000);
+INSERT INTO t_cnt SELECT number, number FROM numbers(200000);
+
+-- Pin the implicit projection on so the bug path is exercised; the fix disables it for distributed
+-- plans regardless. Without pinning, randomized settings can turn it off and mask the regression.
+-- Pin max_rows_to_group_by to 0 too: distributed aggregation rejects a nonzero limit (which randomized settings may set).
+SET max_rows_to_group_by = 0;
+SET make_distributed_plan = 1, enable_parallel_replicas = 0, distributed_plan_execute_locally = 1,
+    distributed_plan_max_rows_to_broadcast = 0, distributed_plan_default_reader_bucket_count = 8,
+    optimize_use_implicit_projections = 1;
+
+-- Trivial count over a distributed read (counted from part metadata).
+SELECT count() FROM (SELECT x FROM t_one);
+-- Primary-key range: full granules counted from the index, only the boundary granule scanned.
+SELECT count() FROM t_cnt WHERE k > 100000;
+-- Non-index filter: a real per-row scan (always correct; guards against regression the other way).
+SELECT count() FROM t_cnt WHERE v > 100000;
+
+DROP TABLE t_one;
+DROP TABLE t_cnt;

From 4087e7a7f27801e5f68b549dfafb8f2e2b9b0b46 Mon Sep 17 00:00:00 2001
From: Anton Ivashkin <ianton@live.com>
Date: Thu, 2 Jul 2026 10:28:58 +0200
Subject: [PATCH 2/2] Fix after cherry-pick

---
 .../QueryPlan/BroadcastReceiveStep.cpp        |  2 +-
 .../Optimizations/makeDistributed.cpp         |  6 --
 src/Processors/QueryPlan/QueryPlan.h          |  4 ++
 .../QueryPlan/ReadFromMergeTree.cpp           | 71 +------------------
 src/Processors/QueryPlan/ReadFromMergeTree.h  |  7 ++
 .../QueryPlan/ShuffleReceiveStep.cpp          |  2 +-
 src/QueryPipeline/DistributedPlanExecutor.cpp |  2 -
 .../StatelessWorker/StatelessWorkerClient.cpp | 26 +++----
 8 files changed, 27 insertions(+), 93 deletions(-)

diff --git a/src/Processors/QueryPlan/BroadcastReceiveStep.cpp b/src/Processors/QueryPlan/BroadcastReceiveStep.cpp
index 633ad85be66a..e5a37f1c15c3 100644
--- a/src/Processors/QueryPlan/BroadcastReceiveStep.cpp
+++ b/src/Processors/QueryPlan/BroadcastReceiveStep.cpp
@@ -18,7 +18,7 @@ void BroadcastReceiveStep::initializePipeline(QueryPipelineBuilder & pipeline, c
 {
     const String bucket_id = settings.parameter_lookup->getParameter("bucket_id").safeGet<String>();
 
-    VectorWithMemoryTracking<std::unique_ptr<QueryPipelineBuilder>> pipelines;
+    std::vector<std::unique_ptr<QueryPipelineBuilder>> pipelines;
 
     /// Read all shards
     for (const String & shard_id : source_shards)
diff --git a/src/Processors/QueryPlan/Optimizations/makeDistributed.cpp b/src/Processors/QueryPlan/Optimizations/makeDistributed.cpp
index 4c436448749c..0108d4224e27 100644
--- a/src/Processors/QueryPlan/Optimizations/makeDistributed.cpp
+++ b/src/Processors/QueryPlan/Optimizations/makeDistributed.cpp
@@ -619,12 +619,6 @@ void optimizeExchanges(QueryPlan::Node & root)
 
                     bool can_move_gather_up = true;
 
-                    /// Per-block functions (`rowNumberInAllBlocks`, `blockNumber`, `nowInBlock`, ...)
-                    /// depend on the whole block stream; below a gather they would run per shard and
-                    /// produce different values. Keep such a step above the gather.
-                    if (dag && dagContainsNonDeterministicFunction(*dag))
-                        can_move_gather_up = false;
-
                     /// Moving the sorted GatherExchange above the step is only valid if every sort column
                     /// survives the step unchanged - otherwise GatherReceive would merge by a sort
                     /// description that no longer matches the data. Expression/Filter may recompute or
diff --git a/src/Processors/QueryPlan/QueryPlan.h b/src/Processors/QueryPlan/QueryPlan.h
index 7aece7baa08b..1618d05e4e6f 100644
--- a/src/Processors/QueryPlan/QueryPlan.h
+++ b/src/Processors/QueryPlan/QueryPlan.h
@@ -7,6 +7,7 @@
 #include <Interpreters/Context_fwd.h>
 #include <Columns/IColumn_fwd.h>
 #include <QueryPipeline/QueryPlanResourceHolder.h>
+#include <Processors/QueryPlan/ExchangeLookup.h>
 #include <Parsers/IAST_fwd.h>
 
 #include <list>
@@ -112,6 +113,9 @@ class QueryPlan
     void resolveStorages(const ContextPtr & context);
 
     void optimize(const QueryPlanOptimizationSettings & optimization_settings);
+    /// Converts the original plan to a distributed plan, then replaces the original plan with a plan
+    /// consisting of a step that executes the distributed plan and a step that receives the result.
+    void convertToDistributed(const QueryPlanOptimizationSettings & optimization_settings);
 
     QueryPipelineBuilderPtr buildQueryPipeline(
         const QueryPlanOptimizationSettings & optimization_settings,
diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
index ecbc502532ed..0e20406785d7 100644
--- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@@ -3283,66 +3283,6 @@ bool ReadFromMergeTree::supportsSkipIndexesOnDataRead() const
 
 static const char * indexTypeToString(ReadFromMergeTree::IndexType type);
 
-void ReadFromMergeTree::logPredicateStatistics(const AnalysisResult & result) const
-{
-    UInt64 sample_rate = context->getSettingsRef()[Setting::predicate_statistics_sample_rate];
-    if (sample_rate == 0)
-        return;
-
-    if (sample_rate > 1)
-    {
-        auto qid = CurrentThread::getQueryId();
-        if (CityHash_v1_0_2::CityHash64(qid.data(), qid.size()) % sample_rate != 0)
-            return;
-    }
-
-    auto predicate_stats_log = context->getPredicateStatisticsLog();
-    if (!predicate_stats_log)
-        return;
-
-    if (result.index_stats.empty())
-        return;
-
-    auto storage_id = data.getStorageID();
-    if (storage_id.database_name.empty())
-        return;
-
-    PredicateStatisticsLogElement elem;
-    auto now = time(nullptr);
-    elem.event_date = static_cast<UInt16>(DateLUT::instance().toDayNum(now));
-    elem.event_time = now;
-    elem.database = storage_id.database_name;
-    elem.table = storage_id.table_name;
-    elem.query_id = String(CurrentThread::getQueryId());
-
-    UInt64 prev_granules = 0;
-    for (const auto & stat : result.index_stats)
-    {
-        if (stat.type == IndexType::None)
-        {
-            prev_granules = stat.num_granules_after;
-            continue;
-        }
-
-        if (!stat.part_name.empty())
-            continue;
-
-        UInt64 total = prev_granules > 0 ? prev_granules : stat.num_granules_after;
-        UInt64 after = stat.num_granules_after;
-
-        elem.index_names.push_back(stat.name.empty() ? indexTypeToString(stat.type) : stat.name);
-        elem.index_types.push_back(indexTypeToString(stat.type));
-        elem.total_granules.push_back(total);
-        elem.granules_after.push_back(after);
-        elem.index_selectivities.push_back(total > 0 ? static_cast<Float64>(after) / static_cast<Float64>(total) : 1.0);
-
-        prev_granules = after;
-    }
-
-    if (!elem.index_names.empty())
-        predicate_stats_log->add(std::move(elem));
-}
-
 MarkRanges filterMarkRangesForBucket(const MarkRanges & ranges, size_t & effective_bucket_index, size_t total_buckets)
 {
     MarkRanges result;
@@ -3363,8 +3303,6 @@ void ReadFromMergeTree::initializePipeline(QueryPipelineBuilder & pipeline, [[ma
 {
     auto & result = getAnalysisResult();
 
-    logPredicateStatistics(result);
-
     /// Filter ranges by 'bucket_id' parameter so that each distributed worker reads only its slice of the parts.
     if (distributed_read_bucket_count > 0 && settings.parameter_lookup)
     {
@@ -4458,13 +4396,6 @@ Strings ReadFromMergeTree::getShardsForDistributedRead() const
 
 void ReadFromMergeTree::serialize(Serialization & ctx) const
 {
-    /// Serializing the STREAM modifier is not implemented yet, so reject it instead of silently
-    /// reading a plain snapshot. (Pinned block boundaries and part-order virtual columns are rejected
-    /// earlier in checkDistributedReadSupported.)
-    if (query_info.isStream())
-        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
-            "make_distributed_plan does not support a distributed read with the STREAM modifier");
-
     /// Needed only for a bucketed read: it is pinned to the coordinator's part list and cannot
     /// re-derive read-in-order, deferred FINAL filters, a projection, or text index tasks. A
     /// non-bucket read is rebuilt and re-optimized on the worker, which re-derives them.
@@ -4614,7 +4545,7 @@ std::unique_ptr<IQueryPlanStep> ReadFromMergeTree::deserialize(Deserialization &
     MergeTreeData & table = dynamic_cast<MergeTreeData &>(*storage_ptr);
     MergeTreeDataSelectExecutor executor(table);
 
-    StorageSnapshotPtr storage_snapshot = table.getStorageSnapshot(table.getInMemoryMetadataPtr(ctx.context, false), ctx.context);
+    StorageSnapshotPtr storage_snapshot = table.getStorageSnapshot(table.getInMemoryMetadataPtr(), ctx.context);
     const auto & snapshot_data = assert_cast<const MergeTreeData::SnapshotData &>(*storage_snapshot->data);
 
     auto step = executor.readFromParts(
diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h
index bdcb8b6a3a55..061018a7bb7b 100644
--- a/src/Processors/QueryPlan/ReadFromMergeTree.h
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.h
@@ -385,8 +385,12 @@ class ReadFromMergeTree final : public SourceStepWithFilter
     bool isSkipIndexAvailableForTopK(const String & sort_column) const;
     const ProjectionIndexReadDescription & getProjectionIndexReadDescription() const { return projection_index_read_desc; }
     ProjectionIndexReadDescription & getProjectionIndexReadDescription() { return projection_index_read_desc; }
+    /// In a distributed query plan, this step is executed in a distributed manner: shards are read in parallel.
+    void setDistributedRead(size_t bucket_count);
     /// Parts (by name) every worker buckets over, so the partition is identical across replicas.
     void setDistributedReadParts(Names part_names);
+    /// Makes the list of shards to read in parallel in a distributed query plan.
+    Strings getShardsForDistributedRead() const;
 
     bool canRemoveUnusedColumns() const override;
     RemovedUnusedColumns removeUnusedColumns(NameMultiSet required_outputs, bool remove_inputs) override;
@@ -543,6 +547,9 @@ class ReadFromMergeTree final : public SourceStepWithFilter
 
     std::optional<TopKFilterInfo> top_k_filter_info;
     ProjectionIndexReadDescription projection_index_read_desc;
+    /// Set when this step is part of a distributed query plan and will be executed in a distributed manner.
+    /// The "bucket_id" task parameter is then used to determine which part of the data to read.
+    size_t distributed_read_bucket_count = 0;
     /// Coordinator-selected parts a distributed-read worker buckets over. Empty otherwise.
     Names distributed_read_part_names;
 };
diff --git a/src/Processors/QueryPlan/ShuffleReceiveStep.cpp b/src/Processors/QueryPlan/ShuffleReceiveStep.cpp
index a48ec117c080..252e78f585d7 100644
--- a/src/Processors/QueryPlan/ShuffleReceiveStep.cpp
+++ b/src/Processors/QueryPlan/ShuffleReceiveStep.cpp
@@ -17,7 +17,7 @@ void ShuffleReceiveStep::initializePipeline(QueryPipelineBuilder & pipeline, con
 {
     const String bucket_id = settings.parameter_lookup->getParameter("bucket_id").safeGet<String>();
 
-    VectorWithMemoryTracking<std::unique_ptr<QueryPipelineBuilder>> pipelines;
+    std::vector<std::unique_ptr<QueryPipelineBuilder>> pipelines;
 
     /// Read all shards
     for (const String & shard_id : source_shards)
diff --git a/src/QueryPipeline/DistributedPlanExecutor.cpp b/src/QueryPipeline/DistributedPlanExecutor.cpp
index 8e8d793a6904..ac8ca476e0fd 100644
--- a/src/QueryPipeline/DistributedPlanExecutor.cpp
+++ b/src/QueryPipeline/DistributedPlanExecutor.cpp
@@ -627,12 +627,10 @@ void doExecuteTask(const DistributedQueryTaskDescription & task_description, Obj
     /// swaps the join sides while the others don't), breaking exchange partitioning.
     optimization_settings.join_swap_table = std::make_optional(false);
     optimization_settings.query_plan_optimize_join_order_limit = 0;
-    optimization_settings.query_plan_optimize_join_order_randomize = 0;
     optimization_settings.convert_join_to_in = false;
     optimization_settings.convert_outer_join_to_inner_join = false;
     optimization_settings.convert_any_join_to_semi_or_anti_join = false;
     optimization_settings.merge_filter_into_join_condition = false;
-    optimization_settings.top_k_through_join = false;
 
     /// The fragment's read is bucketed; re-introducing the implicit count projection would count the
     /// whole part per bucket. Keep it off so counts read the bucket's mark ranges.
diff --git a/src/Server/StatelessWorker/StatelessWorkerClient.cpp b/src/Server/StatelessWorker/StatelessWorkerClient.cpp
index 7e17a706faf3..6dd4696c263b 100644
--- a/src/Server/StatelessWorker/StatelessWorkerClient.cpp
+++ b/src/Server/StatelessWorker/StatelessWorkerClient.cpp
@@ -31,9 +31,9 @@ String doSendTask(const String & endpoint_uri, const String & task_id, std::func
     timeouts.receive_timeout = Poco::Timespan(100 * 1000 * 1000);
     ReadSettings read_settings;
     /// Not safe to retry: worker would schedule a duplicate task.
-    read_settings.http_settings.max_tries = 1;
-    read_settings.http_settings.retry_initial_backoff_ms = 500;
-    read_settings.http_settings.retry_max_backoff_ms = 1000;
+    read_settings.http_max_tries = 1;
+    read_settings.http_retry_initial_backoff_ms = 500;
+    read_settings.http_retry_max_backoff_ms = 1000;
 
     Poco::URI uri(endpoint_uri);
     uri.addQueryParameter("operation",   "start");
@@ -99,16 +99,16 @@ DistributedQueryTaskStatus getTaskStatus(const String & endpoint_uri, const Stri
         /// server-side wait plus a small network margin, and a failed poll must not retry.
         timeouts.send_timeout = Poco::Timespan(2 * 1000 * 1000);
         timeouts.receive_timeout = Poco::Timespan((wait_for_ms + 2000) * 1000);
-        read_settings.http_settings.max_tries = 1;
+        read_settings.http_max_tries = 1;
     }
     else
     {
         timeouts.send_timeout = Poco::Timespan(100 * 1000 * 1000);
         timeouts.receive_timeout = Poco::Timespan(100 * 1000 * 1000);
         /// Safe to retry: read-only.
-        read_settings.http_settings.max_tries = 3;
-        read_settings.http_settings.retry_initial_backoff_ms = 200;
-        read_settings.http_settings.retry_max_backoff_ms = 1000;
+        read_settings.http_max_tries = 3;
+        read_settings.http_retry_initial_backoff_ms = 200;
+        read_settings.http_retry_max_backoff_ms = 1000;
     }
 
     Poco::URI uri(endpoint_uri);
@@ -150,9 +150,9 @@ void cancelTask(const String & endpoint_uri, const String & task_id, const Conte
     timeouts.receive_timeout = Poco::Timespan(5 * 1000 * 1000);
     ReadSettings read_settings;
     /// Safe to retry: idempotent.
-    read_settings.http_settings.max_tries = 3;
-    read_settings.http_settings.retry_initial_backoff_ms = 200;
-    read_settings.http_settings.retry_max_backoff_ms = 1000;
+    read_settings.http_max_tries = 3;
+    read_settings.http_retry_initial_backoff_ms = 200;
+    read_settings.http_retry_max_backoff_ms = 1000;
 
     Poco::URI uri(endpoint_uri);
     uri.addQueryParameter("operation",   "cancel");
@@ -187,9 +187,9 @@ void forgetTask(const String & endpoint_uri, const String & task_id, const Conte
     timeouts.receive_timeout = Poco::Timespan(100 * 1000 * 1000);
     ReadSettings read_settings;
     /// Safe to retry: idempotent.
-    read_settings.http_settings.max_tries = 3;
-    read_settings.http_settings.retry_initial_backoff_ms = 200;
-    read_settings.http_settings.retry_max_backoff_ms = 1000;
+    read_settings.http_max_tries = 3;
+    read_settings.http_retry_initial_backoff_ms = 200;
+    read_settings.http_retry_max_backoff_ms = 1000;
 
     Poco::URI uri(endpoint_uri);
     uri.addQueryParameter("operation",   "forget");