From 9a30791d619b94de33ea612c7ce8ad916ad4e557 Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sat, 4 Apr 2026 22:57:10 -0700 Subject: [PATCH 01/17] feat(rest): add scan plan endpoints to REST catalog client - Add PlanTableScan, FetchPlanningResult, CancelPlanning, FetchScanTasks methods to RestCatalog, wiring them to the corresponding REST endpoints - Add scan plan endpoint definitions and resource path helpers - Add ScanPlanErrorHandler with proper 404/406 error dispatch (NoSuchPlanIdException, NoSuchPlanTaskException, etc.) - Add PlanTableScanRequest/FetchScanTasksRequest serialization helpers - Add kNoSuchPlanId and kNoSuchPlanTask ErrorKind values - Add DataFileFromJson and FileScanTasksFromJson overloads with partition spec and schema support - Add endpoint and integration tests for all 4 scan plan operations --- src/iceberg/catalog.h | 18 ++ src/iceberg/catalog/rest/endpoint.h | 17 ++ src/iceberg/catalog/rest/error_handlers.cc | 33 +++ src/iceberg/catalog/rest/error_handlers.h | 10 + src/iceberg/catalog/rest/json_serde.cc | 125 +++++++++ .../catalog/rest/json_serde_internal.h | 22 ++ src/iceberg/catalog/rest/resource_paths.cc | 21 ++ src/iceberg/catalog/rest/resource_paths.h | 4 + src/iceberg/catalog/rest/rest_catalog.cc | 81 ++++++ src/iceberg/catalog/rest/rest_catalog.h | 8 + src/iceberg/catalog/rest/types.cc | 81 ++++++ src/iceberg/catalog/rest/types.h | 63 +++++ src/iceberg/json_serde.cc | 252 ++++++++++++++++++ src/iceberg/json_serde_internal.h | 35 +++ src/iceberg/result.h | 4 + src/iceberg/table_scan.h | 41 +++ src/iceberg/test/endpoint_test.cc | 33 ++- src/iceberg/test/json_serde_test.cc | 229 ++++++++++++++++ .../test/rest_catalog_integration_test.cc | 90 +++++++ 19 files changed, 1164 insertions(+), 3 deletions(-) diff --git a/src/iceberg/catalog.h b/src/iceberg/catalog.h index 81c9ddd11..c1f2530ea 100644 --- a/src/iceberg/catalog.h +++ b/src/iceberg/catalog.h @@ -26,8 +26,10 @@ #include #include +#include "iceberg/catalog/rest/types.h" #include "iceberg/result.h" #include "iceberg/table_identifier.h" +#include "iceberg/table_scan.h" #include "iceberg/type_fwd.h" namespace iceberg { @@ -188,6 +190,22 @@ class ICEBERG_EXPORT Catalog { /// \return a Table instance or ErrorKind::kAlreadyExists if the table already exists virtual Result> RegisterTable( const TableIdentifier& identifier, const std::string& metadata_file_location) = 0; + + virtual Result PlanTableScan( + const Table& table, const internal::TableScanContext& context) { + return NotImplemented("PlanTableScan is not supported by this catalog"); + } + virtual Result FetchPlanningResult( + const Table& table, const std::string& plan_id) { + return NotImplemented("FetchPlanningResult is not supported by this catalog"); + } + virtual Status CancelPlanning(const Table& table, const std::string& plan_id) { + return NotImplemented("CancelPlanning is not supported by this catalog"); + } + virtual Result FetchScanTasks( + const Table& table, const std::string& plan_task) { + return NotImplemented("FetchScanTasks is not supported by this catalog"); + } }; } // namespace iceberg diff --git a/src/iceberg/catalog/rest/endpoint.h b/src/iceberg/catalog/rest/endpoint.h index 7382955ce..6021be6c1 100644 --- a/src/iceberg/catalog/rest/endpoint.h +++ b/src/iceberg/catalog/rest/endpoint.h @@ -128,6 +128,23 @@ class ICEBERG_REST_EXPORT Endpoint { return {HttpMethod::kPost, "/v1/{prefix}/transactions/commit"}; } + // Scan planning endpoints + static Endpoint PlanTableScan() { + return {HttpMethod::kPost, "/v1/{prefix}/namespaces/{namespace}/tables/{table}/plan"}; + } + + static Endpoint FetchPlanningResult() { + return {HttpMethod::kGet, "/v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan-id}"}; + } + + static Endpoint CancelPlanning() { + return {HttpMethod::kDelete, "/v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan-id}"}; + } + + static Endpoint FetchScanTasks() { + return {HttpMethod::kPost, "/v1/{prefix}/namespaces/{namespace}/tables/{table}/tasks"}; + } + private: Endpoint(HttpMethod method, std::string_view path) : method_(method), path_(path) {} diff --git a/src/iceberg/catalog/rest/error_handlers.cc b/src/iceberg/catalog/rest/error_handlers.cc index f3e5b8fb3..74189c27d 100644 --- a/src/iceberg/catalog/rest/error_handlers.cc +++ b/src/iceberg/catalog/rest/error_handlers.cc @@ -30,6 +30,9 @@ namespace { constexpr std::string_view kIllegalArgumentException = "IllegalArgumentException"; constexpr std::string_view kNoSuchNamespaceException = "NoSuchNamespaceException"; constexpr std::string_view kNamespaceNotEmptyException = "NamespaceNotEmptyException"; +constexpr std::string_view kNoSuchTableException = "NoSuchTableException"; +constexpr std::string_view kNoSuchPlanIdException = "NoSuchPlanIdException"; +constexpr std::string_view kNoSuchPlanTaskException = "NoSuchPlanTaskException"; } // namespace @@ -183,4 +186,34 @@ Status ViewCommitErrorHandler::Accept(const ErrorResponse& error) const { return DefaultErrorHandler::Accept(error); } +const std::shared_ptr& ScanPlanErrorHandler::Instance() { + static const std::shared_ptr instance{ + new ScanPlanErrorHandler()}; + return instance; +} + +Status ScanPlanErrorHandler::Accept(const ErrorResponse& error) const { + switch (error.code) { + case 404: + if (error.type == kNoSuchNamespaceException) { + return NoSuchNamespace(error.message); + } + if (error.type == kNoSuchTableException) { + return NoSuchTable(error.message); + } + if (error.type == kNoSuchPlanIdException) { + return NoSuchPlanId(error.message); + } + if (error.type == kNoSuchPlanTaskException) { + return NoSuchPlanTask(error.message); + } + return NotFound(error.message); + case 406: + return NotSupported(error.message); + } + + return DefaultErrorHandler::Accept(error); +} + + } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/error_handlers.h b/src/iceberg/catalog/rest/error_handlers.h index eae2c9b7f..69c7f5007 100644 --- a/src/iceberg/catalog/rest/error_handlers.h +++ b/src/iceberg/catalog/rest/error_handlers.h @@ -127,4 +127,14 @@ class ICEBERG_REST_EXPORT ViewCommitErrorHandler final : public DefaultErrorHand constexpr ViewCommitErrorHandler() = default; }; +class ICEBERG_REST_EXPORT ScanPlanErrorHandler final : public DefaultErrorHandler { + public: + static const std::shared_ptr& Instance(); + + Status Accept(const ErrorResponse& error) const override; + + private: + constexpr ScanPlanErrorHandler() = default; +}; + } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index eebdc1969..b34980f95 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -26,8 +26,10 @@ #include "iceberg/catalog/rest/json_serde_internal.h" #include "iceberg/catalog/rest/types.h" +#include "iceberg/expression/json_serde_internal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/partition_spec.h" +#include "iceberg/schema.h" #include "iceberg/sort_order.h" #include "iceberg/table_identifier.h" #include "iceberg/table_requirement.h" @@ -78,6 +80,21 @@ constexpr std::string_view kExpiresIn = "expires_in"; constexpr std::string_view kIssuedTokenType = "issued_token_type"; constexpr std::string_view kRefreshToken = "refresh_token"; constexpr std::string_view kOAuthScope = "scope"; +constexpr std::string_view kPlanStatus = "status"; +constexpr std::string_view kPlanId = "plan-id"; +constexpr std::string_view kPlanTasks = "plan-tasks"; +constexpr std::string_view kFileScanTasks = "file-scan-tasks"; +constexpr std::string_view kDeleteFiles = "delete-files"; +constexpr std::string_view kSnapshotId = "snapshot-id"; +constexpr std::string_view kSelect = "select"; +constexpr std::string_view kFilter = "filter"; +constexpr std::string_view kCaseSensitive = "case-sensitive"; +constexpr std::string_view kUseSnapshotSchema = "use-snapshot-schema"; +constexpr std::string_view kStartSnapshotId = "start-snapshot-id"; +constexpr std::string_view kEndSnapshotId = "end-snapshot-id"; +constexpr std::string_view kStatsFields = "stats-fields"; +constexpr std::string_view kMinRowsRequired = "min-rows-required"; +constexpr std::string_view kPlanTask = "plan-task"; } // namespace @@ -506,6 +523,114 @@ Result OAuthTokenResponseFromJson(const nlohmann::json& json return response; } +Result ToJson(const PlanTableScanRequest& request) { + nlohmann::json json; + if (request.snapshot_id.has_value()) { + json[kSnapshotId] = request.snapshot_id.value(); + } + if (!request.select.empty()) { + json[kSelect] = request.select; + } + if (request.filter) { + ICEBERG_ASSIGN_OR_RAISE(auto filter_json, iceberg::ToJson(*request.filter)); + json[kFilter] = std::move(filter_json); + } + json[kCaseSensitive] = request.case_sensitive; + json[kUseSnapshotSchema] = request.use_snapshot_schema; + if (request.start_snapshot_id.has_value()) { + json[kStartSnapshotId] = request.start_snapshot_id.value(); + } + if (request.end_snapshot_id.has_value()) { + json[kEndSnapshotId] = request.end_snapshot_id.value(); + } + if (!request.statsFields.empty()) { + json[kStatsFields] = request.statsFields; + } + if (request.min_rows_required.has_value()) { + json[kMinRowsRequired] = request.min_rows_required.value(); + } + return json; +} + +nlohmann::json ToJson(const FetchScanTasksRequest& request) { + nlohmann::json json; + json[kPlanTask] = request.planTask; + return json; +} + +Status BaseScanTaskResponseFromJson( + const nlohmann::json& json, BaseScanTaskResponse* response, + const std::unordered_map>& partition_specs_by_id, + const Schema& schema) { + // 1. plan_tasks + ICEBERG_ASSIGN_OR_RAISE(auto plan_tasks, + GetJsonValue(json, kPlanTasks)); + if (!plan_tasks.is_array()) { + return JsonParseError("Cannot parse plan tasks from non-array: {}", + SafeDumpJson(plan_tasks)); + } + ICEBERG_ASSIGN_OR_RAISE(response->plan_tasks, + GetTypedJsonValue>(plan_tasks)); + + // 2. delete_files + ICEBERG_ASSIGN_OR_RAISE(auto delete_files_json, + GetJsonValue(json, kDeleteFiles)); + if (!delete_files_json.is_array()) { + return JsonParseError("Cannot parse delete files from non-array: {}", + SafeDumpJson(delete_files_json)); + } + for (const auto& entry_json : delete_files_json) { + ICEBERG_ASSIGN_OR_RAISE(auto delete_file, + DataFileFromJson(entry_json, partition_specs_by_id, schema)); + response->delete_files.push_back(std::move(delete_file)); + } + + // 3. file_scan_tasks + ICEBERG_ASSIGN_OR_RAISE(auto file_scan_tasks_json, + GetJsonValue(json, kFileScanTasks)); + ICEBERG_ASSIGN_OR_RAISE( + response->file_scan_tasks, + FileScanTasksFromJson(file_scan_tasks_json, response->delete_files, + partition_specs_by_id, schema)); + return {}; +} + +Result PlanTableScanResponseFromJson( + const nlohmann::json& json, + const std::unordered_map>& partition_specs_by_id, + const Schema& schema) { + PlanTableScanResponse response; + ICEBERG_ASSIGN_OR_RAISE(response.plan_status, + GetJsonValue(json, kPlanStatus)); + ICEBERG_ASSIGN_OR_RAISE(response.plan_id, GetJsonValue(json, kPlanId)); + ICEBERG_RETURN_UNEXPECTED( + BaseScanTaskResponseFromJson(json, &response, partition_specs_by_id, schema)); + return response; +} + +Result FetchPlanningResultResponseFromJson( + const nlohmann::json& json, + const std::unordered_map>& partition_specs_by_id, + const Schema& schema) { + FetchPlanningResultResponse response; + ICEBERG_ASSIGN_OR_RAISE(auto status_str, + GetJsonValue(json, kPlanStatus)); + response.plan_status = PlanStatus(PlanStatus::FromString(status_str)); + ICEBERG_RETURN_UNEXPECTED( + BaseScanTaskResponseFromJson(json, &response, partition_specs_by_id, schema)); + return response; +} + +Result FetchScanTasksResponseFromJson( + const nlohmann::json& json, + const std::unordered_map>& partition_specs_by_id, + const Schema& schema) { + FetchScanTasksResponse response; + ICEBERG_RETURN_UNEXPECTED( + BaseScanTaskResponseFromJson(json, &response, partition_specs_by_id, schema)); + return response; +} + #define ICEBERG_DEFINE_FROM_JSON(Model) \ template <> \ Result FromJson(const nlohmann::json& json) { \ diff --git a/src/iceberg/catalog/rest/json_serde_internal.h b/src/iceberg/catalog/rest/json_serde_internal.h index 820e077d7..e50f85d1a 100644 --- a/src/iceberg/catalog/rest/json_serde_internal.h +++ b/src/iceberg/catalog/rest/json_serde_internal.h @@ -19,11 +19,15 @@ #pragma once +#include + #include #include "iceberg/catalog/rest/iceberg_rest_export.h" #include "iceberg/catalog/rest/types.h" +#include "iceberg/partition_spec.h" #include "iceberg/result.h" +#include "iceberg/schema.h" /// \file iceberg/catalog/rest/json_serde_internal.h /// JSON serialization and deserialization for Iceberg REST Catalog API types. @@ -62,4 +66,22 @@ ICEBERG_DECLARE_JSON_SERDE(OAuthTokenResponse) #undef ICEBERG_DECLARE_JSON_SERDE +ICEBERG_REST_EXPORT Result PlanTableScanResponseFromJson( + const nlohmann::json& json, + const std::unordered_map>& partition_specs_by_id, + const Schema& schema); + +ICEBERG_REST_EXPORT Result FetchPlanningResultResponseFromJson( + const nlohmann::json& json, + const std::unordered_map>& partition_specs_by_id, + const Schema& schema); + +ICEBERG_REST_EXPORT Result FetchScanTasksResponseFromJson( + const nlohmann::json& json, + const std::unordered_map>& partition_specs_by_id, + const Schema& schema); + +ICEBERG_REST_EXPORT Result ToJson(const PlanTableScanRequest& request); +ICEBERG_REST_EXPORT nlohmann::json ToJson(const FetchScanTasksRequest& request); + } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/resource_paths.cc b/src/iceberg/catalog/rest/resource_paths.cc index f86a75ec0..b6f8030c1 100644 --- a/src/iceberg/catalog/rest/resource_paths.cc +++ b/src/iceberg/catalog/rest/resource_paths.cc @@ -102,4 +102,25 @@ Result ResourcePaths::CommitTransaction() const { return std::format("{}/v1/{}transactions/commit", base_uri_, prefix_); } +Result ResourcePaths::ScanPlan(const TableIdentifier& ident) const { + ICEBERG_ASSIGN_OR_RAISE(std::string encoded_namespace, EncodeNamespace(ident.ns)); + ICEBERG_ASSIGN_OR_RAISE(std::string encoded_table_name, EncodeString(ident.name)); + return std::format("{}/v1/{}namespaces/{}/tables/{}/plan", base_uri_, prefix_, + encoded_namespace, encoded_table_name); +} + +Result ResourcePaths::ScanPlan(const TableIdentifier& ident, const std::string& plan_id) const { + ICEBERG_ASSIGN_OR_RAISE(std::string encoded_namespace, EncodeNamespace(ident.ns)); + ICEBERG_ASSIGN_OR_RAISE(std::string encoded_table_name, EncodeString(ident.name)); + return std::format("{}/v1/{}namespaces/{}/tables/{}/plan/{}", base_uri_, prefix_, + encoded_namespace, encoded_table_name, plan_id); +} + +Result ResourcePaths::ScanTask(const TableIdentifier& ident) const { + ICEBERG_ASSIGN_OR_RAISE(std::string encoded_namespace, EncodeNamespace(ident.ns)); + ICEBERG_ASSIGN_OR_RAISE(std::string encoded_table_name, EncodeString(ident.name)); + return std::format("{}/v1/{}namespaces/{}/tables/{}/tasks", base_uri_, prefix_, + encoded_namespace, encoded_table_name); +} + } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/resource_paths.h b/src/iceberg/catalog/rest/resource_paths.h index 1b502aaa7..dbb3b8bf4 100644 --- a/src/iceberg/catalog/rest/resource_paths.h +++ b/src/iceberg/catalog/rest/resource_paths.h @@ -81,6 +81,10 @@ class ICEBERG_REST_EXPORT ResourcePaths { /// \brief Get the /v1/{prefix}/transactions/commit endpoint path. Result CommitTransaction() const; + Result ScanPlan(const TableIdentifier& ident) const; + Result ScanPlan(const TableIdentifier& ident, const std::string& plan_id) const; + Result ScanTask(const TableIdentifier& ident) const; + private: ResourcePaths(std::string base_uri, const std::string& prefix); diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index 40e112db7..2774f1bba 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -43,6 +43,7 @@ #include "iceberg/sort_order.h" #include "iceberg/table.h" #include "iceberg/table_requirement.h" +#include "iceberg/table_scan.h" #include "iceberg/table_update.h" #include "iceberg/transaction.h" #include "iceberg/util/macros.h" @@ -62,6 +63,8 @@ std::unordered_set GetDefaultEndpoints() { Endpoint::UpdateTable(), Endpoint::DeleteTable(), Endpoint::RenameTable(), Endpoint::RegisterTable(), Endpoint::ReportMetrics(), Endpoint::CommitTransaction(), + Endpoint::PlanTableScan(), Endpoint::FetchPlanningResult(), + Endpoint::CancelPlanning(), Endpoint::FetchScanTasks(), }; } @@ -495,4 +498,82 @@ Result> RestCatalog::RegisterTable( shared_from_this()); } +Result RestCatalog::PlanTableScan( + const Table& table, const internal::TableScanContext& context) { + ICEBERG_ENDPOINT_CHECK(supported_endpoints_, Endpoint::PlanTableScan()); + ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->ScanPlan(table.name())); + ICEBERG_ASSIGN_OR_RAISE(auto schema_ptr, table.schema()); + ICEBERG_ASSIGN_OR_RAISE(auto specs_ref, table.specs()); + + PlanTableScanRequest request; + if (context.snapshot_id.has_value()) { + request.snapshot_id = context.snapshot_id; + } + request.select = context.selected_columns; + request.filter = context.filter; + request.case_sensitive = context.case_sensitive; + request.use_snapshot_schema = false; // TODO + if (context.from_snapshot_id.has_value() && context.to_snapshot_id.has_value()) { + request.start_snapshot_id = context.from_snapshot_id.value(); + request.end_snapshot_id = context.to_snapshot_id.value(); + } + if (context.min_rows_requested.has_value()) { + request.min_rows_required = context.min_rows_requested.value(); + } + + ICEBERG_ASSIGN_OR_RAISE(auto request_json, ToJson(request)); + ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(request_json)); + ICEBERG_ASSIGN_OR_RAISE( + const auto response, + client_->Post(path, json_request, /*headers=*/{}, + *ScanPlanErrorHandler::Instance(), *catalog_session_)); + + ICEBERG_ASSIGN_OR_RAISE(auto json, FromJsonString(response.body())); + return PlanTableScanResponseFromJson(json, specs_ref.get(), *schema_ptr); +} + +Result RestCatalog::FetchPlanningResult( + const Table& table, const std::string& plan_id) { + ICEBERG_ENDPOINT_CHECK(supported_endpoints_, Endpoint::FetchPlanningResult()); + ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->ScanPlan(table.name(), plan_id)); + ICEBERG_ASSIGN_OR_RAISE(auto schema_ptr, table.schema()); + ICEBERG_ASSIGN_OR_RAISE(auto specs_ref, table.specs()); + + ICEBERG_ASSIGN_OR_RAISE( + const auto response, + client_->Get(path, /*params=*/{}, /*headers=*/{}, + *ScanPlanErrorHandler::Instance(), *catalog_session_)); + ICEBERG_ASSIGN_OR_RAISE(auto json, FromJsonString(response.body())); + return FetchPlanningResultResponseFromJson(json, specs_ref.get(), *schema_ptr); +} + +Status RestCatalog::CancelPlanning(const Table& table, const std::string& plan_id) { + ICEBERG_ENDPOINT_CHECK(supported_endpoints_, Endpoint::CancelPlanning()); + ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->ScanPlan(table.name(), plan_id)); + + ICEBERG_ASSIGN_OR_RAISE( + const auto response, + client_->Delete(path, /*params=*/{}, /*headers=*/{}, + *ScanPlanErrorHandler::Instance(), *catalog_session_)); + return {}; +} + +Result RestCatalog::FetchScanTasks( + const Table& table, const std::string& plan_task) { + ICEBERG_ENDPOINT_CHECK(supported_endpoints_, Endpoint::FetchScanTasks()); + ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->ScanTask(table.name())); + ICEBERG_ASSIGN_OR_RAISE(auto schema_ptr, table.schema()); + ICEBERG_ASSIGN_OR_RAISE(auto specs_ref, table.specs()); + + FetchScanTasksRequest request{.planTask = plan_task}; + ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(ToJson(request))); + ICEBERG_ASSIGN_OR_RAISE( + const auto response, + client_->Post(path, json_request, /*headers=*/{}, + *ScanPlanErrorHandler::Instance(), *catalog_session_)); + + ICEBERG_ASSIGN_OR_RAISE(auto json, FromJsonString(response.body())); + return FetchScanTasksResponseFromJson(json, specs_ref.get(), *schema_ptr); +} + } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/rest_catalog.h b/src/iceberg/catalog/rest/rest_catalog.h index 38230a5e2..014929d83 100644 --- a/src/iceberg/catalog/rest/rest_catalog.h +++ b/src/iceberg/catalog/rest/rest_catalog.h @@ -104,6 +104,14 @@ class ICEBERG_REST_EXPORT RestCatalog : public Catalog, const TableIdentifier& identifier, const std::string& metadata_file_location) override; + Result PlanTableScan( + const Table& table, const internal::TableScanContext& context) override; + Result FetchPlanningResult( + const Table& table, const std::string& plan_id) override; + Status CancelPlanning(const Table& table, const std::string& plan_id) override; + Result FetchScanTasks( + const Table& table, const std::string& plan_task) override; + private: RestCatalog(RestCatalogProperties config, std::shared_ptr file_io, std::unique_ptr client, std::unique_ptr paths, diff --git a/src/iceberg/catalog/rest/types.cc b/src/iceberg/catalog/rest/types.cc index 3abfb1406..8a90bfbb8 100644 --- a/src/iceberg/catalog/rest/types.cc +++ b/src/iceberg/catalog/rest/types.cc @@ -135,4 +135,85 @@ Status OAuthTokenResponse::Validate() const { return {}; } +Status PlanTableScanRequest::Validate() const { + if (snapshot_id.has_value()) { + if (start_snapshot_id.has_value() || end_snapshot_id.has_value()) { + return ValidationFailed( + "Invalid scan: cannot provide both snapshotId and startSnapshotId/endSnapshotId"); + } + } + if (start_snapshot_id.has_value() || end_snapshot_id.has_value()) { + if (!start_snapshot_id.has_value() || !end_snapshot_id.has_value()) { + return ValidationFailed( + "Invalid incremental scan: startSnapshotId and endSnapshotId is required"); + } + } + if (min_rows_required.has_value() && min_rows_required.value() < 0) { + return ValidationFailed("Invalid scan: minRowsRequested is negative"); + } + return {}; +} + +Status PlanTableScanResponse::Validate() const { + if (plan_status.empty()) { + return ValidationFailed("Invalid response: plan status must be defined"); + } + if (plan_status == "submitted" && plan_id.empty()) { + return ValidationFailed( + "Invalid response: plan id should be defined when status is 'submitted'"); + } + if (plan_status == "cancelled") { + return ValidationFailed( + "Invalid response: 'cancelled' is not a valid status for planTableScan"); + } + if (plan_status != "completed" && (!plan_tasks.empty() || !file_scan_tasks.empty())) { + return ValidationFailed( + "Invalid response: tasks can only be defined when status is 'completed'"); + } + if (!plan_id.empty() && plan_status != "submitted" && plan_status != "completed") { + return ValidationFailed( + "Invalid response: plan id can only be defined when status is 'submitted' or 'completed'"); + } + if (file_scan_tasks.empty() && !delete_files.empty()) { + return ValidationFailed( + "Invalid response: deleteFiles should only be returned with fileScanTasks that reference them"); + } + return {}; +} + +Status FetchPlanningResultResponse::Validate() const { + if (plan_status.ToString() == "unknown") { + return ValidationFailed("Invalid status: null"); + } + if (plan_status.ToString() != "completed" && + (!plan_tasks.empty() || !file_scan_tasks.empty())) { + return ValidationFailed( + "Invalid response: tasks can only be returned in a 'completed' status"); + } + if (file_scan_tasks.empty() && !delete_files.empty()) { + return ValidationFailed( + "Invalid response: deleteFiles should only be returned with fileScanTasks that reference them"); + } + return {}; +} + +Status FetchScanTasksRequest::Validate() const { + if (planTask.empty()) { + return ValidationFailed("Invalid planTask: null"); + } + return {}; +} + +Status FetchScanTasksResponse::Validate() const { + if (file_scan_tasks.empty() && !delete_files.empty()) { + return ValidationFailed( + "Invalid response: deleteFiles should only be returned with fileScanTasks that reference them"); + } + if (plan_tasks.empty() && file_scan_tasks.empty()) { + return ValidationFailed( + "Invalid response: planTasks and fileScanTask cannot both be null"); + } + return {}; +} + } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/types.h b/src/iceberg/catalog/rest/types.h index 6495a6517..1bbb197d8 100644 --- a/src/iceberg/catalog/rest/types.h +++ b/src/iceberg/catalog/rest/types.h @@ -28,9 +28,12 @@ #include "iceberg/catalog/rest/endpoint.h" #include "iceberg/catalog/rest/iceberg_rest_export.h" +#include "iceberg/expression/expression.h" +#include "iceberg/manifest/manifest_entry.h" #include "iceberg/result.h" #include "iceberg/schema.h" #include "iceberg/table_identifier.h" +#include "iceberg/table_scan.h" #include "iceberg/type_fwd.h" #include "iceberg/util/macros.h" @@ -295,4 +298,64 @@ struct ICEBERG_REST_EXPORT OAuthTokenResponse { bool operator==(const OAuthTokenResponse&) const = default; }; +struct ICEBERG_REST_EXPORT PlanTableScanRequest { + std::optional snapshot_id; + std::vector select; + std::shared_ptr filter; + bool case_sensitive; + bool use_snapshot_schema; + std::optional start_snapshot_id; + std::optional end_snapshot_id; + std::vector statsFields; + std::optional min_rows_required; + + Status Validate() const; + + bool operator==(const PlanTableScanRequest&) const; +}; + +struct ICEBERG_REST_EXPORT BaseScanTaskResponse { + std::vector plan_tasks; + std::vector file_scan_tasks; + std::vector delete_files; + // std::unordered_map specsById; + + Status Validate() const { return {}; }; + + bool operator==(const BaseScanTaskResponse&) const; +}; + +struct ICEBERG_REST_EXPORT PlanTableScanResponse : BaseScanTaskResponse { + std::string plan_status; + std::string plan_id; + // TODO: Add credentials. + + Status Validate() const; + + bool operator==(const PlanTableScanResponse&) const; +}; + +struct ICEBERG_REST_EXPORT FetchPlanningResultResponse : BaseScanTaskResponse { + PlanStatus plan_status; + // TODO: Add credentials. + + Status Validate() const; + + bool operator==(const FetchPlanningResultResponse&) const; +}; + +struct ICEBERG_REST_EXPORT FetchScanTasksRequest { + std::string planTask; + + Status Validate() const; + + bool operator==(const FetchScanTasksRequest&) const; +}; + +struct ICEBERG_REST_EXPORT FetchScanTasksResponse : BaseScanTaskResponse { + Status Validate() const; + + bool operator==(const FetchScanTasksResponse&) const; +}; + } // namespace iceberg::rest diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 2d8c22255..995c9bf73 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -27,7 +27,10 @@ #include #include "iceberg/constants.h" +#include "iceberg/expression/json_serde_internal.h" +#include "iceberg/file_format.h" #include "iceberg/json_serde_internal.h" +#include "iceberg/manifest/manifest_entry.h" #include "iceberg/name_mapping.h" #include "iceberg/partition_field.h" #include "iceberg/partition_spec.h" @@ -40,6 +43,7 @@ #include "iceberg/table_metadata.h" #include "iceberg/table_properties.h" #include "iceberg/table_requirement.h" +#include "iceberg/table_scan.h" #include "iceberg/table_update.h" #include "iceberg/transform.h" #include "iceberg/type.h" @@ -225,6 +229,60 @@ constexpr std::string_view kRequirementAssertDefaultSortOrderID = constexpr std::string_view kLastAssignedFieldId = "last-assigned-field-id"; constexpr std::string_view kLastAssignedPartitionId = "last-assigned-partition-id"; +// FileScanTask / DataFile constants (kSpecId, kSortOrderId, and kFileSizeInBytes +// are already defined above) +constexpr std::string_view kDataFile = "data-file"; +constexpr std::string_view kDeleteFileReferences = "delete-file-references"; +constexpr std::string_view kResidualFilter = "residual-filter"; +constexpr std::string_view kContent = "content"; +constexpr std::string_view kFilePath = "file-path"; +constexpr std::string_view kFileFormat = "file-format"; +constexpr std::string_view kPartition = "partition"; +constexpr std::string_view kRecordCount = "record-count"; +constexpr std::string_view kColumnSizes = "column-sizes"; +constexpr std::string_view kValueCounts = "value-counts"; +constexpr std::string_view kNullValueCounts = "null-value-counts"; +constexpr std::string_view kNanValueCounts = "nan-value-counts"; +constexpr std::string_view kLowerBounds = "lower-bounds"; +constexpr std::string_view kUpperBounds = "upper-bounds"; +constexpr std::string_view kKeyMetadata = "key-metadata"; +constexpr std::string_view kSplitOffsets = "split-offsets"; +constexpr std::string_view kEqualityIds = "equality-ids"; +constexpr std::string_view kFirstRowId = "first-row-id"; +constexpr std::string_view kReferencedDataFile = "referenced-data-file"; +constexpr std::string_view kContentOffset = "content-offset"; +constexpr std::string_view kContentSizeInBytes = "content-size-in-bytes"; + +// Decode a base64-encoded string to raw bytes. +std::vector Base64Decode(std::string_view encoded) { + static const std::array kDecodeTable = [] { + std::array table; + table.fill(-1); + for (int i = 0; i < 26; i++) table[static_cast('A') + i] = i; + for (int i = 0; i < 26; i++) table[static_cast('a') + i] = 26 + i; + for (int i = 0; i < 10; i++) table[static_cast('0') + i] = 52 + i; + table[static_cast('+')] = 62; + table[static_cast('/')] = 63; + return table; + }(); + + std::vector decoded; + decoded.reserve(encoded.size() * 3 / 4); + int val = 0, bits = -8; + for (unsigned char c : encoded) { + if (c == '=') break; + const int d = kDecodeTable[c]; + if (d == -1) continue; + val = (val << 6) + d; + bits += 6; + if (bits >= 0) { + decoded.push_back(static_cast((val >> bits) & 0xFF)); + bits -= 8; + } + } + return decoded; +} + } // namespace nlohmann::json ToJson(const SortField& sort_field) { @@ -1723,4 +1781,198 @@ Result> TableRequirementFromJson( return JsonParseError("Unknown table requirement type: {}", type); } +Result DataFileFromJson( + const nlohmann::json& json, + const std::unordered_map>& partitionSpecById, + const Schema& schema) { + if (!json.is_object()) { + return JsonParseError("DataFile must be a JSON object: {}", SafeDumpJson(json)); + } + DataFile df; + + ICEBERG_ASSIGN_OR_RAISE(auto content_str, GetJsonValue(json, kContent)); + const auto upper_content = StringUtils::ToUpper(content_str); + if (upper_content == "DATA") { + df.content = DataFile::Content::kData; + } else if (upper_content == "POSITION_DELETES") { + df.content = DataFile::Content::kPositionDeletes; + } else if (upper_content == "EQUALITY_DELETES") { + df.content = DataFile::Content::kEqualityDeletes; + } else { + return JsonParseError("Unknown data file content: {}", content_str); + } + + ICEBERG_ASSIGN_OR_RAISE(df.file_path, GetJsonValue(json, kFilePath)); + ICEBERG_ASSIGN_OR_RAISE(auto format_str, GetJsonValue(json, kFileFormat)); + ICEBERG_ASSIGN_OR_RAISE(df.file_format, FileFormatTypeFromString(format_str)); + + if (json.contains(kSpecId) && !json.at(kSpecId).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(auto spec_id, GetJsonValue(json, kSpecId)); + df.partition_spec_id = spec_id; + } + + if (json.contains(kPartition)) { + ICEBERG_ASSIGN_OR_RAISE(auto partition_vals, + GetJsonValue(json, kPartition)); + if (!partition_vals.is_array()) { + return JsonParseError("PartitionValues must be a JSON array: {}", + SafeDumpJson(partition_vals)); + } + std::vector literals; + auto it = partitionSpecById.find(df.partition_spec_id.value_or(-1)); + if (it == partitionSpecById.end()) { + return JsonParseError("Invalid partition spec id: {}", + df.partition_spec_id.value_or(-1)); + } + ICEBERG_ASSIGN_OR_RAISE(auto struct_type, it->second->PartitionType(schema)); + auto fields = struct_type->fields(); + if (partition_vals.size() != fields.size()) { + return JsonParseError("Invalid partition data size: expected = {}, actual = {}", + fields.size(), partition_vals.size()); + } + for (size_t pos = 0; pos < fields.size(); ++pos) { + ICEBERG_ASSIGN_OR_RAISE( + auto literal, + LiteralFromJson(partition_vals[pos], fields[pos].type().get())); + literals.push_back(std::move(literal)); + } + df.partition = PartitionValues(std::move(literals)); + } + + ICEBERG_ASSIGN_OR_RAISE(df.record_count, GetJsonValue(json, kRecordCount)); + ICEBERG_ASSIGN_OR_RAISE(df.file_size_in_bytes, + GetJsonValue(json, kFileSizeInBytes)); + + // Parse CountMap: {"keys": [int, ...], "values": [long, ...]} + auto parse_int_map = [&](std::string_view key, + std::map& target) -> Status { + if (!json.contains(key) || json.at(key).is_null()) return {}; + ICEBERG_ASSIGN_OR_RAISE(auto map_json, GetJsonValue(json, key)); + ICEBERG_ASSIGN_OR_RAISE(auto keys, + GetTypedJsonValue>(map_json.at("keys"))); + ICEBERG_ASSIGN_OR_RAISE( + auto values, GetTypedJsonValue>(map_json.at("values"))); + if (keys.size() != values.size()) { + return JsonParseError("'{}' map keys and values have different lengths", key); + } + for (size_t i = 0; i < keys.size(); ++i) { + target[keys[i]] = values[i]; + } + return {}; + }; + + ICEBERG_RETURN_UNEXPECTED(parse_int_map(kColumnSizes, df.column_sizes)); + ICEBERG_RETURN_UNEXPECTED(parse_int_map(kValueCounts, df.value_counts)); + ICEBERG_RETURN_UNEXPECTED(parse_int_map(kNullValueCounts, df.null_value_counts)); + ICEBERG_RETURN_UNEXPECTED(parse_int_map(kNanValueCounts, df.nan_value_counts)); + + // Parse BinaryMap: {"keys": [int, ...], "values": [base64string, ...]} + auto parse_binary_map = [&](std::string_view key, + std::map>& target) -> Status { + if (!json.contains(key) || json.at(key).is_null()) return {}; + ICEBERG_ASSIGN_OR_RAISE(auto map_json, GetJsonValue(json, key)); + ICEBERG_ASSIGN_OR_RAISE(auto keys, + GetTypedJsonValue>(map_json.at("keys"))); + ICEBERG_ASSIGN_OR_RAISE( + auto values, + GetTypedJsonValue>(map_json.at("values"))); + if (keys.size() != values.size()) { + return JsonParseError("'{}' binary map keys and values have different lengths", key); + } + for (size_t i = 0; i < keys.size(); ++i) { + target[keys[i]] = Base64Decode(values[i]); + } + return {}; + }; + + ICEBERG_RETURN_UNEXPECTED(parse_binary_map(kLowerBounds, df.lower_bounds)); + ICEBERG_RETURN_UNEXPECTED(parse_binary_map(kUpperBounds, df.upper_bounds)); + + if (json.contains(kKeyMetadata) && !json.at(kKeyMetadata).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(auto key_meta_str, + GetJsonValue(json, kKeyMetadata)); + df.key_metadata = Base64Decode(key_meta_str); + } + if (json.contains(kSplitOffsets) && !json.at(kSplitOffsets).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.split_offsets, + GetJsonValue>(json, kSplitOffsets)); + } + if (json.contains(kEqualityIds) && !json.at(kEqualityIds).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.equality_ids, + GetJsonValue>(json, kEqualityIds)); + } + if (json.contains(kSortOrderId) && !json.at(kSortOrderId).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.sort_order_id, + GetJsonValue(json, kSortOrderId)); + } + if (json.contains(kFirstRowId) && !json.at(kFirstRowId).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.first_row_id, GetJsonValue(json, kFirstRowId)); + } + if (json.contains(kReferencedDataFile) && !json.at(kReferencedDataFile).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.referenced_data_file, + GetJsonValue(json, kReferencedDataFile)); + } + if (json.contains(kContentOffset) && !json.at(kContentOffset).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.content_offset, + GetJsonValue(json, kContentOffset)); + } + if (json.contains(kContentSizeInBytes) && !json.at(kContentSizeInBytes).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.content_size_in_bytes, + GetJsonValue(json, kContentSizeInBytes)); + } + + return df; +} + +Result> FileScanTasksFromJson( + const nlohmann::json& json, const std::vector& delete_files, + const std::unordered_map>& partitionSpecById, + const Schema& schema) { + if (!json.is_array()) { + return JsonParseError("Cannot parse file scan tasks from non-array: {}", + SafeDumpJson(json)); + } + std::vector file_scan_tasks; + for (const auto& task_json : json) { + if (!task_json.is_object()) { + return JsonParseError("Cannot parse file scan task from a non-object: {}", + SafeDumpJson(task_json)); + } + + ICEBERG_ASSIGN_OR_RAISE(auto data_file_json, + GetJsonValue(task_json, kDataFile)); + ICEBERG_ASSIGN_OR_RAISE(auto data_file, + DataFileFromJson(data_file_json, partitionSpecById, schema)); + + std::vector> task_delete_files; + if (task_json.contains(kDeleteFileReferences) && + !task_json.at(kDeleteFileReferences).is_null()) { + ICEBERG_ASSIGN_OR_RAISE( + auto refs, + GetJsonValue>(task_json, kDeleteFileReferences)); + for (int32_t ref : refs) { + if (ref < 0 || static_cast(ref) >= delete_files.size()) { + return JsonParseError( + "delete-file-references index {} is out of range (delete_files size: {})", + ref, delete_files.size()); + } + task_delete_files.push_back(std::make_shared(delete_files[ref])); + } + } + + std::shared_ptr residual_filter; + if (task_json.contains(kResidualFilter) && + !task_json.at(kResidualFilter).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(auto filter_json, + GetJsonValue(task_json, kResidualFilter)); + ICEBERG_ASSIGN_OR_RAISE(residual_filter, ExpressionFromJson(filter_json)); + } + + file_scan_tasks.emplace_back(std::make_shared(std::move(data_file)), + std::move(task_delete_files), + std::move(residual_filter)); + } + return file_scan_tasks; +} + } // namespace iceberg diff --git a/src/iceberg/json_serde_internal.h b/src/iceberg/json_serde_internal.h index 8699e3dd1..654c0f24c 100644 --- a/src/iceberg/json_serde_internal.h +++ b/src/iceberg/json_serde_internal.h @@ -28,9 +28,13 @@ #include +#include "iceberg/manifest/manifest_entry.h" +#include "iceberg/partition_spec.h" #include "iceberg/result.h" +#include "iceberg/schema.h" #include "iceberg/statistics_file.h" #include "iceberg/table_metadata.h" +#include "iceberg/table_scan.h" #include "iceberg/type_fwd.h" namespace iceberg { @@ -413,4 +417,35 @@ ICEBERG_EXPORT nlohmann::json ToJson(const TableRequirement& requirement); ICEBERG_EXPORT Result> TableRequirementFromJson( const nlohmann::json& json); +/// \brief Deserializes a JSON object into a `DataFile` object. +/// +/// Parses a DataFile from the REST Catalog JSON format. Maps (column-sizes, +/// value-counts, etc.) use the CountMap/BinaryMap parallel arrays format. +/// Binary fields (lower-bounds, upper-bounds, key-metadata) are base64-encoded. +/// +/// \param json The JSON object representing a `DataFile`. +/// \param partitionSpecById Map from spec ID to PartitionSpec for type-aware partition parsing. +/// \param schema The table schema, used with partitionSpecById to resolve partition types. +/// \return A `DataFile` object or an error if the conversion fails. +ICEBERG_EXPORT Result DataFileFromJson(const nlohmann::json& json); + +ICEBERG_EXPORT Result DataFileFromJson( + const nlohmann::json& json, + const std::unordered_map>& partitionSpecById, + const Schema& schema); + +/// \brief Deserializes a JSON array of file scan tasks. +/// +/// Each task may reference delete files by index into \p delete_files. +/// +/// \param json The JSON array of file scan task objects. +/// \param delete_files Delete files indexed by the tasks' delete-file-references. +/// \param partitionSpecById Map from spec ID to PartitionSpec for type-aware partition parsing. +/// \param schema The table schema, used with partitionSpecById to resolve partition types. +/// \return A vector of `FileScanTask` objects or an error if the conversion fails. +ICEBERG_EXPORT Result> FileScanTasksFromJson( + const nlohmann::json& json, const std::vector& delete_files, + const std::unordered_map>& partitionSpecById, + const Schema& schema); + } // namespace iceberg diff --git a/src/iceberg/result.h b/src/iceberg/result.h index 765508705..98246303d 100644 --- a/src/iceberg/result.h +++ b/src/iceberg/result.h @@ -48,6 +48,8 @@ enum class ErrorKind { kJsonParseError, kNamespaceNotEmpty, kNoSuchNamespace, + kNoSuchPlanId, + kNoSuchPlanTask, kNoSuchTable, kNoSuchView, kNotAllowed, @@ -111,6 +113,8 @@ DEFINE_ERROR_FUNCTION(IOError) DEFINE_ERROR_FUNCTION(JsonParseError) DEFINE_ERROR_FUNCTION(NamespaceNotEmpty) DEFINE_ERROR_FUNCTION(NoSuchNamespace) +DEFINE_ERROR_FUNCTION(NoSuchPlanId) +DEFINE_ERROR_FUNCTION(NoSuchPlanTask) DEFINE_ERROR_FUNCTION(NoSuchTable) DEFINE_ERROR_FUNCTION(NoSuchView) DEFINE_ERROR_FUNCTION(NotAllowed) diff --git a/src/iceberg/table_scan.h b/src/iceberg/table_scan.h index b21adcca6..1914a7824 100644 --- a/src/iceberg/table_scan.h +++ b/src/iceberg/table_scan.h @@ -35,6 +35,47 @@ namespace iceberg { +class ICEBERG_EXPORT PlanStatus { + public: + enum class Status { + kCompleted, + kSubmitted, + kCancelled, + kFailed, + kUnknown + }; + + PlanStatus() : status_(Status::kUnknown) {} + explicit PlanStatus(Status status) : status_(status) {} + + static Status FromString(std::string_view status_str) { + if (status_str == "completed") { + return Status::kCompleted; + } else if (status_str == "submitted") { + return Status::kSubmitted; + } else if (status_str == "cancelled") { + return Status::kCancelled; + } else if (status_str == "failed") { + return Status::kFailed; + } + return Status::kUnknown; + } + + const std::string ToString() const { + switch (status_) { + case Status::kCompleted: return "completed"; + case Status::kSubmitted: return "submitted"; + case Status::kCancelled: return "cancelled"; + case Status::kFailed: return "failed"; + default: return "unknown"; + } + } + + private: + Status status_; + +}; + /// \brief An abstract scan task. class ICEBERG_EXPORT ScanTask { public: diff --git a/src/iceberg/test/endpoint_test.cc b/src/iceberg/test/endpoint_test.cc index fcdc92a78..460eea4d6 100644 --- a/src/iceberg/test/endpoint_test.cc +++ b/src/iceberg/test/endpoint_test.cc @@ -145,6 +145,31 @@ TEST(EndpointTest, TransactionEndpoints) { EXPECT_EQ(commit_transaction.path(), "/v1/{prefix}/transactions/commit"); } +// Test predefined scan planning endpoints +TEST(EndpointTest, ScanPlanEndpoints) { + auto plan_table_scan = Endpoint::PlanTableScan(); + EXPECT_EQ(plan_table_scan.method(), HttpMethod::kPost); + EXPECT_EQ(plan_table_scan.path(), + "/v1/{prefix}/namespaces/{namespace}/tables/{table}/plan"); + EXPECT_EQ(plan_table_scan.ToString(), + "POST /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan"); + + auto fetch_planning_result = Endpoint::FetchPlanningResult(); + EXPECT_EQ(fetch_planning_result.method(), HttpMethod::kGet); + EXPECT_EQ(fetch_planning_result.path(), + "/v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan-id}"); + + auto cancel_planning = Endpoint::CancelPlanning(); + EXPECT_EQ(cancel_planning.method(), HttpMethod::kDelete); + EXPECT_EQ(cancel_planning.path(), + "/v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan-id}"); + + auto fetch_scan_tasks = Endpoint::FetchScanTasks(); + EXPECT_EQ(fetch_scan_tasks.method(), HttpMethod::kPost); + EXPECT_EQ(fetch_scan_tasks.path(), + "/v1/{prefix}/namespaces/{namespace}/tables/{table}/tasks"); +} + // Test endpoint equality TEST(EndpointTest, Equality) { auto endpoint1 = Endpoint::Make(HttpMethod::kGet, "/path"); @@ -239,9 +264,11 @@ TEST(EndpointTest, FromStringInvalid) { TEST(EndpointTest, StringRoundTrip) { // Create various endpoints and verify they survive string round-trip std::vector endpoints = { - Endpoint::ListNamespaces(), Endpoint::GetNamespaceProperties(), - Endpoint::CreateNamespace(), Endpoint::LoadTable(), - Endpoint::CreateTable(), Endpoint::DeleteTable(), + Endpoint::ListNamespaces(), Endpoint::GetNamespaceProperties(), + Endpoint::CreateNamespace(), Endpoint::LoadTable(), + Endpoint::CreateTable(), Endpoint::DeleteTable(), + Endpoint::PlanTableScan(), Endpoint::FetchPlanningResult(), + Endpoint::CancelPlanning(), Endpoint::FetchScanTasks(), }; for (const auto& original : endpoints) { diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index f019375d3..4daff9d4c 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -23,7 +23,9 @@ #include #include +#include "iceberg/file_format.h" #include "iceberg/json_serde_internal.h" +#include "iceberg/manifest/manifest_entry.h" #include "iceberg/name_mapping.h" #include "iceberg/partition_spec.h" #include "iceberg/schema.h" @@ -32,6 +34,7 @@ #include "iceberg/sort_order.h" #include "iceberg/statistics_file.h" #include "iceberg/table_requirement.h" +#include "iceberg/table_scan.h" #include "iceberg/table_update.h" #include "iceberg/test/matchers.h" #include "iceberg/transform.h" @@ -772,4 +775,230 @@ TEST(TableRequirementJsonTest, TableRequirementUnknownType) { EXPECT_THAT(result, HasErrorMessage("Unknown table requirement type")); } +// ---- DataFileFromJson tests ---- + +TEST(DataFileFromJsonTest, RequiredFieldsOnly) { + auto json = R"({ + "content": "DATA", + "file-path": "s3://bucket/data/file.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 12345, + "record-count": 100 + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + const auto& df = result.value(); + EXPECT_EQ(df.content, DataFile::Content::kData); + EXPECT_EQ(df.file_path, "s3://bucket/data/file.parquet"); + EXPECT_EQ(df.file_format, FileFormatType::kParquet); + EXPECT_EQ(df.file_size_in_bytes, 12345); + EXPECT_EQ(df.record_count, 100); + EXPECT_TRUE(df.column_sizes.empty()); + EXPECT_FALSE(df.sort_order_id.has_value()); + EXPECT_FALSE(df.partition_spec_id.has_value()); +} + +TEST(DataFileFromJsonTest, LowercaseContentAndFormat) { + // The REST API sends uppercase, but we should handle lowercase too. + auto json = R"({ + "content": "data", + "file-path": "s3://bucket/data/file.avro", + "file-format": "avro", + "file-size-in-bytes": 500, + "record-count": 10 + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result.value().content, DataFile::Content::kData); + EXPECT_EQ(result.value().file_format, FileFormatType::kAvro); +} + +TEST(DataFileFromJsonTest, WithOptionalFields) { + auto json = R"({ + "content": "DATA", + "file-path": "s3://bucket/data/file.parquet", + "file-format": "PARQUET", + "spec-id": 1, + "file-size-in-bytes": 12345, + "record-count": 100, + "column-sizes": {"keys": [1, 2], "values": [1000, 2000]}, + "value-counts": {"keys": [1, 2], "values": [100, 100]}, + "null-value-counts": {"keys": [1], "values": [0]}, + "nan-value-counts": {"keys": [2], "values": [5]}, + "split-offsets": [0, 4096], + "sort-order-id": 0 + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + const auto& df = result.value(); + EXPECT_EQ(df.partition_spec_id, 1); + ASSERT_EQ(df.column_sizes.size(), 2U); + EXPECT_EQ(df.column_sizes.at(1), 1000); + EXPECT_EQ(df.column_sizes.at(2), 2000); + ASSERT_EQ(df.value_counts.size(), 2U); + EXPECT_EQ(df.value_counts.at(1), 100); + ASSERT_EQ(df.null_value_counts.size(), 1U); + EXPECT_EQ(df.null_value_counts.at(1), 0); + ASSERT_EQ(df.nan_value_counts.size(), 1U); + EXPECT_EQ(df.nan_value_counts.at(2), 5); + ASSERT_EQ(df.split_offsets.size(), 2U); + EXPECT_EQ(df.split_offsets[0], 0); + EXPECT_EQ(df.split_offsets[1], 4096); + EXPECT_EQ(df.sort_order_id, 0); +} + +TEST(DataFileFromJsonTest, EqualityDeleteFile) { + auto json = R"({ + "content": "EQUALITY_DELETES", + "file-path": "s3://bucket/deletes/eq_delete.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 5000, + "record-count": 50, + "equality-ids": [1, 2] + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + const auto& df = result.value(); + EXPECT_EQ(df.content, DataFile::Content::kEqualityDeletes); + ASSERT_EQ(df.equality_ids.size(), 2U); + EXPECT_EQ(df.equality_ids[0], 1); + EXPECT_EQ(df.equality_ids[1], 2); +} + +TEST(DataFileFromJsonTest, PositionDeleteFileWithReferencedDataFile) { + auto json = R"({ + "content": "POSITION_DELETES", + "file-path": "s3://bucket/deletes/pos_delete.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 3000, + "record-count": 20, + "referenced-data-file": "s3://bucket/data/file.parquet" + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + const auto& df = result.value(); + EXPECT_EQ(df.content, DataFile::Content::kPositionDeletes); + ASSERT_TRUE(df.referenced_data_file.has_value()); + EXPECT_EQ(df.referenced_data_file.value(), "s3://bucket/data/file.parquet"); +} + +TEST(DataFileFromJsonTest, InvalidContentType) { + auto json = R"({ + "content": "UNKNOWN", + "file-path": "s3://bucket/file.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 100, + "record-count": 10 + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); + EXPECT_THAT(result, HasErrorMessage("Unknown data file content")); +} + +TEST(DataFileFromJsonTest, MissingRequiredField) { + // Missing "file-path" + auto json = R"({ + "content": "DATA", + "file-format": "PARQUET", + "file-size-in-bytes": 100, + "record-count": 10 + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); +} + +TEST(DataFileFromJsonTest, NotAnObject) { + auto result = DataFileFromJson(nlohmann::json::array(), {}, Schema({}, 0)); + EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); + EXPECT_THAT(result, HasErrorMessage("DataFile must be a JSON object")); +} + +// ---- FileScanTasksFromJson tests ---- + +TEST(FileScanTasksFromJsonTest, EmptyArray) { + auto result = FileScanTasksFromJson(nlohmann::json::array(), {}, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + EXPECT_TRUE(result.value().empty()); +} + +TEST(FileScanTasksFromJsonTest, SingleTaskNoDeleteFiles) { + auto json = R"([{ + "data-file": { + "content": "DATA", + "file-path": "s3://bucket/data/file.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 12345, + "record-count": 100 + } + }])"_json; + + auto result = FileScanTasksFromJson(json, {}, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + ASSERT_EQ(result.value().size(), 1U); + const auto& task = result.value()[0]; + ASSERT_NE(task.data_file(), nullptr); + EXPECT_EQ(task.data_file()->file_path, "s3://bucket/data/file.parquet"); + EXPECT_TRUE(task.delete_files().empty()); + EXPECT_EQ(task.residual_filter(), nullptr); +} + +TEST(FileScanTasksFromJsonTest, TaskWithDeleteFileReferences) { + DataFile delete_file; + delete_file.content = DataFile::Content::kPositionDeletes; + delete_file.file_path = "s3://bucket/deletes/pos_delete.parquet"; + delete_file.file_format = FileFormatType::kParquet; + delete_file.file_size_in_bytes = 1000; + delete_file.record_count = 5; + + auto json = R"([{ + "data-file": { + "content": "DATA", + "file-path": "s3://bucket/data/file.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 12345, + "record-count": 100 + }, + "delete-file-references": [0] + }])"_json; + + auto result = FileScanTasksFromJson(json, {delete_file}, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + ASSERT_EQ(result.value().size(), 1U); + const auto& task = result.value()[0]; + ASSERT_EQ(task.delete_files().size(), 1U); + EXPECT_EQ(task.delete_files()[0]->file_path, + "s3://bucket/deletes/pos_delete.parquet"); +} + +TEST(FileScanTasksFromJsonTest, DeleteFileReferenceOutOfRange) { + auto json = R"([{ + "data-file": { + "content": "DATA", + "file-path": "s3://bucket/data/file.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 100, + "record-count": 10 + }, + "delete-file-references": [5] + }])"_json; + + // No delete files provided, so index 5 is out of range + auto result = FileScanTasksFromJson(json, {}, {}, Schema({}, 0)); + EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); + EXPECT_THAT(result, HasErrorMessage("out of range")); +} + +TEST(FileScanTasksFromJsonTest, NotAnArray) { + auto result = FileScanTasksFromJson(nlohmann::json::object(), {}, {}, Schema({}, 0)); + EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); + EXPECT_THAT(result, HasErrorMessage("non-array")); +} + } // namespace iceberg diff --git a/src/iceberg/test/rest_catalog_integration_test.cc b/src/iceberg/test/rest_catalog_integration_test.cc index b364ffd36..1bd0807be 100644 --- a/src/iceberg/test/rest_catalog_integration_test.cc +++ b/src/iceberg/test/rest_catalog_integration_test.cc @@ -40,6 +40,7 @@ #include "iceberg/catalog/rest/json_serde_internal.h" #include "iceberg/catalog/rest/rest_catalog.h" #include "iceberg/partition_spec.h" +#include "iceberg/table_scan.h" #include "iceberg/result.h" #include "iceberg/schema.h" #include "iceberg/sort_order.h" @@ -448,6 +449,95 @@ TEST_F(RestCatalogIntegrationTest, StageCreateTable) { EXPECT_EQ(committed->metadata()->properties.configs().at("key1"), "value1"); } +// -- Scan plan operations -- + +TEST_F(RestCatalogIntegrationTest, PlanTableScan) { + Namespace ns{.levels = {"test_plan_table_scan"}}; + ICEBERG_UNWRAP_OR_FAIL(auto catalog, CreateCatalogAndNamespace(ns)); + TableIdentifier table_id{.ns = ns, .name = "scan_table"}; + ASSERT_THAT(CreateDefaultTable(catalog, table_id), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto table, catalog->LoadTable(table_id)); + + internal::TableScanContext context; + ICEBERG_UNWRAP_OR_FAIL(auto response, catalog->PlanTableScan(*table, context)); + // Empty table: no file scan tasks returned + EXPECT_TRUE(response.file_scan_tasks.empty()); +} + +TEST_F(RestCatalogIntegrationTest, PlanTableScanWithContext) { + Namespace ns{.levels = {"test_plan_scan_context"}}; + ICEBERG_UNWRAP_OR_FAIL(auto catalog, CreateCatalogAndNamespace(ns)); + TableIdentifier table_id{.ns = ns, .name = "context_scan_table"}; + ASSERT_THAT(CreateDefaultTable(catalog, table_id), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto table, catalog->LoadTable(table_id)); + + internal::TableScanContext context; + context.selected_columns = {"id", "data"}; + context.case_sensitive = true; + ICEBERG_UNWRAP_OR_FAIL(auto response, catalog->PlanTableScan(*table, context)); + EXPECT_TRUE(response.file_scan_tasks.empty()); +} + +TEST_F(RestCatalogIntegrationTest, FetchPlanningResult) { + Namespace ns{.levels = {"test_fetch_planning_result"}}; + ICEBERG_UNWRAP_OR_FAIL(auto catalog, CreateCatalogAndNamespace(ns)); + TableIdentifier table_id{.ns = ns, .name = "fetch_plan_table"}; + ASSERT_THAT(CreateDefaultTable(catalog, table_id), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto table, catalog->LoadTable(table_id)); + + internal::TableScanContext context; + ICEBERG_UNWRAP_OR_FAIL(auto plan_response, catalog->PlanTableScan(*table, context)); + + if (!plan_response.plan_id.empty()) { + // Async plan: fetch the result using the returned plan_id + ICEBERG_UNWRAP_OR_FAIL(auto fetch_response, + catalog->FetchPlanningResult(*table, plan_response.plan_id)); + EXPECT_TRUE(fetch_response.file_scan_tasks.empty()); + } else { + // Synchronous plan: file_scan_tasks already returned; verify nonexistent plan_id errors + EXPECT_FALSE(catalog->FetchPlanningResult(*table, "nonexistent-plan-id").has_value()); + } +} + +TEST_F(RestCatalogIntegrationTest, CancelPlanning) { + Namespace ns{.levels = {"test_cancel_planning"}}; + ICEBERG_UNWRAP_OR_FAIL(auto catalog, CreateCatalogAndNamespace(ns)); + TableIdentifier table_id{.ns = ns, .name = "cancel_plan_table"}; + ASSERT_THAT(CreateDefaultTable(catalog, table_id), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto table, catalog->LoadTable(table_id)); + + internal::TableScanContext context; + ICEBERG_UNWRAP_OR_FAIL(auto plan_response, catalog->PlanTableScan(*table, context)); + + if (!plan_response.plan_id.empty()) { + // Async plan: cancel it + ASSERT_THAT(catalog->CancelPlanning(*table, plan_response.plan_id), IsOk()); + } else { + // Synchronous plan: verify cancelling a nonexistent plan_id returns an error + EXPECT_FALSE(catalog->CancelPlanning(*table, "nonexistent-plan-id").has_value()); + } +} + +TEST_F(RestCatalogIntegrationTest, FetchScanTasks) { + Namespace ns{.levels = {"test_fetch_scan_tasks"}}; + ICEBERG_UNWRAP_OR_FAIL(auto catalog, CreateCatalogAndNamespace(ns)); + TableIdentifier table_id{.ns = ns, .name = "fetch_tasks_table"}; + ASSERT_THAT(CreateDefaultTable(catalog, table_id), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto table, catalog->LoadTable(table_id)); + + internal::TableScanContext context; + ICEBERG_UNWRAP_OR_FAIL(auto plan_response, catalog->PlanTableScan(*table, context)); + + if (!plan_response.plan_tasks.empty()) { + // Use the first plan task token to fetch scan tasks + ICEBERG_UNWRAP_OR_FAIL( + auto tasks_response, + catalog->FetchScanTasks(*table, plan_response.plan_tasks[0])); + EXPECT_TRUE(tasks_response.file_scan_tasks.empty()); + } + // If synchronous plan (no plan_tasks), file_scan_tasks are already in plan_response +} + // -- Snapshot loading mode -- TEST_F(RestCatalogIntegrationTest, LoadTableWithSnapshotModeAll) { From 4279ba0c41c895d29af5fbeddbf378b6fc2e3f2e Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sat, 4 Apr 2026 23:33:35 -0700 Subject: [PATCH 02/17] Comments and stuff --- src/iceberg/catalog.h | 22 +++++++++++ src/iceberg/catalog/rest/error_handlers.h | 1 + src/iceberg/catalog/rest/json_serde.cc | 3 ++ src/iceberg/catalog/rest/resource_paths.h | 6 +++ src/iceberg/catalog/rest/types.cc | 46 +++++++++++++++++++++++ src/iceberg/json_serde.cc | 45 ++++------------------ 6 files changed, 85 insertions(+), 38 deletions(-) diff --git a/src/iceberg/catalog.h b/src/iceberg/catalog.h index c1f2530ea..18e76e4f5 100644 --- a/src/iceberg/catalog.h +++ b/src/iceberg/catalog.h @@ -191,17 +191,39 @@ class ICEBERG_EXPORT Catalog { virtual Result> RegisterTable( const TableIdentifier& identifier, const std::string& metadata_file_location) = 0; + /// \brief Initiate a scan planning operation for the given table. + /// + /// \param table The table to scan. + /// \param context The scan context containing snapshot, filter, and other options. + /// \return A PlanTableScanResponse with the plan status and initial scan tasks. virtual Result PlanTableScan( const Table& table, const internal::TableScanContext& context) { return NotImplemented("PlanTableScan is not supported by this catalog"); } + + /// \brief Fetch the current status and results of an asynchronous scan plan. + /// + /// \param table The table being scanned. + /// \param plan_id The plan ID returned by PlanTableScan. + /// \return A FetchPlanningResultResponse with the current plan status and tasks. virtual Result FetchPlanningResult( const Table& table, const std::string& plan_id) { return NotImplemented("FetchPlanningResult is not supported by this catalog"); } + + /// \brief Cancel an in-progress scan planning operation. + /// + /// \param table The table being scanned. + /// \param plan_id The plan ID returned by PlanTableScan. virtual Status CancelPlanning(const Table& table, const std::string& plan_id) { return NotImplemented("CancelPlanning is not supported by this catalog"); } + + /// \brief Fetch the scan tasks for a given plan task token. + /// + /// \param table The table being scanned. + /// \param plan_task The plan task token returned in a scan plan response. + /// \return A FetchScanTasksResponse with the file scan tasks. virtual Result FetchScanTasks( const Table& table, const std::string& plan_task) { return NotImplemented("FetchScanTasks is not supported by this catalog"); diff --git a/src/iceberg/catalog/rest/error_handlers.h b/src/iceberg/catalog/rest/error_handlers.h index 69c7f5007..7c0823384 100644 --- a/src/iceberg/catalog/rest/error_handlers.h +++ b/src/iceberg/catalog/rest/error_handlers.h @@ -127,6 +127,7 @@ class ICEBERG_REST_EXPORT ViewCommitErrorHandler final : public DefaultErrorHand constexpr ViewCommitErrorHandler() = default; }; +/// \brief Scan plan operation error handler. class ICEBERG_REST_EXPORT ScanPlanErrorHandler final : public DefaultErrorHandler { public: static const std::shared_ptr& Instance(); diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index b34980f95..91753add4 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -605,6 +605,7 @@ Result PlanTableScanResponseFromJson( ICEBERG_ASSIGN_OR_RAISE(response.plan_id, GetJsonValue(json, kPlanId)); ICEBERG_RETURN_UNEXPECTED( BaseScanTaskResponseFromJson(json, &response, partition_specs_by_id, schema)); + ICEBERG_RETURN_UNEXPECTED(response.Validate()); return response; } @@ -618,6 +619,7 @@ Result FetchPlanningResultResponseFromJson( response.plan_status = PlanStatus(PlanStatus::FromString(status_str)); ICEBERG_RETURN_UNEXPECTED( BaseScanTaskResponseFromJson(json, &response, partition_specs_by_id, schema)); + ICEBERG_RETURN_UNEXPECTED(response.Validate()); return response; } @@ -628,6 +630,7 @@ Result FetchScanTasksResponseFromJson( FetchScanTasksResponse response; ICEBERG_RETURN_UNEXPECTED( BaseScanTaskResponseFromJson(json, &response, partition_specs_by_id, schema)); + ICEBERG_RETURN_UNEXPECTED(response.Validate()); return response; } diff --git a/src/iceberg/catalog/rest/resource_paths.h b/src/iceberg/catalog/rest/resource_paths.h index dbb3b8bf4..cf7e98fc3 100644 --- a/src/iceberg/catalog/rest/resource_paths.h +++ b/src/iceberg/catalog/rest/resource_paths.h @@ -81,8 +81,14 @@ class ICEBERG_REST_EXPORT ResourcePaths { /// \brief Get the /v1/{prefix}/transactions/commit endpoint path. Result CommitTransaction() const; + /// \brief Get the /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan endpoint path. Result ScanPlan(const TableIdentifier& ident) const; + + /// \brief Get the /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan_id} + /// endpoint path. Result ScanPlan(const TableIdentifier& ident, const std::string& plan_id) const; + + /// \brief Get the /v1/{prefix}/namespaces/{namespace}/tables/{table}/tasks endpoint path. Result ScanTask(const TableIdentifier& ident) const; private: diff --git a/src/iceberg/catalog/rest/types.cc b/src/iceberg/catalog/rest/types.cc index 8a90bfbb8..5a2e19e22 100644 --- a/src/iceberg/catalog/rest/types.cc +++ b/src/iceberg/catalog/rest/types.cc @@ -118,6 +118,52 @@ bool CommitTableResponse::operator==(const CommitTableResponse& other) const { return true; } +bool PlanTableScanRequest::operator==(const PlanTableScanRequest& other) const { + return snapshot_id == other.snapshot_id && select == other.select && + filter == other.filter && case_sensitive == other.case_sensitive && + use_snapshot_schema == other.use_snapshot_schema && + start_snapshot_id == other.start_snapshot_id && + end_snapshot_id == other.end_snapshot_id && statsFields == other.statsFields && + min_rows_required == other.min_rows_required; +} + +bool BaseScanTaskResponse::operator==(const BaseScanTaskResponse& other) const { + if (plan_tasks != other.plan_tasks) return false; + if (delete_files != other.delete_files) return false; + if (file_scan_tasks.size() != other.file_scan_tasks.size()) return false; + for (size_t i = 0; i < file_scan_tasks.size(); ++i) { + const auto& a = file_scan_tasks[i]; + const auto& b = other.file_scan_tasks[i]; + if (!a.data_file() != !b.data_file()) return false; + if (a.data_file() && *a.data_file() != *b.data_file()) return false; + if (a.delete_files().size() != b.delete_files().size()) return false; + for (size_t j = 0; j < a.delete_files().size(); ++j) { + if (!a.delete_files()[j] != !b.delete_files()[j]) return false; + if (a.delete_files()[j] && *a.delete_files()[j] != *b.delete_files()[j]) return false; + } + if (a.residual_filter() != b.residual_filter()) return false; + } + return true; +} + +bool PlanTableScanResponse::operator==(const PlanTableScanResponse& other) const { + return BaseScanTaskResponse::operator==(other) && plan_status == other.plan_status && + plan_id == other.plan_id; +} + +bool FetchPlanningResultResponse::operator==(const FetchPlanningResultResponse& other) const { + return BaseScanTaskResponse::operator==(other) && + plan_status.ToString() == other.plan_status.ToString(); +} + +bool FetchScanTasksRequest::operator==(const FetchScanTasksRequest& other) const { + return planTask == other.planTask; +} + +bool FetchScanTasksResponse::operator==(const FetchScanTasksResponse& other) const { + return BaseScanTaskResponse::operator==(other); +} + Status OAuthTokenResponse::Validate() const { if (access_token.empty()) { return ValidationFailed("OAuth2 token response missing required 'access_token'"); diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 995c9bf73..1dc3537a8 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -253,35 +253,6 @@ constexpr std::string_view kReferencedDataFile = "referenced-data-file"; constexpr std::string_view kContentOffset = "content-offset"; constexpr std::string_view kContentSizeInBytes = "content-size-in-bytes"; -// Decode a base64-encoded string to raw bytes. -std::vector Base64Decode(std::string_view encoded) { - static const std::array kDecodeTable = [] { - std::array table; - table.fill(-1); - for (int i = 0; i < 26; i++) table[static_cast('A') + i] = i; - for (int i = 0; i < 26; i++) table[static_cast('a') + i] = 26 + i; - for (int i = 0; i < 10; i++) table[static_cast('0') + i] = 52 + i; - table[static_cast('+')] = 62; - table[static_cast('/')] = 63; - return table; - }(); - - std::vector decoded; - decoded.reserve(encoded.size() * 3 / 4); - int val = 0, bits = -8; - for (unsigned char c : encoded) { - if (c == '=') break; - const int d = kDecodeTable[c]; - if (d == -1) continue; - val = (val << 6) + d; - bits += 6; - if (bits >= 0) { - decoded.push_back(static_cast((val >> bits) & 0xFF)); - bits -= 8; - } - } - return decoded; -} } // namespace @@ -1866,21 +1837,20 @@ Result DataFileFromJson( ICEBERG_RETURN_UNEXPECTED(parse_int_map(kNullValueCounts, df.null_value_counts)); ICEBERG_RETURN_UNEXPECTED(parse_int_map(kNanValueCounts, df.nan_value_counts)); - // Parse BinaryMap: {"keys": [int, ...], "values": [base64string, ...]} + // Parse BinaryMap: {"keys": [int, ...], "values": [base64 binary, ...]} auto parse_binary_map = [&](std::string_view key, std::map>& target) -> Status { if (!json.contains(key) || json.at(key).is_null()) return {}; ICEBERG_ASSIGN_OR_RAISE(auto map_json, GetJsonValue(json, key)); ICEBERG_ASSIGN_OR_RAISE(auto keys, - GetTypedJsonValue>(map_json.at("keys"))); - ICEBERG_ASSIGN_OR_RAISE( - auto values, - GetTypedJsonValue>(map_json.at("values"))); + GetJsonValue>(map_json, "keys")); + ICEBERG_ASSIGN_OR_RAISE(auto values, + GetJsonValue>>(map_json, "values")); if (keys.size() != values.size()) { return JsonParseError("'{}' binary map keys and values have different lengths", key); } for (size_t i = 0; i < keys.size(); ++i) { - target[keys[i]] = Base64Decode(values[i]); + target[keys[i]] = values[i]; } return {}; }; @@ -1889,9 +1859,8 @@ Result DataFileFromJson( ICEBERG_RETURN_UNEXPECTED(parse_binary_map(kUpperBounds, df.upper_bounds)); if (json.contains(kKeyMetadata) && !json.at(kKeyMetadata).is_null()) { - ICEBERG_ASSIGN_OR_RAISE(auto key_meta_str, - GetJsonValue(json, kKeyMetadata)); - df.key_metadata = Base64Decode(key_meta_str); + ICEBERG_ASSIGN_OR_RAISE(df.key_metadata, + GetJsonValue>(json, kKeyMetadata)); } if (json.contains(kSplitOffsets) && !json.at(kSplitOffsets).is_null()) { ICEBERG_ASSIGN_OR_RAISE(df.split_offsets, From ddadb328db50c17dd87d714a4a6ff1a0fce7a885 Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sat, 4 Apr 2026 23:39:56 -0700 Subject: [PATCH 03/17] Comments --- src/iceberg/catalog/rest/json_serde.cc | 21 ++-- src/iceberg/catalog/rest/rest_catalog.cc | 5 +- src/iceberg/catalog/rest/types.h | 13 ++- src/iceberg/test/rest_json_serde_test.cc | 134 +++++++++++++++++++++++ 4 files changed, 156 insertions(+), 17 deletions(-) diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index 91753add4..b6f00877c 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -563,22 +563,13 @@ Status BaseScanTaskResponseFromJson( const std::unordered_map>& partition_specs_by_id, const Schema& schema) { // 1. plan_tasks - ICEBERG_ASSIGN_OR_RAISE(auto plan_tasks, - GetJsonValue(json, kPlanTasks)); - if (!plan_tasks.is_array()) { - return JsonParseError("Cannot parse plan tasks from non-array: {}", - SafeDumpJson(plan_tasks)); - } ICEBERG_ASSIGN_OR_RAISE(response->plan_tasks, - GetTypedJsonValue>(plan_tasks)); + GetJsonValueOrDefault>(json, kPlanTasks)); // 2. delete_files ICEBERG_ASSIGN_OR_RAISE(auto delete_files_json, - GetJsonValue(json, kDeleteFiles)); - if (!delete_files_json.is_array()) { - return JsonParseError("Cannot parse delete files from non-array: {}", - SafeDumpJson(delete_files_json)); - } + GetJsonValueOrDefault(json, kDeleteFiles, + nlohmann::json::array())); for (const auto& entry_json : delete_files_json) { ICEBERG_ASSIGN_OR_RAISE(auto delete_file, DataFileFromJson(entry_json, partition_specs_by_id, schema)); @@ -587,7 +578,8 @@ Status BaseScanTaskResponseFromJson( // 3. file_scan_tasks ICEBERG_ASSIGN_OR_RAISE(auto file_scan_tasks_json, - GetJsonValue(json, kFileScanTasks)); + GetJsonValueOrDefault(json, kFileScanTasks, + nlohmann::json::array())); ICEBERG_ASSIGN_OR_RAISE( response->file_scan_tasks, FileScanTasksFromJson(file_scan_tasks_json, response->delete_files, @@ -602,7 +594,8 @@ Result PlanTableScanResponseFromJson( PlanTableScanResponse response; ICEBERG_ASSIGN_OR_RAISE(response.plan_status, GetJsonValue(json, kPlanStatus)); - ICEBERG_ASSIGN_OR_RAISE(response.plan_id, GetJsonValue(json, kPlanId)); + ICEBERG_ASSIGN_OR_RAISE(response.plan_id, + GetJsonValueOrDefault(json, kPlanId)); ICEBERG_RETURN_UNEXPECTED( BaseScanTaskResponseFromJson(json, &response, partition_specs_by_id, schema)); ICEBERG_RETURN_UNEXPECTED(response.Validate()); diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index 2774f1bba..7b1657b86 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -512,15 +512,17 @@ Result RestCatalog::PlanTableScan( request.select = context.selected_columns; request.filter = context.filter; request.case_sensitive = context.case_sensitive; - request.use_snapshot_schema = false; // TODO + request.use_snapshot_schema = false; if (context.from_snapshot_id.has_value() && context.to_snapshot_id.has_value()) { request.start_snapshot_id = context.from_snapshot_id.value(); request.end_snapshot_id = context.to_snapshot_id.value(); + request.use_snapshot_schema = true; } if (context.min_rows_requested.has_value()) { request.min_rows_required = context.min_rows_requested.value(); } + ICEBERG_RETURN_UNEXPECTED(request.Validate()); ICEBERG_ASSIGN_OR_RAISE(auto request_json, ToJson(request)); ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(request_json)); ICEBERG_ASSIGN_OR_RAISE( @@ -566,6 +568,7 @@ Result RestCatalog::FetchScanTasks( ICEBERG_ASSIGN_OR_RAISE(auto specs_ref, table.specs()); FetchScanTasksRequest request{.planTask = plan_task}; + ICEBERG_RETURN_UNEXPECTED(request.Validate()); ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(ToJson(request))); ICEBERG_ASSIGN_OR_RAISE( const auto response, diff --git a/src/iceberg/catalog/rest/types.h b/src/iceberg/catalog/rest/types.h index 1bbb197d8..84d0c1c35 100644 --- a/src/iceberg/catalog/rest/types.h +++ b/src/iceberg/catalog/rest/types.h @@ -298,12 +298,13 @@ struct ICEBERG_REST_EXPORT OAuthTokenResponse { bool operator==(const OAuthTokenResponse&) const = default; }; +/// \brief Request to initiate a server-side scan planning operation. struct ICEBERG_REST_EXPORT PlanTableScanRequest { std::optional snapshot_id; std::vector select; std::shared_ptr filter; - bool case_sensitive; - bool use_snapshot_schema; + bool case_sensitive = true; + bool use_snapshot_schema = false; std::optional start_snapshot_id; std::optional end_snapshot_id; std::vector statsFields; @@ -314,6 +315,8 @@ struct ICEBERG_REST_EXPORT PlanTableScanRequest { bool operator==(const PlanTableScanRequest&) const; }; +/// \brief Base response containing scan tasks and delete files returned by scan plan +/// endpoints. struct ICEBERG_REST_EXPORT BaseScanTaskResponse { std::vector plan_tasks; std::vector file_scan_tasks; @@ -325,6 +328,8 @@ struct ICEBERG_REST_EXPORT BaseScanTaskResponse { bool operator==(const BaseScanTaskResponse&) const; }; +/// \brief Response from initiating a scan planning operation, including plan status and +/// initial scan tasks. struct ICEBERG_REST_EXPORT PlanTableScanResponse : BaseScanTaskResponse { std::string plan_status; std::string plan_id; @@ -335,6 +340,8 @@ struct ICEBERG_REST_EXPORT PlanTableScanResponse : BaseScanTaskResponse { bool operator==(const PlanTableScanResponse&) const; }; +/// \brief Response from polling an asynchronous scan plan, including current status and +/// available scan tasks. struct ICEBERG_REST_EXPORT FetchPlanningResultResponse : BaseScanTaskResponse { PlanStatus plan_status; // TODO: Add credentials. @@ -344,6 +351,7 @@ struct ICEBERG_REST_EXPORT FetchPlanningResultResponse : BaseScanTaskResponse { bool operator==(const FetchPlanningResultResponse&) const; }; +/// \brief Request to fetch the scan tasks for a given plan task token. struct ICEBERG_REST_EXPORT FetchScanTasksRequest { std::string planTask; @@ -352,6 +360,7 @@ struct ICEBERG_REST_EXPORT FetchScanTasksRequest { bool operator==(const FetchScanTasksRequest&) const; }; +/// \brief Response containing the file scan tasks for a given plan task token. struct ICEBERG_REST_EXPORT FetchScanTasksResponse : BaseScanTaskResponse { Status Validate() const; diff --git a/src/iceberg/test/rest_json_serde_test.cc b/src/iceberg/test/rest_json_serde_test.cc index 9da052e6a..a09235d46 100644 --- a/src/iceberg/test/rest_json_serde_test.cc +++ b/src/iceberg/test/rest_json_serde_test.cc @@ -26,6 +26,7 @@ #include "iceberg/catalog/rest/json_serde_internal.h" #include "iceberg/catalog/rest/types.h" #include "iceberg/partition_spec.h" +#include "iceberg/schema.h" #include "iceberg/result.h" #include "iceberg/sort_order.h" #include "iceberg/table_identifier.h" @@ -1380,4 +1381,137 @@ INSTANTIATE_TEST_SUITE_P( return info.param.test_name; }); +// Helper: empty schema and specs for scan response tests that don't need partition parsing. +static Schema EmptySchema() { return Schema({}, 0); } +static std::unordered_map> EmptySpecs() { return {}; } + +// --- PlanTableScanResponse --- + +TEST(PlanTableScanResponseFromJsonTest, SubmittedStatusMissingOptionalFields) { + // "submitted" response: only status and plan-id, no tasks + auto json = nlohmann::json::parse( + R"({"status":"submitted","plan-id":"abc-123"})"); + auto result = PlanTableScanResponseFromJson(json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result->plan_status, "submitted"); + EXPECT_EQ(result->plan_id, "abc-123"); + EXPECT_TRUE(result->plan_tasks.empty()); + EXPECT_TRUE(result->file_scan_tasks.empty()); + EXPECT_TRUE(result->delete_files.empty()); +} + +TEST(PlanTableScanResponseFromJsonTest, CompletedStatusWithPlanTasks) { + // "completed" response with plan-tasks but no file-scan-tasks + auto json = nlohmann::json::parse( + R"({"status":"completed","plan-id":"abc-123","plan-tasks":["task-1","task-2"],"delete-files":[],"file-scan-tasks":[]})"); + auto result = PlanTableScanResponseFromJson(json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result->plan_status, "completed"); + EXPECT_EQ(result->plan_id, "abc-123"); + ASSERT_EQ(result->plan_tasks.size(), 2); + EXPECT_EQ(result->plan_tasks[0], "task-1"); + EXPECT_EQ(result->plan_tasks[1], "task-2"); +} + +TEST(PlanTableScanResponseFromJsonTest, MissingRequiredStatus) { + auto json = nlohmann::json::parse(R"({"plan-id":"abc-123"})"); + auto result = PlanTableScanResponseFromJson(json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result, IsError(ErrorKind::kJsonParseError)); + EXPECT_THAT(result, HasErrorMessage("Missing 'status'")); +} + +TEST(PlanTableScanResponseFromJsonTest, MissingPlanIdDefaultsToEmptyForFailedStatus) { + // plan-id is optional for non-submitted/completed statuses + auto json = nlohmann::json::parse(R"({"status":"failed"})"); + auto result = PlanTableScanResponseFromJson(json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result, IsOk()); + EXPECT_TRUE(result->plan_id.empty()); +} + +// --- FetchPlanningResultResponse --- + +TEST(FetchPlanningResultResponseFromJsonTest, SubmittedStatusNoTasks) { + auto json = nlohmann::json::parse(R"({"status":"submitted"})"); + auto result = FetchPlanningResultResponseFromJson(json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result->plan_status.ToString(), "submitted"); + EXPECT_TRUE(result->plan_tasks.empty()); + EXPECT_TRUE(result->file_scan_tasks.empty()); + EXPECT_TRUE(result->delete_files.empty()); +} + +TEST(FetchPlanningResultResponseFromJsonTest, CompletedStatusWithPlanTasks) { + auto json = nlohmann::json::parse( + R"({"status":"completed","plan-tasks":["task-1"],"delete-files":[],"file-scan-tasks":[]})"); + auto result = FetchPlanningResultResponseFromJson(json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result->plan_status.ToString(), "completed"); + ASSERT_EQ(result->plan_tasks.size(), 1); + EXPECT_EQ(result->plan_tasks[0], "task-1"); +} + +TEST(FetchPlanningResultResponseFromJsonTest, MissingRequiredStatus) { + auto json = nlohmann::json::parse(R"({})"); + auto result = FetchPlanningResultResponseFromJson(json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result, IsError(ErrorKind::kJsonParseError)); + EXPECT_THAT(result, HasErrorMessage("Missing 'status'")); +} + +// --- FetchScanTasksResponse --- + +TEST(FetchScanTasksResponseFromJsonTest, WithFileScanTasks) { + // One file scan task with a data file and one delete file referenced by index. + auto json = nlohmann::json::parse(R"({ + "plan-tasks": [], + "delete-files": [ + { + "content": "POSITION_DELETES", + "file-path": "s3://bucket/deletes/delete.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 512, + "record-count": 5 + } + ], + "file-scan-tasks": [ + { + "data-file": { + "content": "DATA", + "file-path": "s3://bucket/data/file.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 12345, + "record-count": 100 + }, + "delete-file-references": [0] + } + ] + })"); + auto result = FetchScanTasksResponseFromJson(json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result, IsOk()); + EXPECT_TRUE(result->plan_tasks.empty()); + ASSERT_EQ(result->delete_files.size(), 1); + ASSERT_EQ(result->file_scan_tasks.size(), 1); + EXPECT_EQ(result->file_scan_tasks[0].data_file()->file_path, + "s3://bucket/data/file.parquet"); + ASSERT_EQ(result->file_scan_tasks[0].delete_files().size(), 1); + EXPECT_EQ(result->file_scan_tasks[0].delete_files()[0]->file_path, + "s3://bucket/deletes/delete.parquet"); +} + +TEST(FetchScanTasksResponseFromJsonTest, WithPlanTasksOnly) { + auto json = nlohmann::json::parse( + R"({"plan-tasks":["task-1","task-2"],"delete-files":[],"file-scan-tasks":[]})"); + auto result = FetchScanTasksResponseFromJson(json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result, IsOk()); + ASSERT_EQ(result->plan_tasks.size(), 2); + EXPECT_EQ(result->plan_tasks[0], "task-1"); + EXPECT_TRUE(result->file_scan_tasks.empty()); +} + +TEST(FetchScanTasksResponseFromJsonTest, AllFieldsMissing) { + // Both plan-tasks and file-scan-tasks absent → Validate() should fail + auto json = nlohmann::json::parse(R"({})"); + auto result = FetchScanTasksResponseFromJson(json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result, IsError(ErrorKind::kValidationFailed)); +} + } // namespace iceberg::rest From 36dab59336dfed3cba9ce3513570d3f001dcd24d Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sun, 5 Apr 2026 20:06:28 -0700 Subject: [PATCH 04/17] Test --- .../test/rest_catalog_integration_test.cc | 36 +++++++------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/src/iceberg/test/rest_catalog_integration_test.cc b/src/iceberg/test/rest_catalog_integration_test.cc index 1bd0807be..490d41a96 100644 --- a/src/iceberg/test/rest_catalog_integration_test.cc +++ b/src/iceberg/test/rest_catalog_integration_test.cc @@ -488,15 +488,10 @@ TEST_F(RestCatalogIntegrationTest, FetchPlanningResult) { internal::TableScanContext context; ICEBERG_UNWRAP_OR_FAIL(auto plan_response, catalog->PlanTableScan(*table, context)); - if (!plan_response.plan_id.empty()) { - // Async plan: fetch the result using the returned plan_id - ICEBERG_UNWRAP_OR_FAIL(auto fetch_response, - catalog->FetchPlanningResult(*table, plan_response.plan_id)); - EXPECT_TRUE(fetch_response.file_scan_tasks.empty()); - } else { - // Synchronous plan: file_scan_tasks already returned; verify nonexistent plan_id errors - EXPECT_FALSE(catalog->FetchPlanningResult(*table, "nonexistent-plan-id").has_value()); - } + // NOTE: apache/iceberg-rest-fixture always responds synchronously (status="completed") + // with a non-empty plan_id (e.g. "sync-"). Sync plans are immediately discarded + // server-side. Async paths are covered by unit tests. + EXPECT_FALSE(plan_response.plan_id.empty()); } TEST_F(RestCatalogIntegrationTest, CancelPlanning) { @@ -509,13 +504,10 @@ TEST_F(RestCatalogIntegrationTest, CancelPlanning) { internal::TableScanContext context; ICEBERG_UNWRAP_OR_FAIL(auto plan_response, catalog->PlanTableScan(*table, context)); - if (!plan_response.plan_id.empty()) { - // Async plan: cancel it - ASSERT_THAT(catalog->CancelPlanning(*table, plan_response.plan_id), IsOk()); - } else { - // Synchronous plan: verify cancelling a nonexistent plan_id returns an error - EXPECT_FALSE(catalog->CancelPlanning(*table, "nonexistent-plan-id").has_value()); - } + // NOTE: apache/iceberg-rest-fixture always responds synchronously — sync plan_id is + // accepted for cancellation (idempotent). Async paths are covered by unit tests. + ASSERT_FALSE(plan_response.plan_id.empty()); + ASSERT_THAT(catalog->CancelPlanning(*table, plan_response.plan_id), IsOk()); } TEST_F(RestCatalogIntegrationTest, FetchScanTasks) { @@ -528,14 +520,10 @@ TEST_F(RestCatalogIntegrationTest, FetchScanTasks) { internal::TableScanContext context; ICEBERG_UNWRAP_OR_FAIL(auto plan_response, catalog->PlanTableScan(*table, context)); - if (!plan_response.plan_tasks.empty()) { - // Use the first plan task token to fetch scan tasks - ICEBERG_UNWRAP_OR_FAIL( - auto tasks_response, - catalog->FetchScanTasks(*table, plan_response.plan_tasks[0])); - EXPECT_TRUE(tasks_response.file_scan_tasks.empty()); - } - // If synchronous plan (no plan_tasks), file_scan_tasks are already in plan_response + // NOTE: apache/iceberg-rest-fixture always responds synchronously — plan_tasks is + // always empty. FetchScanTasks is only relevant for async plans; async paths are + // covered by unit tests. + EXPECT_TRUE(plan_response.plan_tasks.empty()); } // -- Snapshot loading mode -- From 2647391656a0fe947117092aabb9dcd6f93e7625 Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sun, 5 Apr 2026 20:11:13 -0700 Subject: [PATCH 05/17] Precommit fixes --- src/iceberg/catalog/rest/json_serde.cc | 26 +++++++++++-------- src/iceberg/catalog/rest/resource_paths.h | 9 ++++--- src/iceberg/catalog/rest/rest_catalog.cc | 16 ++++++------ src/iceberg/catalog/rest/types.cc | 21 ++++++++++----- src/iceberg/catalog/rest/types.h | 4 +-- src/iceberg/json_serde.cc | 22 +++++++--------- .../test/rest_catalog_integration_test.cc | 2 +- src/iceberg/test/rest_json_serde_test.cc | 12 +++++---- 8 files changed, 62 insertions(+), 50 deletions(-) diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index b6f00877c..f36184498 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -560,16 +560,18 @@ nlohmann::json ToJson(const FetchScanTasksRequest& request) { Status BaseScanTaskResponseFromJson( const nlohmann::json& json, BaseScanTaskResponse* response, - const std::unordered_map>& partition_specs_by_id, + const std::unordered_map>& + partition_specs_by_id, const Schema& schema) { // 1. plan_tasks - ICEBERG_ASSIGN_OR_RAISE(response->plan_tasks, - GetJsonValueOrDefault>(json, kPlanTasks)); + ICEBERG_ASSIGN_OR_RAISE( + response->plan_tasks, + GetJsonValueOrDefault>(json, kPlanTasks)); // 2. delete_files - ICEBERG_ASSIGN_OR_RAISE(auto delete_files_json, - GetJsonValueOrDefault(json, kDeleteFiles, - nlohmann::json::array())); + ICEBERG_ASSIGN_OR_RAISE( + auto delete_files_json, + GetJsonValueOrDefault(json, kDeleteFiles, nlohmann::json::array())); for (const auto& entry_json : delete_files_json) { ICEBERG_ASSIGN_OR_RAISE(auto delete_file, DataFileFromJson(entry_json, partition_specs_by_id, schema)); @@ -589,7 +591,8 @@ Status BaseScanTaskResponseFromJson( Result PlanTableScanResponseFromJson( const nlohmann::json& json, - const std::unordered_map>& partition_specs_by_id, + const std::unordered_map>& + partition_specs_by_id, const Schema& schema) { PlanTableScanResponse response; ICEBERG_ASSIGN_OR_RAISE(response.plan_status, @@ -604,11 +607,11 @@ Result PlanTableScanResponseFromJson( Result FetchPlanningResultResponseFromJson( const nlohmann::json& json, - const std::unordered_map>& partition_specs_by_id, + const std::unordered_map>& + partition_specs_by_id, const Schema& schema) { FetchPlanningResultResponse response; - ICEBERG_ASSIGN_OR_RAISE(auto status_str, - GetJsonValue(json, kPlanStatus)); + ICEBERG_ASSIGN_OR_RAISE(auto status_str, GetJsonValue(json, kPlanStatus)); response.plan_status = PlanStatus(PlanStatus::FromString(status_str)); ICEBERG_RETURN_UNEXPECTED( BaseScanTaskResponseFromJson(json, &response, partition_specs_by_id, schema)); @@ -618,7 +621,8 @@ Result FetchPlanningResultResponseFromJson( Result FetchScanTasksResponseFromJson( const nlohmann::json& json, - const std::unordered_map>& partition_specs_by_id, + const std::unordered_map>& + partition_specs_by_id, const Schema& schema) { FetchScanTasksResponse response; ICEBERG_RETURN_UNEXPECTED( diff --git a/src/iceberg/catalog/rest/resource_paths.h b/src/iceberg/catalog/rest/resource_paths.h index cf7e98fc3..15afb35dc 100644 --- a/src/iceberg/catalog/rest/resource_paths.h +++ b/src/iceberg/catalog/rest/resource_paths.h @@ -81,14 +81,17 @@ class ICEBERG_REST_EXPORT ResourcePaths { /// \brief Get the /v1/{prefix}/transactions/commit endpoint path. Result CommitTransaction() const; - /// \brief Get the /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan endpoint path. + /// \brief Get the /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan endpoint + /// path. Result ScanPlan(const TableIdentifier& ident) const; /// \brief Get the /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan_id} /// endpoint path. - Result ScanPlan(const TableIdentifier& ident, const std::string& plan_id) const; + Result ScanPlan(const TableIdentifier& ident, + const std::string& plan_id) const; - /// \brief Get the /v1/{prefix}/namespaces/{namespace}/tables/{table}/tasks endpoint path. + /// \brief Get the /v1/{prefix}/namespaces/{namespace}/tables/{table}/tasks endpoint + /// path. Result ScanTask(const TableIdentifier& ident) const; private: diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index 7b1657b86..38738024d 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -527,8 +527,8 @@ Result RestCatalog::PlanTableScan( ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(request_json)); ICEBERG_ASSIGN_OR_RAISE( const auto response, - client_->Post(path, json_request, /*headers=*/{}, - *ScanPlanErrorHandler::Instance(), *catalog_session_)); + client_->Post(path, json_request, /*headers=*/{}, *ScanPlanErrorHandler::Instance(), + *catalog_session_)); ICEBERG_ASSIGN_OR_RAISE(auto json, FromJsonString(response.body())); return PlanTableScanResponseFromJson(json, specs_ref.get(), *schema_ptr); @@ -543,8 +543,8 @@ Result RestCatalog::FetchPlanningResult( ICEBERG_ASSIGN_OR_RAISE( const auto response, - client_->Get(path, /*params=*/{}, /*headers=*/{}, - *ScanPlanErrorHandler::Instance(), *catalog_session_)); + client_->Get(path, /*params=*/{}, /*headers=*/{}, *ScanPlanErrorHandler::Instance(), + *catalog_session_)); ICEBERG_ASSIGN_OR_RAISE(auto json, FromJsonString(response.body())); return FetchPlanningResultResponseFromJson(json, specs_ref.get(), *schema_ptr); } @@ -560,8 +560,8 @@ Status RestCatalog::CancelPlanning(const Table& table, const std::string& plan_i return {}; } -Result RestCatalog::FetchScanTasks( - const Table& table, const std::string& plan_task) { +Result RestCatalog::FetchScanTasks(const Table& table, + const std::string& plan_task) { ICEBERG_ENDPOINT_CHECK(supported_endpoints_, Endpoint::FetchScanTasks()); ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->ScanTask(table.name())); ICEBERG_ASSIGN_OR_RAISE(auto schema_ptr, table.schema()); @@ -572,8 +572,8 @@ Result RestCatalog::FetchScanTasks( ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(ToJson(request))); ICEBERG_ASSIGN_OR_RAISE( const auto response, - client_->Post(path, json_request, /*headers=*/{}, - *ScanPlanErrorHandler::Instance(), *catalog_session_)); + client_->Post(path, json_request, /*headers=*/{}, *ScanPlanErrorHandler::Instance(), + *catalog_session_)); ICEBERG_ASSIGN_OR_RAISE(auto json, FromJsonString(response.body())); return FetchScanTasksResponseFromJson(json, specs_ref.get(), *schema_ptr); diff --git a/src/iceberg/catalog/rest/types.cc b/src/iceberg/catalog/rest/types.cc index 5a2e19e22..0c05c8fdb 100644 --- a/src/iceberg/catalog/rest/types.cc +++ b/src/iceberg/catalog/rest/types.cc @@ -139,7 +139,8 @@ bool BaseScanTaskResponse::operator==(const BaseScanTaskResponse& other) const { if (a.delete_files().size() != b.delete_files().size()) return false; for (size_t j = 0; j < a.delete_files().size(); ++j) { if (!a.delete_files()[j] != !b.delete_files()[j]) return false; - if (a.delete_files()[j] && *a.delete_files()[j] != *b.delete_files()[j]) return false; + if (a.delete_files()[j] && *a.delete_files()[j] != *b.delete_files()[j]) + return false; } if (a.residual_filter() != b.residual_filter()) return false; } @@ -151,7 +152,8 @@ bool PlanTableScanResponse::operator==(const PlanTableScanResponse& other) const plan_id == other.plan_id; } -bool FetchPlanningResultResponse::operator==(const FetchPlanningResultResponse& other) const { +bool FetchPlanningResultResponse::operator==( + const FetchPlanningResultResponse& other) const { return BaseScanTaskResponse::operator==(other) && plan_status.ToString() == other.plan_status.ToString(); } @@ -185,7 +187,8 @@ Status PlanTableScanRequest::Validate() const { if (snapshot_id.has_value()) { if (start_snapshot_id.has_value() || end_snapshot_id.has_value()) { return ValidationFailed( - "Invalid scan: cannot provide both snapshotId and startSnapshotId/endSnapshotId"); + "Invalid scan: cannot provide both snapshotId and " + "startSnapshotId/endSnapshotId"); } } if (start_snapshot_id.has_value() || end_snapshot_id.has_value()) { @@ -218,11 +221,13 @@ Status PlanTableScanResponse::Validate() const { } if (!plan_id.empty() && plan_status != "submitted" && plan_status != "completed") { return ValidationFailed( - "Invalid response: plan id can only be defined when status is 'submitted' or 'completed'"); + "Invalid response: plan id can only be defined when status is 'submitted' or " + "'completed'"); } if (file_scan_tasks.empty() && !delete_files.empty()) { return ValidationFailed( - "Invalid response: deleteFiles should only be returned with fileScanTasks that reference them"); + "Invalid response: deleteFiles should only be returned with fileScanTasks that " + "reference them"); } return {}; } @@ -238,7 +243,8 @@ Status FetchPlanningResultResponse::Validate() const { } if (file_scan_tasks.empty() && !delete_files.empty()) { return ValidationFailed( - "Invalid response: deleteFiles should only be returned with fileScanTasks that reference them"); + "Invalid response: deleteFiles should only be returned with fileScanTasks that " + "reference them"); } return {}; } @@ -253,7 +259,8 @@ Status FetchScanTasksRequest::Validate() const { Status FetchScanTasksResponse::Validate() const { if (file_scan_tasks.empty() && !delete_files.empty()) { return ValidationFailed( - "Invalid response: deleteFiles should only be returned with fileScanTasks that reference them"); + "Invalid response: deleteFiles should only be returned with fileScanTasks that " + "reference them"); } if (plan_tasks.empty() && file_scan_tasks.empty()) { return ValidationFailed( diff --git a/src/iceberg/catalog/rest/types.h b/src/iceberg/catalog/rest/types.h index 84d0c1c35..febb876ea 100644 --- a/src/iceberg/catalog/rest/types.h +++ b/src/iceberg/catalog/rest/types.h @@ -320,12 +320,12 @@ struct ICEBERG_REST_EXPORT PlanTableScanRequest { struct ICEBERG_REST_EXPORT BaseScanTaskResponse { std::vector plan_tasks; std::vector file_scan_tasks; - std::vector delete_files; + std::vector delete_files; // std::unordered_map specsById; Status Validate() const { return {}; }; - bool operator==(const BaseScanTaskResponse&) const; + bool operator==(const BaseScanTaskResponse&) const; }; /// \brief Response from initiating a scan planning operation, including plan status and diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 1dc3537a8..a2a15a27d 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -253,7 +253,6 @@ constexpr std::string_view kReferencedDataFile = "referenced-data-file"; constexpr std::string_view kContentOffset = "content-offset"; constexpr std::string_view kContentSizeInBytes = "content-size-in-bytes"; - } // namespace nlohmann::json ToJson(const SortField& sort_field) { @@ -1803,8 +1802,7 @@ Result DataFileFromJson( } for (size_t pos = 0; pos < fields.size(); ++pos) { ICEBERG_ASSIGN_OR_RAISE( - auto literal, - LiteralFromJson(partition_vals[pos], fields[pos].type().get())); + auto literal, LiteralFromJson(partition_vals[pos], fields[pos].type().get())); literals.push_back(std::move(literal)); } df.partition = PartitionValues(std::move(literals)); @@ -1844,10 +1842,11 @@ Result DataFileFromJson( ICEBERG_ASSIGN_OR_RAISE(auto map_json, GetJsonValue(json, key)); ICEBERG_ASSIGN_OR_RAISE(auto keys, GetJsonValue>(map_json, "keys")); - ICEBERG_ASSIGN_OR_RAISE(auto values, - GetJsonValue>>(map_json, "values")); + ICEBERG_ASSIGN_OR_RAISE( + auto values, GetJsonValue>>(map_json, "values")); if (keys.size() != values.size()) { - return JsonParseError("'{}' binary map keys and values have different lengths", key); + return JsonParseError("'{}' binary map keys and values have different lengths", + key); } for (size_t i = 0; i < keys.size(); ++i) { target[keys[i]] = values[i]; @@ -1871,8 +1870,7 @@ Result DataFileFromJson( GetJsonValue>(json, kEqualityIds)); } if (json.contains(kSortOrderId) && !json.at(kSortOrderId).is_null()) { - ICEBERG_ASSIGN_OR_RAISE(df.sort_order_id, - GetJsonValue(json, kSortOrderId)); + ICEBERG_ASSIGN_OR_RAISE(df.sort_order_id, GetJsonValue(json, kSortOrderId)); } if (json.contains(kFirstRowId) && !json.at(kFirstRowId).is_null()) { ICEBERG_ASSIGN_OR_RAISE(df.first_row_id, GetJsonValue(json, kFirstRowId)); @@ -1916,9 +1914,8 @@ Result> FileScanTasksFromJson( std::vector> task_delete_files; if (task_json.contains(kDeleteFileReferences) && !task_json.at(kDeleteFileReferences).is_null()) { - ICEBERG_ASSIGN_OR_RAISE( - auto refs, - GetJsonValue>(task_json, kDeleteFileReferences)); + ICEBERG_ASSIGN_OR_RAISE(auto refs, GetJsonValue>( + task_json, kDeleteFileReferences)); for (int32_t ref : refs) { if (ref < 0 || static_cast(ref) >= delete_files.size()) { return JsonParseError( @@ -1930,8 +1927,7 @@ Result> FileScanTasksFromJson( } std::shared_ptr residual_filter; - if (task_json.contains(kResidualFilter) && - !task_json.at(kResidualFilter).is_null()) { + if (task_json.contains(kResidualFilter) && !task_json.at(kResidualFilter).is_null()) { ICEBERG_ASSIGN_OR_RAISE(auto filter_json, GetJsonValue(task_json, kResidualFilter)); ICEBERG_ASSIGN_OR_RAISE(residual_filter, ExpressionFromJson(filter_json)); diff --git a/src/iceberg/test/rest_catalog_integration_test.cc b/src/iceberg/test/rest_catalog_integration_test.cc index 490d41a96..7f03932c1 100644 --- a/src/iceberg/test/rest_catalog_integration_test.cc +++ b/src/iceberg/test/rest_catalog_integration_test.cc @@ -40,13 +40,13 @@ #include "iceberg/catalog/rest/json_serde_internal.h" #include "iceberg/catalog/rest/rest_catalog.h" #include "iceberg/partition_spec.h" -#include "iceberg/table_scan.h" #include "iceberg/result.h" #include "iceberg/schema.h" #include "iceberg/sort_order.h" #include "iceberg/table.h" #include "iceberg/table_identifier.h" #include "iceberg/table_requirement.h" +#include "iceberg/table_scan.h" #include "iceberg/table_update.h" #include "iceberg/test/matchers.h" #include "iceberg/test/std_io.h" diff --git a/src/iceberg/test/rest_json_serde_test.cc b/src/iceberg/test/rest_json_serde_test.cc index a09235d46..095a8ebad 100644 --- a/src/iceberg/test/rest_json_serde_test.cc +++ b/src/iceberg/test/rest_json_serde_test.cc @@ -26,8 +26,8 @@ #include "iceberg/catalog/rest/json_serde_internal.h" #include "iceberg/catalog/rest/types.h" #include "iceberg/partition_spec.h" -#include "iceberg/schema.h" #include "iceberg/result.h" +#include "iceberg/schema.h" #include "iceberg/sort_order.h" #include "iceberg/table_identifier.h" #include "iceberg/table_metadata.h" @@ -1381,16 +1381,18 @@ INSTANTIATE_TEST_SUITE_P( return info.param.test_name; }); -// Helper: empty schema and specs for scan response tests that don't need partition parsing. +// Helper: empty schema and specs for scan response tests that don't need partition +// parsing. static Schema EmptySchema() { return Schema({}, 0); } -static std::unordered_map> EmptySpecs() { return {}; } +static std::unordered_map> EmptySpecs() { + return {}; +} // --- PlanTableScanResponse --- TEST(PlanTableScanResponseFromJsonTest, SubmittedStatusMissingOptionalFields) { // "submitted" response: only status and plan-id, no tasks - auto json = nlohmann::json::parse( - R"({"status":"submitted","plan-id":"abc-123"})"); + auto json = nlohmann::json::parse(R"({"status":"submitted","plan-id":"abc-123"})"); auto result = PlanTableScanResponseFromJson(json, EmptySpecs(), EmptySchema()); ASSERT_THAT(result, IsOk()); EXPECT_EQ(result->plan_status, "submitted"); From b466a31e6e54c76e2335ada2d94e1ef5f2fdde90 Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sun, 5 Apr 2026 20:32:56 -0700 Subject: [PATCH 06/17] Fixes and cf --- src/iceberg/catalog/rest/json_serde.cc | 4 +-- src/iceberg/catalog/rest/types.cc | 8 ++--- src/iceberg/catalog/rest/types.h | 2 +- src/iceberg/json_serde.cc | 9 +++--- src/iceberg/table_scan.h | 41 ------------------------ src/iceberg/test/json_serde_test.cc | 22 ++++++------- src/iceberg/test/rest_json_serde_test.cc | 4 +-- 7 files changed, 22 insertions(+), 68 deletions(-) diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index f36184498..c2d4ccf4b 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -611,8 +611,8 @@ Result FetchPlanningResultResponseFromJson( partition_specs_by_id, const Schema& schema) { FetchPlanningResultResponse response; - ICEBERG_ASSIGN_OR_RAISE(auto status_str, GetJsonValue(json, kPlanStatus)); - response.plan_status = PlanStatus(PlanStatus::FromString(status_str)); + ICEBERG_ASSIGN_OR_RAISE(response.plan_status, + GetJsonValue(json, kPlanStatus)); ICEBERG_RETURN_UNEXPECTED( BaseScanTaskResponseFromJson(json, &response, partition_specs_by_id, schema)); ICEBERG_RETURN_UNEXPECTED(response.Validate()); diff --git a/src/iceberg/catalog/rest/types.cc b/src/iceberg/catalog/rest/types.cc index 0c05c8fdb..a5e6b79a7 100644 --- a/src/iceberg/catalog/rest/types.cc +++ b/src/iceberg/catalog/rest/types.cc @@ -154,8 +154,7 @@ bool PlanTableScanResponse::operator==(const PlanTableScanResponse& other) const bool FetchPlanningResultResponse::operator==( const FetchPlanningResultResponse& other) const { - return BaseScanTaskResponse::operator==(other) && - plan_status.ToString() == other.plan_status.ToString(); + return BaseScanTaskResponse::operator==(other) && plan_status == other.plan_status; } bool FetchScanTasksRequest::operator==(const FetchScanTasksRequest& other) const { @@ -233,11 +232,10 @@ Status PlanTableScanResponse::Validate() const { } Status FetchPlanningResultResponse::Validate() const { - if (plan_status.ToString() == "unknown") { + if (plan_status.empty()) { return ValidationFailed("Invalid status: null"); } - if (plan_status.ToString() != "completed" && - (!plan_tasks.empty() || !file_scan_tasks.empty())) { + if (plan_status != "completed" && (!plan_tasks.empty() || !file_scan_tasks.empty())) { return ValidationFailed( "Invalid response: tasks can only be returned in a 'completed' status"); } diff --git a/src/iceberg/catalog/rest/types.h b/src/iceberg/catalog/rest/types.h index febb876ea..2af2e0d0d 100644 --- a/src/iceberg/catalog/rest/types.h +++ b/src/iceberg/catalog/rest/types.h @@ -343,7 +343,7 @@ struct ICEBERG_REST_EXPORT PlanTableScanResponse : BaseScanTaskResponse { /// \brief Response from polling an asynchronous scan plan, including current status and /// available scan tasks. struct ICEBERG_REST_EXPORT FetchPlanningResultResponse : BaseScanTaskResponse { - PlanStatus plan_status; + std::string plan_status; // TODO: Add credentials. Status Validate() const; diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index a2a15a27d..72cad1024 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -1761,12 +1761,11 @@ Result DataFileFromJson( DataFile df; ICEBERG_ASSIGN_OR_RAISE(auto content_str, GetJsonValue(json, kContent)); - const auto upper_content = StringUtils::ToUpper(content_str); - if (upper_content == "DATA") { + if (content_str == ToString(DataFile::Content::kData)) { df.content = DataFile::Content::kData; - } else if (upper_content == "POSITION_DELETES") { + } else if (content_str == ToString(DataFile::Content::kPositionDeletes)) { df.content = DataFile::Content::kPositionDeletes; - } else if (upper_content == "EQUALITY_DELETES") { + } else if (content_str == ToString(DataFile::Content::kEqualityDeletes)) { df.content = DataFile::Content::kEqualityDeletes; } else { return JsonParseError("Unknown data file content: {}", content_str); @@ -1835,7 +1834,7 @@ Result DataFileFromJson( ICEBERG_RETURN_UNEXPECTED(parse_int_map(kNullValueCounts, df.null_value_counts)); ICEBERG_RETURN_UNEXPECTED(parse_int_map(kNanValueCounts, df.nan_value_counts)); - // Parse BinaryMap: {"keys": [int, ...], "values": [base64 binary, ...]} + // Parse BinaryMap: {"keys": [int, ...], "values": [...]} auto parse_binary_map = [&](std::string_view key, std::map>& target) -> Status { if (!json.contains(key) || json.at(key).is_null()) return {}; diff --git a/src/iceberg/table_scan.h b/src/iceberg/table_scan.h index 1914a7824..b21adcca6 100644 --- a/src/iceberg/table_scan.h +++ b/src/iceberg/table_scan.h @@ -35,47 +35,6 @@ namespace iceberg { -class ICEBERG_EXPORT PlanStatus { - public: - enum class Status { - kCompleted, - kSubmitted, - kCancelled, - kFailed, - kUnknown - }; - - PlanStatus() : status_(Status::kUnknown) {} - explicit PlanStatus(Status status) : status_(status) {} - - static Status FromString(std::string_view status_str) { - if (status_str == "completed") { - return Status::kCompleted; - } else if (status_str == "submitted") { - return Status::kSubmitted; - } else if (status_str == "cancelled") { - return Status::kCancelled; - } else if (status_str == "failed") { - return Status::kFailed; - } - return Status::kUnknown; - } - - const std::string ToString() const { - switch (status_) { - case Status::kCompleted: return "completed"; - case Status::kSubmitted: return "submitted"; - case Status::kCancelled: return "cancelled"; - case Status::kFailed: return "failed"; - default: return "unknown"; - } - } - - private: - Status status_; - -}; - /// \brief An abstract scan task. class ICEBERG_EXPORT ScanTask { public: diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index 4daff9d4c..6979a02f2 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -779,7 +779,7 @@ TEST(TableRequirementJsonTest, TableRequirementUnknownType) { TEST(DataFileFromJsonTest, RequiredFieldsOnly) { auto json = R"({ - "content": "DATA", + "content": "data", "file-path": "s3://bucket/data/file.parquet", "file-format": "PARQUET", "file-size-in-bytes": 12345, @@ -799,8 +799,7 @@ TEST(DataFileFromJsonTest, RequiredFieldsOnly) { EXPECT_FALSE(df.partition_spec_id.has_value()); } -TEST(DataFileFromJsonTest, LowercaseContentAndFormat) { - // The REST API sends uppercase, but we should handle lowercase too. +TEST(DataFileFromJsonTest, LowercaseFormat) { auto json = R"({ "content": "data", "file-path": "s3://bucket/data/file.avro", @@ -817,7 +816,7 @@ TEST(DataFileFromJsonTest, LowercaseContentAndFormat) { TEST(DataFileFromJsonTest, WithOptionalFields) { auto json = R"({ - "content": "DATA", + "content": "data", "file-path": "s3://bucket/data/file.parquet", "file-format": "PARQUET", "spec-id": 1, @@ -852,7 +851,7 @@ TEST(DataFileFromJsonTest, WithOptionalFields) { TEST(DataFileFromJsonTest, EqualityDeleteFile) { auto json = R"({ - "content": "EQUALITY_DELETES", + "content": "equality_deletes", "file-path": "s3://bucket/deletes/eq_delete.parquet", "file-format": "PARQUET", "file-size-in-bytes": 5000, @@ -871,7 +870,7 @@ TEST(DataFileFromJsonTest, EqualityDeleteFile) { TEST(DataFileFromJsonTest, PositionDeleteFileWithReferencedDataFile) { auto json = R"({ - "content": "POSITION_DELETES", + "content": "position_deletes", "file-path": "s3://bucket/deletes/pos_delete.parquet", "file-format": "PARQUET", "file-size-in-bytes": 3000, @@ -904,7 +903,7 @@ TEST(DataFileFromJsonTest, InvalidContentType) { TEST(DataFileFromJsonTest, MissingRequiredField) { // Missing "file-path" auto json = R"({ - "content": "DATA", + "content": "data", "file-format": "PARQUET", "file-size-in-bytes": 100, "record-count": 10 @@ -931,7 +930,7 @@ TEST(FileScanTasksFromJsonTest, EmptyArray) { TEST(FileScanTasksFromJsonTest, SingleTaskNoDeleteFiles) { auto json = R"([{ "data-file": { - "content": "DATA", + "content": "data", "file-path": "s3://bucket/data/file.parquet", "file-format": "PARQUET", "file-size-in-bytes": 12345, @@ -959,7 +958,7 @@ TEST(FileScanTasksFromJsonTest, TaskWithDeleteFileReferences) { auto json = R"([{ "data-file": { - "content": "DATA", + "content": "data", "file-path": "s3://bucket/data/file.parquet", "file-format": "PARQUET", "file-size-in-bytes": 12345, @@ -973,14 +972,13 @@ TEST(FileScanTasksFromJsonTest, TaskWithDeleteFileReferences) { ASSERT_EQ(result.value().size(), 1U); const auto& task = result.value()[0]; ASSERT_EQ(task.delete_files().size(), 1U); - EXPECT_EQ(task.delete_files()[0]->file_path, - "s3://bucket/deletes/pos_delete.parquet"); + EXPECT_EQ(task.delete_files()[0]->file_path, "s3://bucket/deletes/pos_delete.parquet"); } TEST(FileScanTasksFromJsonTest, DeleteFileReferenceOutOfRange) { auto json = R"([{ "data-file": { - "content": "DATA", + "content": "data", "file-path": "s3://bucket/data/file.parquet", "file-format": "PARQUET", "file-size-in-bytes": 100, diff --git a/src/iceberg/test/rest_json_serde_test.cc b/src/iceberg/test/rest_json_serde_test.cc index 095a8ebad..bd38c8ef4 100644 --- a/src/iceberg/test/rest_json_serde_test.cc +++ b/src/iceberg/test/rest_json_serde_test.cc @@ -1467,7 +1467,7 @@ TEST(FetchScanTasksResponseFromJsonTest, WithFileScanTasks) { "plan-tasks": [], "delete-files": [ { - "content": "POSITION_DELETES", + "content": "position_deletes", "file-path": "s3://bucket/deletes/delete.parquet", "file-format": "PARQUET", "file-size-in-bytes": 512, @@ -1477,7 +1477,7 @@ TEST(FetchScanTasksResponseFromJsonTest, WithFileScanTasks) { "file-scan-tasks": [ { "data-file": { - "content": "DATA", + "content": "data", "file-path": "s3://bucket/data/file.parquet", "file-format": "PARQUET", "file-size-in-bytes": 12345, From 121bcb8a9d1cccede7325ffa992f2647438cbdc6 Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sun, 5 Apr 2026 21:36:29 -0700 Subject: [PATCH 07/17] Fix --- src/iceberg/test/rest_json_serde_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/iceberg/test/rest_json_serde_test.cc b/src/iceberg/test/rest_json_serde_test.cc index bd38c8ef4..e000f26b9 100644 --- a/src/iceberg/test/rest_json_serde_test.cc +++ b/src/iceberg/test/rest_json_serde_test.cc @@ -1436,7 +1436,7 @@ TEST(FetchPlanningResultResponseFromJsonTest, SubmittedStatusNoTasks) { auto json = nlohmann::json::parse(R"({"status":"submitted"})"); auto result = FetchPlanningResultResponseFromJson(json, EmptySpecs(), EmptySchema()); ASSERT_THAT(result, IsOk()); - EXPECT_EQ(result->plan_status.ToString(), "submitted"); + EXPECT_EQ(result->plan_status, "submitted"); EXPECT_TRUE(result->plan_tasks.empty()); EXPECT_TRUE(result->file_scan_tasks.empty()); EXPECT_TRUE(result->delete_files.empty()); @@ -1447,7 +1447,7 @@ TEST(FetchPlanningResultResponseFromJsonTest, CompletedStatusWithPlanTasks) { R"({"status":"completed","plan-tasks":["task-1"],"delete-files":[],"file-scan-tasks":[]})"); auto result = FetchPlanningResultResponseFromJson(json, EmptySpecs(), EmptySchema()); ASSERT_THAT(result, IsOk()); - EXPECT_EQ(result->plan_status.ToString(), "completed"); + EXPECT_EQ(result->plan_status, "completed"); ASSERT_EQ(result->plan_tasks.size(), 1); EXPECT_EQ(result->plan_tasks[0], "task-1"); } From a8d5ee92289ddef4b01fa5c14cdd3b81d796c5eb Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sun, 5 Apr 2026 21:40:00 -0700 Subject: [PATCH 08/17] Clang fmt --- src/iceberg/catalog/rest/endpoint.h | 9 ++++++--- src/iceberg/catalog/rest/error_handlers.cc | 4 +--- src/iceberg/catalog/rest/json_serde_internal.h | 12 ++++++++---- src/iceberg/catalog/rest/resource_paths.cc | 3 ++- src/iceberg/catalog/rest/rest_catalog.h | 4 ++-- src/iceberg/json_serde_internal.h | 12 ++++++++---- src/iceberg/test/endpoint_test.cc | 10 +++++----- 7 files changed, 32 insertions(+), 22 deletions(-) diff --git a/src/iceberg/catalog/rest/endpoint.h b/src/iceberg/catalog/rest/endpoint.h index 6021be6c1..fdcd2108e 100644 --- a/src/iceberg/catalog/rest/endpoint.h +++ b/src/iceberg/catalog/rest/endpoint.h @@ -134,15 +134,18 @@ class ICEBERG_REST_EXPORT Endpoint { } static Endpoint FetchPlanningResult() { - return {HttpMethod::kGet, "/v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan-id}"}; + return {HttpMethod::kGet, + "/v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan-id}"}; } static Endpoint CancelPlanning() { - return {HttpMethod::kDelete, "/v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan-id}"}; + return {HttpMethod::kDelete, + "/v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan-id}"}; } static Endpoint FetchScanTasks() { - return {HttpMethod::kPost, "/v1/{prefix}/namespaces/{namespace}/tables/{table}/tasks"}; + return {HttpMethod::kPost, + "/v1/{prefix}/namespaces/{namespace}/tables/{table}/tasks"}; } private: diff --git a/src/iceberg/catalog/rest/error_handlers.cc b/src/iceberg/catalog/rest/error_handlers.cc index 74189c27d..f1aa6d4cf 100644 --- a/src/iceberg/catalog/rest/error_handlers.cc +++ b/src/iceberg/catalog/rest/error_handlers.cc @@ -187,8 +187,7 @@ Status ViewCommitErrorHandler::Accept(const ErrorResponse& error) const { } const std::shared_ptr& ScanPlanErrorHandler::Instance() { - static const std::shared_ptr instance{ - new ScanPlanErrorHandler()}; + static const std::shared_ptr instance{new ScanPlanErrorHandler()}; return instance; } @@ -215,5 +214,4 @@ Status ScanPlanErrorHandler::Accept(const ErrorResponse& error) const { return DefaultErrorHandler::Accept(error); } - } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/json_serde_internal.h b/src/iceberg/catalog/rest/json_serde_internal.h index e50f85d1a..cdd2dab4d 100644 --- a/src/iceberg/catalog/rest/json_serde_internal.h +++ b/src/iceberg/catalog/rest/json_serde_internal.h @@ -68,17 +68,21 @@ ICEBERG_DECLARE_JSON_SERDE(OAuthTokenResponse) ICEBERG_REST_EXPORT Result PlanTableScanResponseFromJson( const nlohmann::json& json, - const std::unordered_map>& partition_specs_by_id, + const std::unordered_map>& + partition_specs_by_id, const Schema& schema); -ICEBERG_REST_EXPORT Result FetchPlanningResultResponseFromJson( +ICEBERG_REST_EXPORT Result +FetchPlanningResultResponseFromJson( const nlohmann::json& json, - const std::unordered_map>& partition_specs_by_id, + const std::unordered_map>& + partition_specs_by_id, const Schema& schema); ICEBERG_REST_EXPORT Result FetchScanTasksResponseFromJson( const nlohmann::json& json, - const std::unordered_map>& partition_specs_by_id, + const std::unordered_map>& + partition_specs_by_id, const Schema& schema); ICEBERG_REST_EXPORT Result ToJson(const PlanTableScanRequest& request); diff --git a/src/iceberg/catalog/rest/resource_paths.cc b/src/iceberg/catalog/rest/resource_paths.cc index b6f8030c1..4e6660b57 100644 --- a/src/iceberg/catalog/rest/resource_paths.cc +++ b/src/iceberg/catalog/rest/resource_paths.cc @@ -109,7 +109,8 @@ Result ResourcePaths::ScanPlan(const TableIdentifier& ident) const encoded_namespace, encoded_table_name); } -Result ResourcePaths::ScanPlan(const TableIdentifier& ident, const std::string& plan_id) const { +Result ResourcePaths::ScanPlan(const TableIdentifier& ident, + const std::string& plan_id) const { ICEBERG_ASSIGN_OR_RAISE(std::string encoded_namespace, EncodeNamespace(ident.ns)); ICEBERG_ASSIGN_OR_RAISE(std::string encoded_table_name, EncodeString(ident.name)); return std::format("{}/v1/{}namespaces/{}/tables/{}/plan/{}", base_uri_, prefix_, diff --git a/src/iceberg/catalog/rest/rest_catalog.h b/src/iceberg/catalog/rest/rest_catalog.h index 014929d83..d4a86e1ec 100644 --- a/src/iceberg/catalog/rest/rest_catalog.h +++ b/src/iceberg/catalog/rest/rest_catalog.h @@ -109,8 +109,8 @@ class ICEBERG_REST_EXPORT RestCatalog : public Catalog, Result FetchPlanningResult( const Table& table, const std::string& plan_id) override; Status CancelPlanning(const Table& table, const std::string& plan_id) override; - Result FetchScanTasks( - const Table& table, const std::string& plan_task) override; + Result FetchScanTasks(const Table& table, + const std::string& plan_task) override; private: RestCatalog(RestCatalogProperties config, std::shared_ptr file_io, diff --git a/src/iceberg/json_serde_internal.h b/src/iceberg/json_serde_internal.h index 654c0f24c..410d92ae2 100644 --- a/src/iceberg/json_serde_internal.h +++ b/src/iceberg/json_serde_internal.h @@ -424,8 +424,10 @@ ICEBERG_EXPORT Result> TableRequirementFromJso /// Binary fields (lower-bounds, upper-bounds, key-metadata) are base64-encoded. /// /// \param json The JSON object representing a `DataFile`. -/// \param partitionSpecById Map from spec ID to PartitionSpec for type-aware partition parsing. -/// \param schema The table schema, used with partitionSpecById to resolve partition types. +/// \param partitionSpecById Map from spec ID to PartitionSpec for type-aware partition +/// parsing. +/// \param schema The table schema, used with partitionSpecById to resolve partition +/// types. /// \return A `DataFile` object or an error if the conversion fails. ICEBERG_EXPORT Result DataFileFromJson(const nlohmann::json& json); @@ -440,8 +442,10 @@ ICEBERG_EXPORT Result DataFileFromJson( /// /// \param json The JSON array of file scan task objects. /// \param delete_files Delete files indexed by the tasks' delete-file-references. -/// \param partitionSpecById Map from spec ID to PartitionSpec for type-aware partition parsing. -/// \param schema The table schema, used with partitionSpecById to resolve partition types. +/// \param partitionSpecById Map from spec ID to PartitionSpec for type-aware partition +/// parsing. +/// \param schema The table schema, used with partitionSpecById to resolve partition +/// types. /// \return A vector of `FileScanTask` objects or an error if the conversion fails. ICEBERG_EXPORT Result> FileScanTasksFromJson( const nlohmann::json& json, const std::vector& delete_files, diff --git a/src/iceberg/test/endpoint_test.cc b/src/iceberg/test/endpoint_test.cc index 460eea4d6..69cc56895 100644 --- a/src/iceberg/test/endpoint_test.cc +++ b/src/iceberg/test/endpoint_test.cc @@ -264,11 +264,11 @@ TEST(EndpointTest, FromStringInvalid) { TEST(EndpointTest, StringRoundTrip) { // Create various endpoints and verify they survive string round-trip std::vector endpoints = { - Endpoint::ListNamespaces(), Endpoint::GetNamespaceProperties(), - Endpoint::CreateNamespace(), Endpoint::LoadTable(), - Endpoint::CreateTable(), Endpoint::DeleteTable(), - Endpoint::PlanTableScan(), Endpoint::FetchPlanningResult(), - Endpoint::CancelPlanning(), Endpoint::FetchScanTasks(), + Endpoint::ListNamespaces(), Endpoint::GetNamespaceProperties(), + Endpoint::CreateNamespace(), Endpoint::LoadTable(), + Endpoint::CreateTable(), Endpoint::DeleteTable(), + Endpoint::PlanTableScan(), Endpoint::FetchPlanningResult(), + Endpoint::CancelPlanning(), Endpoint::FetchScanTasks(), }; for (const auto& original : endpoints) { From a95f3418d17bcc48d4baa1ec4ca979d379ef41ed Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sun, 5 Apr 2026 22:20:13 -0700 Subject: [PATCH 09/17] Fixes --- src/iceberg/catalog/rest/types.cc | 35 +++++++++++++++++++++++-------- src/iceberg/catalog/rest/types.h | 4 ++-- src/iceberg/json_serde.cc | 8 +++++-- 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/src/iceberg/catalog/rest/types.cc b/src/iceberg/catalog/rest/types.cc index a5e6b79a7..8dcfadd92 100644 --- a/src/iceberg/catalog/rest/types.cc +++ b/src/iceberg/catalog/rest/types.cc @@ -128,21 +128,38 @@ bool PlanTableScanRequest::operator==(const PlanTableScanRequest& other) const { } bool BaseScanTaskResponse::operator==(const BaseScanTaskResponse& other) const { - if (plan_tasks != other.plan_tasks) return false; - if (delete_files != other.delete_files) return false; - if (file_scan_tasks.size() != other.file_scan_tasks.size()) return false; + if (plan_tasks != other.plan_tasks) { + return false; + } + if (delete_files != other.delete_files) { + return false; + } + if (file_scan_tasks.size() != other.file_scan_tasks.size()) { + return false; + } for (size_t i = 0; i < file_scan_tasks.size(); ++i) { const auto& a = file_scan_tasks[i]; const auto& b = other.file_scan_tasks[i]; - if (!a.data_file() != !b.data_file()) return false; - if (a.data_file() && *a.data_file() != *b.data_file()) return false; - if (a.delete_files().size() != b.delete_files().size()) return false; + if (!a.data_file() != !b.data_file()) { + return false; + } + if (a.data_file() && *a.data_file() != *b.data_file()) { + return false; + } + if (a.delete_files().size() != b.delete_files().size()) { + return false; + } for (size_t j = 0; j < a.delete_files().size(); ++j) { - if (!a.delete_files()[j] != !b.delete_files()[j]) return false; - if (a.delete_files()[j] && *a.delete_files()[j] != *b.delete_files()[j]) + if (!a.delete_files()[j] != !b.delete_files()[j]) { return false; + } + if (a.delete_files()[j] && *a.delete_files()[j] != *b.delete_files()[j]) { + return false; + } + } + if (a.residual_filter() != b.residual_filter()) { + return false; } - if (a.residual_filter() != b.residual_filter()) return false; } return true; } diff --git a/src/iceberg/catalog/rest/types.h b/src/iceberg/catalog/rest/types.h index 2af2e0d0d..e49791b77 100644 --- a/src/iceberg/catalog/rest/types.h +++ b/src/iceberg/catalog/rest/types.h @@ -333,7 +333,7 @@ struct ICEBERG_REST_EXPORT BaseScanTaskResponse { struct ICEBERG_REST_EXPORT PlanTableScanResponse : BaseScanTaskResponse { std::string plan_status; std::string plan_id; - // TODO: Add credentials. + // TODO(sandeepg): Add credentials. Status Validate() const; @@ -344,7 +344,7 @@ struct ICEBERG_REST_EXPORT PlanTableScanResponse : BaseScanTaskResponse { /// available scan tasks. struct ICEBERG_REST_EXPORT FetchPlanningResultResponse : BaseScanTaskResponse { std::string plan_status; - // TODO: Add credentials. + // TODO(sandeepg): Add credentials. Status Validate() const; diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 72cad1024..f7b782b49 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -1814,7 +1814,9 @@ Result DataFileFromJson( // Parse CountMap: {"keys": [int, ...], "values": [long, ...]} auto parse_int_map = [&](std::string_view key, std::map& target) -> Status { - if (!json.contains(key) || json.at(key).is_null()) return {}; + if (!json.contains(key) || json.at(key).is_null()) { + return {}; + } ICEBERG_ASSIGN_OR_RAISE(auto map_json, GetJsonValue(json, key)); ICEBERG_ASSIGN_OR_RAISE(auto keys, GetTypedJsonValue>(map_json.at("keys"))); @@ -1837,7 +1839,9 @@ Result DataFileFromJson( // Parse BinaryMap: {"keys": [int, ...], "values": [...]} auto parse_binary_map = [&](std::string_view key, std::map>& target) -> Status { - if (!json.contains(key) || json.at(key).is_null()) return {}; + if (!json.contains(key) || json.at(key).is_null()) { + return {}; + } ICEBERG_ASSIGN_OR_RAISE(auto map_json, GetJsonValue(json, key)); ICEBERG_ASSIGN_OR_RAISE(auto keys, GetJsonValue>(map_json, "keys")); From 86714b90c694ef16fca3562c281135fac9dfa639 Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sat, 2 May 2026 22:06:05 -0700 Subject: [PATCH 10/17] Address comments --- src/iceberg/catalog/rest/error_handlers.cc | 20 +++++++ src/iceberg/catalog/rest/error_handlers.h | 11 ++++ src/iceberg/catalog/rest/json_serde.cc | 66 +++++++++++----------- src/iceberg/catalog/rest/resource_paths.cc | 3 +- src/iceberg/catalog/rest/rest_catalog.cc | 2 +- 5 files changed, 67 insertions(+), 35 deletions(-) diff --git a/src/iceberg/catalog/rest/error_handlers.cc b/src/iceberg/catalog/rest/error_handlers.cc index f1aa6d4cf..11661ce63 100644 --- a/src/iceberg/catalog/rest/error_handlers.cc +++ b/src/iceberg/catalog/rest/error_handlers.cc @@ -214,4 +214,24 @@ Status ScanPlanErrorHandler::Accept(const ErrorResponse& error) const { return DefaultErrorHandler::Accept(error); } +const std::shared_ptr& PlanTaskErrorHandler::Instance() { + static const std::shared_ptr instance{new PlanTaskErrorHandler()}; + return instance; +} + +Status PlanTaskErrorHandler::Accept(const ErrorResponse& error) const { + switch (error.code) { + case 404: + if (error.type == kNoSuchNamespaceException) { + return NoSuchNamespace(error.message); + } + if (error.type == kNoSuchTableException) { + return NoSuchTable(error.message); + } + return NoSuchPlanTask(error.message); + } + + return DefaultErrorHandler::Accept(error); +} + } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/error_handlers.h b/src/iceberg/catalog/rest/error_handlers.h index 7c0823384..85899b7fc 100644 --- a/src/iceberg/catalog/rest/error_handlers.h +++ b/src/iceberg/catalog/rest/error_handlers.h @@ -138,4 +138,15 @@ class ICEBERG_REST_EXPORT ScanPlanErrorHandler final : public DefaultErrorHandle constexpr ScanPlanErrorHandler() = default; }; +/// \brief Fetch scan tasks operation error handler. +class ICEBERG_REST_EXPORT PlanTaskErrorHandler final : public DefaultErrorHandler { + public: + static const std::shared_ptr& Instance(); + + Status Accept(const ErrorResponse& error) const override; + + private: + constexpr PlanTaskErrorHandler() = default; +}; + } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index c2d4ccf4b..9960001df 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -93,9 +93,40 @@ constexpr std::string_view kUseSnapshotSchema = "use-snapshot-schema"; constexpr std::string_view kStartSnapshotId = "start-snapshot-id"; constexpr std::string_view kEndSnapshotId = "end-snapshot-id"; constexpr std::string_view kStatsFields = "stats-fields"; -constexpr std::string_view kMinRowsRequired = "min-rows-required"; +constexpr std::string_view kMinRowsRequested = "min-rows-requested"; constexpr std::string_view kPlanTask = "plan-task"; +Status BaseScanTaskResponseFromJson( + const nlohmann::json& json, BaseScanTaskResponse* response, + const std::unordered_map>& + partition_specs_by_id, + const Schema& schema) { + // 1. plan_tasks + ICEBERG_ASSIGN_OR_RAISE( + response->plan_tasks, + GetJsonValueOrDefault>(json, kPlanTasks)); + + // 2. delete_files + ICEBERG_ASSIGN_OR_RAISE( + auto delete_files_json, + GetJsonValueOrDefault(json, kDeleteFiles, nlohmann::json::array())); + for (const auto& entry_json : delete_files_json) { + ICEBERG_ASSIGN_OR_RAISE(auto delete_file, + DataFileFromJson(entry_json, partition_specs_by_id, schema)); + response->delete_files.push_back(std::move(delete_file)); + } + + // 3. file_scan_tasks + ICEBERG_ASSIGN_OR_RAISE(auto file_scan_tasks_json, + GetJsonValueOrDefault(json, kFileScanTasks, + nlohmann::json::array())); + ICEBERG_ASSIGN_OR_RAISE( + response->file_scan_tasks, + FileScanTasksFromJson(file_scan_tasks_json, response->delete_files, + partition_specs_by_id, schema)); + return {}; +} + } // namespace nlohmann::json ToJson(const CatalogConfig& config) { @@ -547,7 +578,7 @@ Result ToJson(const PlanTableScanRequest& request) { json[kStatsFields] = request.statsFields; } if (request.min_rows_required.has_value()) { - json[kMinRowsRequired] = request.min_rows_required.value(); + json[kMinRowsRequested] = request.min_rows_required.value(); } return json; } @@ -558,37 +589,6 @@ nlohmann::json ToJson(const FetchScanTasksRequest& request) { return json; } -Status BaseScanTaskResponseFromJson( - const nlohmann::json& json, BaseScanTaskResponse* response, - const std::unordered_map>& - partition_specs_by_id, - const Schema& schema) { - // 1. plan_tasks - ICEBERG_ASSIGN_OR_RAISE( - response->plan_tasks, - GetJsonValueOrDefault>(json, kPlanTasks)); - - // 2. delete_files - ICEBERG_ASSIGN_OR_RAISE( - auto delete_files_json, - GetJsonValueOrDefault(json, kDeleteFiles, nlohmann::json::array())); - for (const auto& entry_json : delete_files_json) { - ICEBERG_ASSIGN_OR_RAISE(auto delete_file, - DataFileFromJson(entry_json, partition_specs_by_id, schema)); - response->delete_files.push_back(std::move(delete_file)); - } - - // 3. file_scan_tasks - ICEBERG_ASSIGN_OR_RAISE(auto file_scan_tasks_json, - GetJsonValueOrDefault(json, kFileScanTasks, - nlohmann::json::array())); - ICEBERG_ASSIGN_OR_RAISE( - response->file_scan_tasks, - FileScanTasksFromJson(file_scan_tasks_json, response->delete_files, - partition_specs_by_id, schema)); - return {}; -} - Result PlanTableScanResponseFromJson( const nlohmann::json& json, const std::unordered_map>& diff --git a/src/iceberg/catalog/rest/resource_paths.cc b/src/iceberg/catalog/rest/resource_paths.cc index 4e6660b57..ab5daed02 100644 --- a/src/iceberg/catalog/rest/resource_paths.cc +++ b/src/iceberg/catalog/rest/resource_paths.cc @@ -113,8 +113,9 @@ Result ResourcePaths::ScanPlan(const TableIdentifier& ident, const std::string& plan_id) const { ICEBERG_ASSIGN_OR_RAISE(std::string encoded_namespace, EncodeNamespace(ident.ns)); ICEBERG_ASSIGN_OR_RAISE(std::string encoded_table_name, EncodeString(ident.name)); + ICEBERG_ASSIGN_OR_RAISE(std::string encoded_plan_id, EncodeString(plan_id)); return std::format("{}/v1/{}namespaces/{}/tables/{}/plan/{}", base_uri_, prefix_, - encoded_namespace, encoded_table_name, plan_id); + encoded_namespace, encoded_table_name, encoded_plan_id); } Result ResourcePaths::ScanTask(const TableIdentifier& ident) const { diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index 38738024d..8966da1e9 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -572,7 +572,7 @@ Result RestCatalog::FetchScanTasks(const Table& table, ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(ToJson(request))); ICEBERG_ASSIGN_OR_RAISE( const auto response, - client_->Post(path, json_request, /*headers=*/{}, *ScanPlanErrorHandler::Instance(), + client_->Post(path, json_request, /*headers=*/{}, *PlanTaskErrorHandler::Instance(), *catalog_session_)); ICEBERG_ASSIGN_OR_RAISE(auto json, FromJsonString(response.body())); From 67a5602ed705a3760810dfe858b2f6ab08ac00b8 Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Wed, 6 May 2026 23:27:11 -0700 Subject: [PATCH 11/17] refactor(rest): clean up scan plan API naming and structure - Rename ScanPlanErrorHandler to PlanErrorHandler; add PlanTaskErrorHandler for FetchScanTasks with correct 404 handling per spec - Merge ScanPlan overloads into Plan(ident, optional plan_id) - Rename ResourcePaths::ScanTask to FetchScanTasks - Rename statsFields -> stats_fields, min_rows_required -> min_rows_requested - Encode plan_id in URL path - Move BaseScanTaskResponseFromJson into anonymous namespace - Fix kMinRowsRequired wire value to "min-rows-requested" --- src/iceberg/catalog/rest/error_handlers.cc | 14 +++++++------- src/iceberg/catalog/rest/error_handlers.h | 8 ++++---- src/iceberg/catalog/rest/json_serde.cc | 8 ++++---- src/iceberg/catalog/rest/resource_paths.cc | 20 +++++++++----------- src/iceberg/catalog/rest/resource_paths.h | 13 +++++-------- src/iceberg/catalog/rest/rest_catalog.cc | 16 ++++++++-------- src/iceberg/catalog/rest/types.cc | 6 +++--- src/iceberg/catalog/rest/types.h | 4 ++-- 8 files changed, 42 insertions(+), 47 deletions(-) diff --git a/src/iceberg/catalog/rest/error_handlers.cc b/src/iceberg/catalog/rest/error_handlers.cc index 11661ce63..67146e745 100644 --- a/src/iceberg/catalog/rest/error_handlers.cc +++ b/src/iceberg/catalog/rest/error_handlers.cc @@ -186,12 +186,12 @@ Status ViewCommitErrorHandler::Accept(const ErrorResponse& error) const { return DefaultErrorHandler::Accept(error); } -const std::shared_ptr& ScanPlanErrorHandler::Instance() { - static const std::shared_ptr instance{new ScanPlanErrorHandler()}; +const std::shared_ptr& PlanErrorHandler::Instance() { + static const std::shared_ptr instance{new PlanErrorHandler()}; return instance; } -Status ScanPlanErrorHandler::Accept(const ErrorResponse& error) const { +Status PlanErrorHandler::Accept(const ErrorResponse& error) const { switch (error.code) { case 404: if (error.type == kNoSuchNamespaceException) { @@ -203,9 +203,6 @@ Status ScanPlanErrorHandler::Accept(const ErrorResponse& error) const { if (error.type == kNoSuchPlanIdException) { return NoSuchPlanId(error.message); } - if (error.type == kNoSuchPlanTaskException) { - return NoSuchPlanTask(error.message); - } return NotFound(error.message); case 406: return NotSupported(error.message); @@ -228,7 +225,10 @@ Status PlanTaskErrorHandler::Accept(const ErrorResponse& error) const { if (error.type == kNoSuchTableException) { return NoSuchTable(error.message); } - return NoSuchPlanTask(error.message); + if (error.type == kNoSuchPlanTaskException) { + return NoSuchPlanTask(error.message); + } + return NotFound(error.message); } return DefaultErrorHandler::Accept(error); diff --git a/src/iceberg/catalog/rest/error_handlers.h b/src/iceberg/catalog/rest/error_handlers.h index 85899b7fc..ee338fb3e 100644 --- a/src/iceberg/catalog/rest/error_handlers.h +++ b/src/iceberg/catalog/rest/error_handlers.h @@ -127,15 +127,15 @@ class ICEBERG_REST_EXPORT ViewCommitErrorHandler final : public DefaultErrorHand constexpr ViewCommitErrorHandler() = default; }; -/// \brief Scan plan operation error handler. -class ICEBERG_REST_EXPORT ScanPlanErrorHandler final : public DefaultErrorHandler { +/// \brief Plan operation error handler. +class ICEBERG_REST_EXPORT PlanErrorHandler final : public DefaultErrorHandler { public: - static const std::shared_ptr& Instance(); + static const std::shared_ptr& Instance(); Status Accept(const ErrorResponse& error) const override; private: - constexpr ScanPlanErrorHandler() = default; + constexpr PlanErrorHandler() = default; }; /// \brief Fetch scan tasks operation error handler. diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index 9960001df..649c1e81e 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -574,11 +574,11 @@ Result ToJson(const PlanTableScanRequest& request) { if (request.end_snapshot_id.has_value()) { json[kEndSnapshotId] = request.end_snapshot_id.value(); } - if (!request.statsFields.empty()) { - json[kStatsFields] = request.statsFields; + if (!request.stats_fields.empty()) { + json[kStatsFields] = request.stats_fields; } - if (request.min_rows_required.has_value()) { - json[kMinRowsRequested] = request.min_rows_required.value(); + if (request.min_rows_requested.has_value()) { + json[kMinRowsRequested] = request.min_rows_requested.value(); } return json; } diff --git a/src/iceberg/catalog/rest/resource_paths.cc b/src/iceberg/catalog/rest/resource_paths.cc index ab5daed02..ac8c52255 100644 --- a/src/iceberg/catalog/rest/resource_paths.cc +++ b/src/iceberg/catalog/rest/resource_paths.cc @@ -20,6 +20,7 @@ #include "iceberg/catalog/rest/resource_paths.h" #include +#include #include "iceberg/catalog/rest/rest_util.h" #include "iceberg/table_identifier.h" @@ -102,23 +103,20 @@ Result ResourcePaths::CommitTransaction() const { return std::format("{}/v1/{}transactions/commit", base_uri_, prefix_); } -Result ResourcePaths::ScanPlan(const TableIdentifier& ident) const { +Result ResourcePaths::Plan(const TableIdentifier& ident, + std::optional plan_id) const { ICEBERG_ASSIGN_OR_RAISE(std::string encoded_namespace, EncodeNamespace(ident.ns)); ICEBERG_ASSIGN_OR_RAISE(std::string encoded_table_name, EncodeString(ident.name)); + if (plan_id.has_value()) { + ICEBERG_ASSIGN_OR_RAISE(std::string encoded_plan_id, EncodeString(plan_id.value())); + return std::format("{}/v1/{}namespaces/{}/tables/{}/plan/{}", base_uri_, prefix_, + encoded_namespace, encoded_table_name, encoded_plan_id); + } return std::format("{}/v1/{}namespaces/{}/tables/{}/plan", base_uri_, prefix_, encoded_namespace, encoded_table_name); } -Result ResourcePaths::ScanPlan(const TableIdentifier& ident, - const std::string& plan_id) const { - ICEBERG_ASSIGN_OR_RAISE(std::string encoded_namespace, EncodeNamespace(ident.ns)); - ICEBERG_ASSIGN_OR_RAISE(std::string encoded_table_name, EncodeString(ident.name)); - ICEBERG_ASSIGN_OR_RAISE(std::string encoded_plan_id, EncodeString(plan_id)); - return std::format("{}/v1/{}namespaces/{}/tables/{}/plan/{}", base_uri_, prefix_, - encoded_namespace, encoded_table_name, encoded_plan_id); -} - -Result ResourcePaths::ScanTask(const TableIdentifier& ident) const { +Result ResourcePaths::FetchScanTasks(const TableIdentifier& ident) const { ICEBERG_ASSIGN_OR_RAISE(std::string encoded_namespace, EncodeNamespace(ident.ns)); ICEBERG_ASSIGN_OR_RAISE(std::string encoded_table_name, EncodeString(ident.name)); return std::format("{}/v1/{}namespaces/{}/tables/{}/tasks", base_uri_, prefix_, diff --git a/src/iceberg/catalog/rest/resource_paths.h b/src/iceberg/catalog/rest/resource_paths.h index 15afb35dc..86cd7d5d3 100644 --- a/src/iceberg/catalog/rest/resource_paths.h +++ b/src/iceberg/catalog/rest/resource_paths.h @@ -20,6 +20,7 @@ #pragma once #include +#include #include #include "iceberg/catalog/rest/iceberg_rest_export.h" @@ -82,17 +83,13 @@ class ICEBERG_REST_EXPORT ResourcePaths { Result CommitTransaction() const; /// \brief Get the /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan endpoint - /// path. - Result ScanPlan(const TableIdentifier& ident) const; - - /// \brief Get the /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan_id} - /// endpoint path. - Result ScanPlan(const TableIdentifier& ident, - const std::string& plan_id) const; + /// path, or /plan/{plan_id} if plan_id is provided. + Result Plan(const TableIdentifier& ident, + std::optional plan_id = std::nullopt) const; /// \brief Get the /v1/{prefix}/namespaces/{namespace}/tables/{table}/tasks endpoint /// path. - Result ScanTask(const TableIdentifier& ident) const; + Result FetchScanTasks(const TableIdentifier& ident) const; private: ResourcePaths(std::string base_uri, const std::string& prefix); diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index 8966da1e9..4500d4952 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -501,7 +501,7 @@ Result> RestCatalog::RegisterTable( Result RestCatalog::PlanTableScan( const Table& table, const internal::TableScanContext& context) { ICEBERG_ENDPOINT_CHECK(supported_endpoints_, Endpoint::PlanTableScan()); - ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->ScanPlan(table.name())); + ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->Plan(table.name())); ICEBERG_ASSIGN_OR_RAISE(auto schema_ptr, table.schema()); ICEBERG_ASSIGN_OR_RAISE(auto specs_ref, table.specs()); @@ -519,7 +519,7 @@ Result RestCatalog::PlanTableScan( request.use_snapshot_schema = true; } if (context.min_rows_requested.has_value()) { - request.min_rows_required = context.min_rows_requested.value(); + request.min_rows_requested = context.min_rows_requested.value(); } ICEBERG_RETURN_UNEXPECTED(request.Validate()); @@ -527,7 +527,7 @@ Result RestCatalog::PlanTableScan( ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(request_json)); ICEBERG_ASSIGN_OR_RAISE( const auto response, - client_->Post(path, json_request, /*headers=*/{}, *ScanPlanErrorHandler::Instance(), + client_->Post(path, json_request, /*headers=*/{}, *PlanErrorHandler::Instance(), *catalog_session_)); ICEBERG_ASSIGN_OR_RAISE(auto json, FromJsonString(response.body())); @@ -537,13 +537,13 @@ Result RestCatalog::PlanTableScan( Result RestCatalog::FetchPlanningResult( const Table& table, const std::string& plan_id) { ICEBERG_ENDPOINT_CHECK(supported_endpoints_, Endpoint::FetchPlanningResult()); - ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->ScanPlan(table.name(), plan_id)); + ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->Plan(table.name(), plan_id)); ICEBERG_ASSIGN_OR_RAISE(auto schema_ptr, table.schema()); ICEBERG_ASSIGN_OR_RAISE(auto specs_ref, table.specs()); ICEBERG_ASSIGN_OR_RAISE( const auto response, - client_->Get(path, /*params=*/{}, /*headers=*/{}, *ScanPlanErrorHandler::Instance(), + client_->Get(path, /*params=*/{}, /*headers=*/{}, *PlanErrorHandler::Instance(), *catalog_session_)); ICEBERG_ASSIGN_OR_RAISE(auto json, FromJsonString(response.body())); return FetchPlanningResultResponseFromJson(json, specs_ref.get(), *schema_ptr); @@ -551,19 +551,19 @@ Result RestCatalog::FetchPlanningResult( Status RestCatalog::CancelPlanning(const Table& table, const std::string& plan_id) { ICEBERG_ENDPOINT_CHECK(supported_endpoints_, Endpoint::CancelPlanning()); - ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->ScanPlan(table.name(), plan_id)); + ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->Plan(table.name(), plan_id)); ICEBERG_ASSIGN_OR_RAISE( const auto response, client_->Delete(path, /*params=*/{}, /*headers=*/{}, - *ScanPlanErrorHandler::Instance(), *catalog_session_)); + *PlanErrorHandler::Instance(), *catalog_session_)); return {}; } Result RestCatalog::FetchScanTasks(const Table& table, const std::string& plan_task) { ICEBERG_ENDPOINT_CHECK(supported_endpoints_, Endpoint::FetchScanTasks()); - ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->ScanTask(table.name())); + ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->FetchScanTasks(table.name())); ICEBERG_ASSIGN_OR_RAISE(auto schema_ptr, table.schema()); ICEBERG_ASSIGN_OR_RAISE(auto specs_ref, table.specs()); diff --git a/src/iceberg/catalog/rest/types.cc b/src/iceberg/catalog/rest/types.cc index 8dcfadd92..e394999da 100644 --- a/src/iceberg/catalog/rest/types.cc +++ b/src/iceberg/catalog/rest/types.cc @@ -123,8 +123,8 @@ bool PlanTableScanRequest::operator==(const PlanTableScanRequest& other) const { filter == other.filter && case_sensitive == other.case_sensitive && use_snapshot_schema == other.use_snapshot_schema && start_snapshot_id == other.start_snapshot_id && - end_snapshot_id == other.end_snapshot_id && statsFields == other.statsFields && - min_rows_required == other.min_rows_required; + end_snapshot_id == other.end_snapshot_id && stats_fields == other.stats_fields && + min_rows_requested == other.min_rows_requested; } bool BaseScanTaskResponse::operator==(const BaseScanTaskResponse& other) const { @@ -213,7 +213,7 @@ Status PlanTableScanRequest::Validate() const { "Invalid incremental scan: startSnapshotId and endSnapshotId is required"); } } - if (min_rows_required.has_value() && min_rows_required.value() < 0) { + if (min_rows_requested.has_value() && min_rows_requested.value() < 0) { return ValidationFailed("Invalid scan: minRowsRequested is negative"); } return {}; diff --git a/src/iceberg/catalog/rest/types.h b/src/iceberg/catalog/rest/types.h index e49791b77..4e0b6e16c 100644 --- a/src/iceberg/catalog/rest/types.h +++ b/src/iceberg/catalog/rest/types.h @@ -307,8 +307,8 @@ struct ICEBERG_REST_EXPORT PlanTableScanRequest { bool use_snapshot_schema = false; std::optional start_snapshot_id; std::optional end_snapshot_id; - std::vector statsFields; - std::optional min_rows_required; + std::vector stats_fields; + std::optional min_rows_requested; Status Validate() const; From a970e07cebd93da615b97ed895854a8b91f0444c Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sun, 17 May 2026 14:20:31 -0700 Subject: [PATCH 12/17] refactor(rest): move scan plan catalog impl out of this branch Remove PlanTableScan, FetchPlanningResult, CancelPlanning, and FetchScanTasks from Catalog, RestCatalog, and their tests. This branch now contains only the REST models (types.h), ser/de, and error handlers for scan planning. The catalog-layer wiring will live in a separate branch. --- src/iceberg/catalog.h | 40 --------- src/iceberg/catalog/rest/rest_catalog.cc | 84 ------------------- src/iceberg/catalog/rest/rest_catalog.h | 8 -- src/iceberg/test/endpoint_test.cc | 27 ------ .../test/rest_catalog_integration_test.cc | 78 ----------------- 5 files changed, 237 deletions(-) diff --git a/src/iceberg/catalog.h b/src/iceberg/catalog.h index 18e76e4f5..81c9ddd11 100644 --- a/src/iceberg/catalog.h +++ b/src/iceberg/catalog.h @@ -26,10 +26,8 @@ #include #include -#include "iceberg/catalog/rest/types.h" #include "iceberg/result.h" #include "iceberg/table_identifier.h" -#include "iceberg/table_scan.h" #include "iceberg/type_fwd.h" namespace iceberg { @@ -190,44 +188,6 @@ class ICEBERG_EXPORT Catalog { /// \return a Table instance or ErrorKind::kAlreadyExists if the table already exists virtual Result> RegisterTable( const TableIdentifier& identifier, const std::string& metadata_file_location) = 0; - - /// \brief Initiate a scan planning operation for the given table. - /// - /// \param table The table to scan. - /// \param context The scan context containing snapshot, filter, and other options. - /// \return A PlanTableScanResponse with the plan status and initial scan tasks. - virtual Result PlanTableScan( - const Table& table, const internal::TableScanContext& context) { - return NotImplemented("PlanTableScan is not supported by this catalog"); - } - - /// \brief Fetch the current status and results of an asynchronous scan plan. - /// - /// \param table The table being scanned. - /// \param plan_id The plan ID returned by PlanTableScan. - /// \return A FetchPlanningResultResponse with the current plan status and tasks. - virtual Result FetchPlanningResult( - const Table& table, const std::string& plan_id) { - return NotImplemented("FetchPlanningResult is not supported by this catalog"); - } - - /// \brief Cancel an in-progress scan planning operation. - /// - /// \param table The table being scanned. - /// \param plan_id The plan ID returned by PlanTableScan. - virtual Status CancelPlanning(const Table& table, const std::string& plan_id) { - return NotImplemented("CancelPlanning is not supported by this catalog"); - } - - /// \brief Fetch the scan tasks for a given plan task token. - /// - /// \param table The table being scanned. - /// \param plan_task The plan task token returned in a scan plan response. - /// \return A FetchScanTasksResponse with the file scan tasks. - virtual Result FetchScanTasks( - const Table& table, const std::string& plan_task) { - return NotImplemented("FetchScanTasks is not supported by this catalog"); - } }; } // namespace iceberg diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index 4500d4952..40e112db7 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -43,7 +43,6 @@ #include "iceberg/sort_order.h" #include "iceberg/table.h" #include "iceberg/table_requirement.h" -#include "iceberg/table_scan.h" #include "iceberg/table_update.h" #include "iceberg/transaction.h" #include "iceberg/util/macros.h" @@ -63,8 +62,6 @@ std::unordered_set GetDefaultEndpoints() { Endpoint::UpdateTable(), Endpoint::DeleteTable(), Endpoint::RenameTable(), Endpoint::RegisterTable(), Endpoint::ReportMetrics(), Endpoint::CommitTransaction(), - Endpoint::PlanTableScan(), Endpoint::FetchPlanningResult(), - Endpoint::CancelPlanning(), Endpoint::FetchScanTasks(), }; } @@ -498,85 +495,4 @@ Result> RestCatalog::RegisterTable( shared_from_this()); } -Result RestCatalog::PlanTableScan( - const Table& table, const internal::TableScanContext& context) { - ICEBERG_ENDPOINT_CHECK(supported_endpoints_, Endpoint::PlanTableScan()); - ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->Plan(table.name())); - ICEBERG_ASSIGN_OR_RAISE(auto schema_ptr, table.schema()); - ICEBERG_ASSIGN_OR_RAISE(auto specs_ref, table.specs()); - - PlanTableScanRequest request; - if (context.snapshot_id.has_value()) { - request.snapshot_id = context.snapshot_id; - } - request.select = context.selected_columns; - request.filter = context.filter; - request.case_sensitive = context.case_sensitive; - request.use_snapshot_schema = false; - if (context.from_snapshot_id.has_value() && context.to_snapshot_id.has_value()) { - request.start_snapshot_id = context.from_snapshot_id.value(); - request.end_snapshot_id = context.to_snapshot_id.value(); - request.use_snapshot_schema = true; - } - if (context.min_rows_requested.has_value()) { - request.min_rows_requested = context.min_rows_requested.value(); - } - - ICEBERG_RETURN_UNEXPECTED(request.Validate()); - ICEBERG_ASSIGN_OR_RAISE(auto request_json, ToJson(request)); - ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(request_json)); - ICEBERG_ASSIGN_OR_RAISE( - const auto response, - client_->Post(path, json_request, /*headers=*/{}, *PlanErrorHandler::Instance(), - *catalog_session_)); - - ICEBERG_ASSIGN_OR_RAISE(auto json, FromJsonString(response.body())); - return PlanTableScanResponseFromJson(json, specs_ref.get(), *schema_ptr); -} - -Result RestCatalog::FetchPlanningResult( - const Table& table, const std::string& plan_id) { - ICEBERG_ENDPOINT_CHECK(supported_endpoints_, Endpoint::FetchPlanningResult()); - ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->Plan(table.name(), plan_id)); - ICEBERG_ASSIGN_OR_RAISE(auto schema_ptr, table.schema()); - ICEBERG_ASSIGN_OR_RAISE(auto specs_ref, table.specs()); - - ICEBERG_ASSIGN_OR_RAISE( - const auto response, - client_->Get(path, /*params=*/{}, /*headers=*/{}, *PlanErrorHandler::Instance(), - *catalog_session_)); - ICEBERG_ASSIGN_OR_RAISE(auto json, FromJsonString(response.body())); - return FetchPlanningResultResponseFromJson(json, specs_ref.get(), *schema_ptr); -} - -Status RestCatalog::CancelPlanning(const Table& table, const std::string& plan_id) { - ICEBERG_ENDPOINT_CHECK(supported_endpoints_, Endpoint::CancelPlanning()); - ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->Plan(table.name(), plan_id)); - - ICEBERG_ASSIGN_OR_RAISE( - const auto response, - client_->Delete(path, /*params=*/{}, /*headers=*/{}, - *PlanErrorHandler::Instance(), *catalog_session_)); - return {}; -} - -Result RestCatalog::FetchScanTasks(const Table& table, - const std::string& plan_task) { - ICEBERG_ENDPOINT_CHECK(supported_endpoints_, Endpoint::FetchScanTasks()); - ICEBERG_ASSIGN_OR_RAISE(auto path, paths_->FetchScanTasks(table.name())); - ICEBERG_ASSIGN_OR_RAISE(auto schema_ptr, table.schema()); - ICEBERG_ASSIGN_OR_RAISE(auto specs_ref, table.specs()); - - FetchScanTasksRequest request{.planTask = plan_task}; - ICEBERG_RETURN_UNEXPECTED(request.Validate()); - ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(ToJson(request))); - ICEBERG_ASSIGN_OR_RAISE( - const auto response, - client_->Post(path, json_request, /*headers=*/{}, *PlanTaskErrorHandler::Instance(), - *catalog_session_)); - - ICEBERG_ASSIGN_OR_RAISE(auto json, FromJsonString(response.body())); - return FetchScanTasksResponseFromJson(json, specs_ref.get(), *schema_ptr); -} - } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/rest_catalog.h b/src/iceberg/catalog/rest/rest_catalog.h index d4a86e1ec..38230a5e2 100644 --- a/src/iceberg/catalog/rest/rest_catalog.h +++ b/src/iceberg/catalog/rest/rest_catalog.h @@ -104,14 +104,6 @@ class ICEBERG_REST_EXPORT RestCatalog : public Catalog, const TableIdentifier& identifier, const std::string& metadata_file_location) override; - Result PlanTableScan( - const Table& table, const internal::TableScanContext& context) override; - Result FetchPlanningResult( - const Table& table, const std::string& plan_id) override; - Status CancelPlanning(const Table& table, const std::string& plan_id) override; - Result FetchScanTasks(const Table& table, - const std::string& plan_task) override; - private: RestCatalog(RestCatalogProperties config, std::shared_ptr file_io, std::unique_ptr client, std::unique_ptr paths, diff --git a/src/iceberg/test/endpoint_test.cc b/src/iceberg/test/endpoint_test.cc index 69cc56895..fcdc92a78 100644 --- a/src/iceberg/test/endpoint_test.cc +++ b/src/iceberg/test/endpoint_test.cc @@ -145,31 +145,6 @@ TEST(EndpointTest, TransactionEndpoints) { EXPECT_EQ(commit_transaction.path(), "/v1/{prefix}/transactions/commit"); } -// Test predefined scan planning endpoints -TEST(EndpointTest, ScanPlanEndpoints) { - auto plan_table_scan = Endpoint::PlanTableScan(); - EXPECT_EQ(plan_table_scan.method(), HttpMethod::kPost); - EXPECT_EQ(plan_table_scan.path(), - "/v1/{prefix}/namespaces/{namespace}/tables/{table}/plan"); - EXPECT_EQ(plan_table_scan.ToString(), - "POST /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan"); - - auto fetch_planning_result = Endpoint::FetchPlanningResult(); - EXPECT_EQ(fetch_planning_result.method(), HttpMethod::kGet); - EXPECT_EQ(fetch_planning_result.path(), - "/v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan-id}"); - - auto cancel_planning = Endpoint::CancelPlanning(); - EXPECT_EQ(cancel_planning.method(), HttpMethod::kDelete); - EXPECT_EQ(cancel_planning.path(), - "/v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan-id}"); - - auto fetch_scan_tasks = Endpoint::FetchScanTasks(); - EXPECT_EQ(fetch_scan_tasks.method(), HttpMethod::kPost); - EXPECT_EQ(fetch_scan_tasks.path(), - "/v1/{prefix}/namespaces/{namespace}/tables/{table}/tasks"); -} - // Test endpoint equality TEST(EndpointTest, Equality) { auto endpoint1 = Endpoint::Make(HttpMethod::kGet, "/path"); @@ -267,8 +242,6 @@ TEST(EndpointTest, StringRoundTrip) { Endpoint::ListNamespaces(), Endpoint::GetNamespaceProperties(), Endpoint::CreateNamespace(), Endpoint::LoadTable(), Endpoint::CreateTable(), Endpoint::DeleteTable(), - Endpoint::PlanTableScan(), Endpoint::FetchPlanningResult(), - Endpoint::CancelPlanning(), Endpoint::FetchScanTasks(), }; for (const auto& original : endpoints) { diff --git a/src/iceberg/test/rest_catalog_integration_test.cc b/src/iceberg/test/rest_catalog_integration_test.cc index 7f03932c1..b364ffd36 100644 --- a/src/iceberg/test/rest_catalog_integration_test.cc +++ b/src/iceberg/test/rest_catalog_integration_test.cc @@ -46,7 +46,6 @@ #include "iceberg/table.h" #include "iceberg/table_identifier.h" #include "iceberg/table_requirement.h" -#include "iceberg/table_scan.h" #include "iceberg/table_update.h" #include "iceberg/test/matchers.h" #include "iceberg/test/std_io.h" @@ -449,83 +448,6 @@ TEST_F(RestCatalogIntegrationTest, StageCreateTable) { EXPECT_EQ(committed->metadata()->properties.configs().at("key1"), "value1"); } -// -- Scan plan operations -- - -TEST_F(RestCatalogIntegrationTest, PlanTableScan) { - Namespace ns{.levels = {"test_plan_table_scan"}}; - ICEBERG_UNWRAP_OR_FAIL(auto catalog, CreateCatalogAndNamespace(ns)); - TableIdentifier table_id{.ns = ns, .name = "scan_table"}; - ASSERT_THAT(CreateDefaultTable(catalog, table_id), IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto table, catalog->LoadTable(table_id)); - - internal::TableScanContext context; - ICEBERG_UNWRAP_OR_FAIL(auto response, catalog->PlanTableScan(*table, context)); - // Empty table: no file scan tasks returned - EXPECT_TRUE(response.file_scan_tasks.empty()); -} - -TEST_F(RestCatalogIntegrationTest, PlanTableScanWithContext) { - Namespace ns{.levels = {"test_plan_scan_context"}}; - ICEBERG_UNWRAP_OR_FAIL(auto catalog, CreateCatalogAndNamespace(ns)); - TableIdentifier table_id{.ns = ns, .name = "context_scan_table"}; - ASSERT_THAT(CreateDefaultTable(catalog, table_id), IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto table, catalog->LoadTable(table_id)); - - internal::TableScanContext context; - context.selected_columns = {"id", "data"}; - context.case_sensitive = true; - ICEBERG_UNWRAP_OR_FAIL(auto response, catalog->PlanTableScan(*table, context)); - EXPECT_TRUE(response.file_scan_tasks.empty()); -} - -TEST_F(RestCatalogIntegrationTest, FetchPlanningResult) { - Namespace ns{.levels = {"test_fetch_planning_result"}}; - ICEBERG_UNWRAP_OR_FAIL(auto catalog, CreateCatalogAndNamespace(ns)); - TableIdentifier table_id{.ns = ns, .name = "fetch_plan_table"}; - ASSERT_THAT(CreateDefaultTable(catalog, table_id), IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto table, catalog->LoadTable(table_id)); - - internal::TableScanContext context; - ICEBERG_UNWRAP_OR_FAIL(auto plan_response, catalog->PlanTableScan(*table, context)); - - // NOTE: apache/iceberg-rest-fixture always responds synchronously (status="completed") - // with a non-empty plan_id (e.g. "sync-"). Sync plans are immediately discarded - // server-side. Async paths are covered by unit tests. - EXPECT_FALSE(plan_response.plan_id.empty()); -} - -TEST_F(RestCatalogIntegrationTest, CancelPlanning) { - Namespace ns{.levels = {"test_cancel_planning"}}; - ICEBERG_UNWRAP_OR_FAIL(auto catalog, CreateCatalogAndNamespace(ns)); - TableIdentifier table_id{.ns = ns, .name = "cancel_plan_table"}; - ASSERT_THAT(CreateDefaultTable(catalog, table_id), IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto table, catalog->LoadTable(table_id)); - - internal::TableScanContext context; - ICEBERG_UNWRAP_OR_FAIL(auto plan_response, catalog->PlanTableScan(*table, context)); - - // NOTE: apache/iceberg-rest-fixture always responds synchronously — sync plan_id is - // accepted for cancellation (idempotent). Async paths are covered by unit tests. - ASSERT_FALSE(plan_response.plan_id.empty()); - ASSERT_THAT(catalog->CancelPlanning(*table, plan_response.plan_id), IsOk()); -} - -TEST_F(RestCatalogIntegrationTest, FetchScanTasks) { - Namespace ns{.levels = {"test_fetch_scan_tasks"}}; - ICEBERG_UNWRAP_OR_FAIL(auto catalog, CreateCatalogAndNamespace(ns)); - TableIdentifier table_id{.ns = ns, .name = "fetch_tasks_table"}; - ASSERT_THAT(CreateDefaultTable(catalog, table_id), IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto table, catalog->LoadTable(table_id)); - - internal::TableScanContext context; - ICEBERG_UNWRAP_OR_FAIL(auto plan_response, catalog->PlanTableScan(*table, context)); - - // NOTE: apache/iceberg-rest-fixture always responds synchronously — plan_tasks is - // always empty. FetchScanTasks is only relevant for async plans; async paths are - // covered by unit tests. - EXPECT_TRUE(plan_response.plan_tasks.empty()); -} - // -- Snapshot loading mode -- TEST_F(RestCatalogIntegrationTest, LoadTableWithSnapshotModeAll) { From c115c0a79d4419a9541181a098e99123f4e8a7d9 Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sun, 17 May 2026 14:57:36 -0700 Subject: [PATCH 13/17] feat(rest): use PlanStatus enum and shared_ptr for scan task types Replace string-based plan_status fields in PlanTableScanResponse and FetchPlanningResultResponse with a PlanStatus enum defined in table_scan.h. Change file_scan_tasks and delete_files in BaseScanTaskResponse from value vectors to vector> for shared ownership semantics. --- src/iceberg/catalog/rest/json_serde.cc | 8 +++-- src/iceberg/catalog/rest/types.cc | 45 ++++++++++++++---------- src/iceberg/catalog/rest/types.h | 9 ++--- src/iceberg/json_serde.cc | 16 +++++---- src/iceberg/json_serde_internal.h | 5 +-- src/iceberg/table_scan.cc | 18 ++++++++++ src/iceberg/table_scan.h | 12 +++++++ src/iceberg/test/json_serde_test.cc | 15 ++++---- src/iceberg/test/rest_json_serde_test.cc | 14 ++++---- 9 files changed, 93 insertions(+), 49 deletions(-) diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index 649c1e81e..a8801690f 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -113,7 +113,7 @@ Status BaseScanTaskResponseFromJson( for (const auto& entry_json : delete_files_json) { ICEBERG_ASSIGN_OR_RAISE(auto delete_file, DataFileFromJson(entry_json, partition_specs_by_id, schema)); - response->delete_files.push_back(std::move(delete_file)); + response->delete_files.push_back(std::make_shared(std::move(delete_file))); } // 3. file_scan_tasks @@ -595,8 +595,9 @@ Result PlanTableScanResponseFromJson( partition_specs_by_id, const Schema& schema) { PlanTableScanResponse response; - ICEBERG_ASSIGN_OR_RAISE(response.plan_status, + ICEBERG_ASSIGN_OR_RAISE(auto plan_status_str, GetJsonValue(json, kPlanStatus)); + ICEBERG_ASSIGN_OR_RAISE(response.plan_status, PlanStatusFromString(plan_status_str)); ICEBERG_ASSIGN_OR_RAISE(response.plan_id, GetJsonValueOrDefault(json, kPlanId)); ICEBERG_RETURN_UNEXPECTED( @@ -611,8 +612,9 @@ Result FetchPlanningResultResponseFromJson( partition_specs_by_id, const Schema& schema) { FetchPlanningResultResponse response; - ICEBERG_ASSIGN_OR_RAISE(response.plan_status, + ICEBERG_ASSIGN_OR_RAISE(auto plan_status_str, GetJsonValue(json, kPlanStatus)); + ICEBERG_ASSIGN_OR_RAISE(response.plan_status, PlanStatusFromString(plan_status_str)); ICEBERG_RETURN_UNEXPECTED( BaseScanTaskResponseFromJson(json, &response, partition_specs_by_id, schema)); ICEBERG_RETURN_UNEXPECTED(response.Validate()); diff --git a/src/iceberg/catalog/rest/types.cc b/src/iceberg/catalog/rest/types.cc index e394999da..53d057d9f 100644 --- a/src/iceberg/catalog/rest/types.cc +++ b/src/iceberg/catalog/rest/types.cc @@ -131,33 +131,45 @@ bool BaseScanTaskResponse::operator==(const BaseScanTaskResponse& other) const { if (plan_tasks != other.plan_tasks) { return false; } - if (delete_files != other.delete_files) { + if (delete_files.size() != other.delete_files.size()) { return false; } + for (size_t i = 0; i < delete_files.size(); ++i) { + if (!delete_files[i] != !other.delete_files[i]) { + return false; + } + if (delete_files[i] && *delete_files[i] != *other.delete_files[i]) { + return false; + } + } if (file_scan_tasks.size() != other.file_scan_tasks.size()) { return false; } for (size_t i = 0; i < file_scan_tasks.size(); ++i) { const auto& a = file_scan_tasks[i]; const auto& b = other.file_scan_tasks[i]; - if (!a.data_file() != !b.data_file()) { + if (!a != !b) { return false; } - if (a.data_file() && *a.data_file() != *b.data_file()) { + if (!a) continue; + if (!a->data_file() != !b->data_file()) { return false; } - if (a.delete_files().size() != b.delete_files().size()) { + if (a->data_file() && *a->data_file() != *b->data_file()) { return false; } - for (size_t j = 0; j < a.delete_files().size(); ++j) { - if (!a.delete_files()[j] != !b.delete_files()[j]) { + if (a->delete_files().size() != b->delete_files().size()) { + return false; + } + for (size_t j = 0; j < a->delete_files().size(); ++j) { + if (!a->delete_files()[j] != !b->delete_files()[j]) { return false; } - if (a.delete_files()[j] && *a.delete_files()[j] != *b.delete_files()[j]) { + if (a->delete_files()[j] && *a->delete_files()[j] != *b->delete_files()[j]) { return false; } } - if (a.residual_filter() != b.residual_filter()) { + if (a->residual_filter() != b->residual_filter()) { return false; } } @@ -220,22 +232,20 @@ Status PlanTableScanRequest::Validate() const { } Status PlanTableScanResponse::Validate() const { - if (plan_status.empty()) { - return ValidationFailed("Invalid response: plan status must be defined"); - } - if (plan_status == "submitted" && plan_id.empty()) { + if (plan_status == PlanStatus::kSubmitted && plan_id.empty()) { return ValidationFailed( "Invalid response: plan id should be defined when status is 'submitted'"); } - if (plan_status == "cancelled") { + if (plan_status == PlanStatus::kCancelled) { return ValidationFailed( "Invalid response: 'cancelled' is not a valid status for planTableScan"); } - if (plan_status != "completed" && (!plan_tasks.empty() || !file_scan_tasks.empty())) { + if (plan_status != PlanStatus::kCompleted && (!plan_tasks.empty() || !file_scan_tasks.empty())) { return ValidationFailed( "Invalid response: tasks can only be defined when status is 'completed'"); } - if (!plan_id.empty() && plan_status != "submitted" && plan_status != "completed") { + if (!plan_id.empty() && plan_status != PlanStatus::kSubmitted && + plan_status != PlanStatus::kCompleted) { return ValidationFailed( "Invalid response: plan id can only be defined when status is 'submitted' or " "'completed'"); @@ -249,10 +259,7 @@ Status PlanTableScanResponse::Validate() const { } Status FetchPlanningResultResponse::Validate() const { - if (plan_status.empty()) { - return ValidationFailed("Invalid status: null"); - } - if (plan_status != "completed" && (!plan_tasks.empty() || !file_scan_tasks.empty())) { + if (plan_status != PlanStatus::kCompleted && (!plan_tasks.empty() || !file_scan_tasks.empty())) { return ValidationFailed( "Invalid response: tasks can only be returned in a 'completed' status"); } diff --git a/src/iceberg/catalog/rest/types.h b/src/iceberg/catalog/rest/types.h index 4e0b6e16c..3e735cdcb 100644 --- a/src/iceberg/catalog/rest/types.h +++ b/src/iceberg/catalog/rest/types.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -319,8 +320,8 @@ struct ICEBERG_REST_EXPORT PlanTableScanRequest { /// endpoints. struct ICEBERG_REST_EXPORT BaseScanTaskResponse { std::vector plan_tasks; - std::vector file_scan_tasks; - std::vector delete_files; + std::vector> file_scan_tasks; + std::vector> delete_files; // std::unordered_map specsById; Status Validate() const { return {}; }; @@ -331,7 +332,7 @@ struct ICEBERG_REST_EXPORT BaseScanTaskResponse { /// \brief Response from initiating a scan planning operation, including plan status and /// initial scan tasks. struct ICEBERG_REST_EXPORT PlanTableScanResponse : BaseScanTaskResponse { - std::string plan_status; + PlanStatus plan_status = PlanStatus::kCompleted; std::string plan_id; // TODO(sandeepg): Add credentials. @@ -343,7 +344,7 @@ struct ICEBERG_REST_EXPORT PlanTableScanResponse : BaseScanTaskResponse { /// \brief Response from polling an asynchronous scan plan, including current status and /// available scan tasks. struct ICEBERG_REST_EXPORT FetchPlanningResultResponse : BaseScanTaskResponse { - std::string plan_status; + PlanStatus plan_status = PlanStatus::kCompleted; // TODO(sandeepg): Add credentials. Status Validate() const; diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index f7b782b49..b3bf850ea 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -1894,15 +1894,16 @@ Result DataFileFromJson( return df; } -Result> FileScanTasksFromJson( - const nlohmann::json& json, const std::vector& delete_files, +Result>> FileScanTasksFromJson( + const nlohmann::json& json, + const std::vector>& delete_files, const std::unordered_map>& partitionSpecById, const Schema& schema) { if (!json.is_array()) { return JsonParseError("Cannot parse file scan tasks from non-array: {}", SafeDumpJson(json)); } - std::vector file_scan_tasks; + std::vector> file_scan_tasks; for (const auto& task_json : json) { if (!task_json.is_object()) { return JsonParseError("Cannot parse file scan task from a non-object: {}", @@ -1925,7 +1926,7 @@ Result> FileScanTasksFromJson( "delete-file-references index {} is out of range (delete_files size: {})", ref, delete_files.size()); } - task_delete_files.push_back(std::make_shared(delete_files[ref])); + task_delete_files.push_back(delete_files[ref]); } } @@ -1936,9 +1937,10 @@ Result> FileScanTasksFromJson( ICEBERG_ASSIGN_OR_RAISE(residual_filter, ExpressionFromJson(filter_json)); } - file_scan_tasks.emplace_back(std::make_shared(std::move(data_file)), - std::move(task_delete_files), - std::move(residual_filter)); + file_scan_tasks.push_back( + std::make_shared(std::make_shared(std::move(data_file)), + std::move(task_delete_files), + std::move(residual_filter))); } return file_scan_tasks; } diff --git a/src/iceberg/json_serde_internal.h b/src/iceberg/json_serde_internal.h index 410d92ae2..5e6fa66cf 100644 --- a/src/iceberg/json_serde_internal.h +++ b/src/iceberg/json_serde_internal.h @@ -447,8 +447,9 @@ ICEBERG_EXPORT Result DataFileFromJson( /// \param schema The table schema, used with partitionSpecById to resolve partition /// types. /// \return A vector of `FileScanTask` objects or an error if the conversion fails. -ICEBERG_EXPORT Result> FileScanTasksFromJson( - const nlohmann::json& json, const std::vector& delete_files, +ICEBERG_EXPORT Result>> FileScanTasksFromJson( + const nlohmann::json& json, + const std::vector>& delete_files, const std::unordered_map>& partitionSpecById, const Schema& schema); diff --git a/src/iceberg/table_scan.cc b/src/iceberg/table_scan.cc index ed2ede707..96c2ad4f2 100644 --- a/src/iceberg/table_scan.cc +++ b/src/iceberg/table_scan.cc @@ -765,4 +765,22 @@ IncrementalChangelogScan::PlanFiles(std::optional from_snapshot_id_excl return NotImplemented("IncrementalChangelogScan::PlanFiles is not implemented"); } +std::string_view ToString(PlanStatus status) { + switch (status) { + case PlanStatus::kSubmitted: return "submitted"; + case PlanStatus::kCompleted: return "completed"; + case PlanStatus::kCancelled: return "cancelled"; + case PlanStatus::kFailed: return "failed"; + } + return "unknown"; +} + +Result PlanStatusFromString(std::string_view status_str) { + if (status_str == "submitted") return PlanStatus::kSubmitted; + if (status_str == "completed") return PlanStatus::kCompleted; + if (status_str == "cancelled") return PlanStatus::kCancelled; + if (status_str == "failed") return PlanStatus::kFailed; + return JsonParseError("Unknown plan status: {}", status_str); +} + } // namespace iceberg diff --git a/src/iceberg/table_scan.h b/src/iceberg/table_scan.h index b21adcca6..506baba4d 100644 --- a/src/iceberg/table_scan.h +++ b/src/iceberg/table_scan.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,17 @@ namespace iceberg { +/// \brief Status of a server-side scan planning operation. +enum class PlanStatus { + kSubmitted, + kCompleted, + kCancelled, + kFailed, +}; + +ICEBERG_EXPORT std::string_view ToString(PlanStatus status); +ICEBERG_EXPORT Result PlanStatusFromString(std::string_view status_str); + /// \brief An abstract scan task. class ICEBERG_EXPORT ScanTask { public: diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index 6979a02f2..6fc7aaa07 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -942,10 +942,10 @@ TEST(FileScanTasksFromJsonTest, SingleTaskNoDeleteFiles) { ASSERT_THAT(result, IsOk()); ASSERT_EQ(result.value().size(), 1U); const auto& task = result.value()[0]; - ASSERT_NE(task.data_file(), nullptr); - EXPECT_EQ(task.data_file()->file_path, "s3://bucket/data/file.parquet"); - EXPECT_TRUE(task.delete_files().empty()); - EXPECT_EQ(task.residual_filter(), nullptr); + ASSERT_NE(task->data_file(), nullptr); + EXPECT_EQ(task->data_file()->file_path, "s3://bucket/data/file.parquet"); + EXPECT_TRUE(task->delete_files().empty()); + EXPECT_EQ(task->residual_filter(), nullptr); } TEST(FileScanTasksFromJsonTest, TaskWithDeleteFileReferences) { @@ -967,12 +967,13 @@ TEST(FileScanTasksFromJsonTest, TaskWithDeleteFileReferences) { "delete-file-references": [0] }])"_json; - auto result = FileScanTasksFromJson(json, {delete_file}, {}, Schema({}, 0)); + auto result = + FileScanTasksFromJson(json, {std::make_shared(delete_file)}, {}, Schema({}, 0)); ASSERT_THAT(result, IsOk()); ASSERT_EQ(result.value().size(), 1U); const auto& task = result.value()[0]; - ASSERT_EQ(task.delete_files().size(), 1U); - EXPECT_EQ(task.delete_files()[0]->file_path, "s3://bucket/deletes/pos_delete.parquet"); + ASSERT_EQ(task->delete_files().size(), 1U); + EXPECT_EQ(task->delete_files()[0]->file_path, "s3://bucket/deletes/pos_delete.parquet"); } TEST(FileScanTasksFromJsonTest, DeleteFileReferenceOutOfRange) { diff --git a/src/iceberg/test/rest_json_serde_test.cc b/src/iceberg/test/rest_json_serde_test.cc index e000f26b9..72df61425 100644 --- a/src/iceberg/test/rest_json_serde_test.cc +++ b/src/iceberg/test/rest_json_serde_test.cc @@ -1395,7 +1395,7 @@ TEST(PlanTableScanResponseFromJsonTest, SubmittedStatusMissingOptionalFields) { auto json = nlohmann::json::parse(R"({"status":"submitted","plan-id":"abc-123"})"); auto result = PlanTableScanResponseFromJson(json, EmptySpecs(), EmptySchema()); ASSERT_THAT(result, IsOk()); - EXPECT_EQ(result->plan_status, "submitted"); + EXPECT_EQ(result->plan_status, PlanStatus::kSubmitted); EXPECT_EQ(result->plan_id, "abc-123"); EXPECT_TRUE(result->plan_tasks.empty()); EXPECT_TRUE(result->file_scan_tasks.empty()); @@ -1408,7 +1408,7 @@ TEST(PlanTableScanResponseFromJsonTest, CompletedStatusWithPlanTasks) { R"({"status":"completed","plan-id":"abc-123","plan-tasks":["task-1","task-2"],"delete-files":[],"file-scan-tasks":[]})"); auto result = PlanTableScanResponseFromJson(json, EmptySpecs(), EmptySchema()); ASSERT_THAT(result, IsOk()); - EXPECT_EQ(result->plan_status, "completed"); + EXPECT_EQ(result->plan_status, PlanStatus::kCompleted); EXPECT_EQ(result->plan_id, "abc-123"); ASSERT_EQ(result->plan_tasks.size(), 2); EXPECT_EQ(result->plan_tasks[0], "task-1"); @@ -1436,7 +1436,7 @@ TEST(FetchPlanningResultResponseFromJsonTest, SubmittedStatusNoTasks) { auto json = nlohmann::json::parse(R"({"status":"submitted"})"); auto result = FetchPlanningResultResponseFromJson(json, EmptySpecs(), EmptySchema()); ASSERT_THAT(result, IsOk()); - EXPECT_EQ(result->plan_status, "submitted"); + EXPECT_EQ(result->plan_status, PlanStatus::kSubmitted); EXPECT_TRUE(result->plan_tasks.empty()); EXPECT_TRUE(result->file_scan_tasks.empty()); EXPECT_TRUE(result->delete_files.empty()); @@ -1447,7 +1447,7 @@ TEST(FetchPlanningResultResponseFromJsonTest, CompletedStatusWithPlanTasks) { R"({"status":"completed","plan-tasks":["task-1"],"delete-files":[],"file-scan-tasks":[]})"); auto result = FetchPlanningResultResponseFromJson(json, EmptySpecs(), EmptySchema()); ASSERT_THAT(result, IsOk()); - EXPECT_EQ(result->plan_status, "completed"); + EXPECT_EQ(result->plan_status, PlanStatus::kCompleted); ASSERT_EQ(result->plan_tasks.size(), 1); EXPECT_EQ(result->plan_tasks[0], "task-1"); } @@ -1492,10 +1492,10 @@ TEST(FetchScanTasksResponseFromJsonTest, WithFileScanTasks) { EXPECT_TRUE(result->plan_tasks.empty()); ASSERT_EQ(result->delete_files.size(), 1); ASSERT_EQ(result->file_scan_tasks.size(), 1); - EXPECT_EQ(result->file_scan_tasks[0].data_file()->file_path, + EXPECT_EQ(result->file_scan_tasks[0]->data_file()->file_path, "s3://bucket/data/file.parquet"); - ASSERT_EQ(result->file_scan_tasks[0].delete_files().size(), 1); - EXPECT_EQ(result->file_scan_tasks[0].delete_files()[0]->file_path, + ASSERT_EQ(result->file_scan_tasks[0]->delete_files().size(), 1); + EXPECT_EQ(result->file_scan_tasks[0]->delete_files()[0]->file_path, "s3://bucket/deletes/delete.parquet"); } From 6edd3611eabfd32353a68bbce18b9ecec02df8eb Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sun, 17 May 2026 15:05:23 -0700 Subject: [PATCH 14/17] refactor: rename partitionSpecById to partition_spec_by_id --- src/iceberg/json_serde.cc | 10 +++++----- src/iceberg/json_serde_internal.h | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index b3bf850ea..2271f4e26 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -1753,7 +1753,7 @@ Result> TableRequirementFromJson( Result DataFileFromJson( const nlohmann::json& json, - const std::unordered_map>& partitionSpecById, + const std::unordered_map>& partition_spec_by_id, const Schema& schema) { if (!json.is_object()) { return JsonParseError("DataFile must be a JSON object: {}", SafeDumpJson(json)); @@ -1788,8 +1788,8 @@ Result DataFileFromJson( SafeDumpJson(partition_vals)); } std::vector literals; - auto it = partitionSpecById.find(df.partition_spec_id.value_or(-1)); - if (it == partitionSpecById.end()) { + auto it = partition_spec_by_id.find(df.partition_spec_id.value_or(-1)); + if (it == partition_spec_by_id.end()) { return JsonParseError("Invalid partition spec id: {}", df.partition_spec_id.value_or(-1)); } @@ -1897,7 +1897,7 @@ Result DataFileFromJson( Result>> FileScanTasksFromJson( const nlohmann::json& json, const std::vector>& delete_files, - const std::unordered_map>& partitionSpecById, + const std::unordered_map>& partition_spec_by_id, const Schema& schema) { if (!json.is_array()) { return JsonParseError("Cannot parse file scan tasks from non-array: {}", @@ -1913,7 +1913,7 @@ Result>> FileScanTasksFromJson( ICEBERG_ASSIGN_OR_RAISE(auto data_file_json, GetJsonValue(task_json, kDataFile)); ICEBERG_ASSIGN_OR_RAISE(auto data_file, - DataFileFromJson(data_file_json, partitionSpecById, schema)); + DataFileFromJson(data_file_json, partition_spec_by_id, schema)); std::vector> task_delete_files; if (task_json.contains(kDeleteFileReferences) && diff --git a/src/iceberg/json_serde_internal.h b/src/iceberg/json_serde_internal.h index 5e6fa66cf..a96243765 100644 --- a/src/iceberg/json_serde_internal.h +++ b/src/iceberg/json_serde_internal.h @@ -424,16 +424,16 @@ ICEBERG_EXPORT Result> TableRequirementFromJso /// Binary fields (lower-bounds, upper-bounds, key-metadata) are base64-encoded. /// /// \param json The JSON object representing a `DataFile`. -/// \param partitionSpecById Map from spec ID to PartitionSpec for type-aware partition +/// \param partition_spec_by_id Map from spec ID to PartitionSpec for type-aware partition /// parsing. -/// \param schema The table schema, used with partitionSpecById to resolve partition +/// \param schema The table schema, used with partition_spec_by_id to resolve partition /// types. /// \return A `DataFile` object or an error if the conversion fails. ICEBERG_EXPORT Result DataFileFromJson(const nlohmann::json& json); ICEBERG_EXPORT Result DataFileFromJson( const nlohmann::json& json, - const std::unordered_map>& partitionSpecById, + const std::unordered_map>& partition_spec_by_id, const Schema& schema); /// \brief Deserializes a JSON array of file scan tasks. @@ -442,15 +442,15 @@ ICEBERG_EXPORT Result DataFileFromJson( /// /// \param json The JSON array of file scan task objects. /// \param delete_files Delete files indexed by the tasks' delete-file-references. -/// \param partitionSpecById Map from spec ID to PartitionSpec for type-aware partition +/// \param partition_spec_by_id Map from spec ID to PartitionSpec for type-aware partition /// parsing. -/// \param schema The table schema, used with partitionSpecById to resolve partition +/// \param schema The table schema, used with partition_spec_by_id to resolve partition /// types. /// \return A vector of `FileScanTask` objects or an error if the conversion fails. ICEBERG_EXPORT Result>> FileScanTasksFromJson( const nlohmann::json& json, const std::vector>& delete_files, - const std::unordered_map>& partitionSpecById, + const std::unordered_map>& partition_spec_by_id, const Schema& schema); } // namespace iceberg From ae43c16c59a9f3179640c162329e61111e71ea9d Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sun, 17 May 2026 15:17:21 -0700 Subject: [PATCH 15/17] refactor: move DataFileFromJson and FileScanTasksFromJson to catalog/rest These parsers implement REST Catalog scan response wire format and have no callers outside the REST catalog module, mirroring Java TableScanResponseParser. Remove from top-level json_serde layer, declare in catalog/rest/json_serde_internal.h, and move tests to rest_json_serde_test.cc. --- src/iceberg/catalog/rest/json_serde.cc | 220 +++++++++++++++++ .../catalog/rest/json_serde_internal.h | 15 ++ src/iceberg/json_serde.cc | 194 --------------- src/iceberg/json_serde_internal.h | 36 --- src/iceberg/test/json_serde_test.cc | 228 ------------------ src/iceberg/test/rest_json_serde_test.cc | 226 +++++++++++++++++ 6 files changed, 461 insertions(+), 458 deletions(-) diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index a8801690f..c384b4dbd 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -27,6 +27,7 @@ #include "iceberg/catalog/rest/json_serde_internal.h" #include "iceberg/catalog/rest/types.h" #include "iceberg/expression/json_serde_internal.h" +#include "iceberg/file_format.h" #include "iceberg/json_serde_internal.h" #include "iceberg/partition_spec.h" #include "iceberg/schema.h" @@ -95,6 +96,225 @@ constexpr std::string_view kEndSnapshotId = "end-snapshot-id"; constexpr std::string_view kStatsFields = "stats-fields"; constexpr std::string_view kMinRowsRequested = "min-rows-requested"; constexpr std::string_view kPlanTask = "plan-task"; +constexpr std::string_view kContent = "content"; +constexpr std::string_view kFilePath = "file-path"; +constexpr std::string_view kFileFormat = "file-format"; +constexpr std::string_view kSpecId = "spec-id"; +constexpr std::string_view kPartition = "partition"; +constexpr std::string_view kRecordCount = "record-count"; +constexpr std::string_view kFileSizeInBytes = "file-size-in-bytes"; +constexpr std::string_view kColumnSizes = "column-sizes"; +constexpr std::string_view kValueCounts = "value-counts"; +constexpr std::string_view kNullValueCounts = "null-value-counts"; +constexpr std::string_view kNanValueCounts = "nan-value-counts"; +constexpr std::string_view kLowerBounds = "lower-bounds"; +constexpr std::string_view kUpperBounds = "upper-bounds"; +constexpr std::string_view kKeyMetadata = "key-metadata"; +constexpr std::string_view kSplitOffsets = "split-offsets"; +constexpr std::string_view kEqualityIds = "equality-ids"; +constexpr std::string_view kSortOrderId = "sort-order-id"; +constexpr std::string_view kFirstRowId = "first-row-id"; +constexpr std::string_view kReferencedDataFile = "referenced-data-file"; +constexpr std::string_view kContentOffset = "content-offset"; +constexpr std::string_view kContentSizeInBytes = "content-size-in-bytes"; +constexpr std::string_view kDataFile = "data-file"; +constexpr std::string_view kDeleteFileReferences = "delete-file-references"; +constexpr std::string_view kResidualFilter = "residual-filter"; + +} // namespace + +Result DataFileFromJson( + const nlohmann::json& json, + const std::unordered_map>& partition_spec_by_id, + const Schema& schema) { + if (!json.is_object()) { + return JsonParseError("DataFile must be a JSON object: {}", SafeDumpJson(json)); + } + DataFile df; + + ICEBERG_ASSIGN_OR_RAISE(auto content_str, GetJsonValue(json, kContent)); + if (content_str == ToString(DataFile::Content::kData)) { + df.content = DataFile::Content::kData; + } else if (content_str == ToString(DataFile::Content::kPositionDeletes)) { + df.content = DataFile::Content::kPositionDeletes; + } else if (content_str == ToString(DataFile::Content::kEqualityDeletes)) { + df.content = DataFile::Content::kEqualityDeletes; + } else { + return JsonParseError("Unknown data file content: {}", content_str); + } + + ICEBERG_ASSIGN_OR_RAISE(df.file_path, GetJsonValue(json, kFilePath)); + ICEBERG_ASSIGN_OR_RAISE(auto format_str, GetJsonValue(json, kFileFormat)); + ICEBERG_ASSIGN_OR_RAISE(df.file_format, FileFormatTypeFromString(format_str)); + + if (json.contains(kSpecId) && !json.at(kSpecId).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(auto spec_id, GetJsonValue(json, kSpecId)); + df.partition_spec_id = spec_id; + } + + if (json.contains(kPartition)) { + ICEBERG_ASSIGN_OR_RAISE(auto partition_vals, + GetJsonValue(json, kPartition)); + if (!partition_vals.is_array()) { + return JsonParseError("PartitionValues must be a JSON array: {}", + SafeDumpJson(partition_vals)); + } + std::vector literals; + auto it = partition_spec_by_id.find(df.partition_spec_id.value_or(-1)); + if (it == partition_spec_by_id.end()) { + return JsonParseError("Invalid partition spec id: {}", + df.partition_spec_id.value_or(-1)); + } + ICEBERG_ASSIGN_OR_RAISE(auto struct_type, it->second->PartitionType(schema)); + auto fields = struct_type->fields(); + if (partition_vals.size() != fields.size()) { + return JsonParseError("Invalid partition data size: expected = {}, actual = {}", + fields.size(), partition_vals.size()); + } + for (size_t pos = 0; pos < fields.size(); ++pos) { + ICEBERG_ASSIGN_OR_RAISE( + auto literal, LiteralFromJson(partition_vals[pos], fields[pos].type().get())); + literals.push_back(std::move(literal)); + } + df.partition = PartitionValues(std::move(literals)); + } + + ICEBERG_ASSIGN_OR_RAISE(df.record_count, GetJsonValue(json, kRecordCount)); + ICEBERG_ASSIGN_OR_RAISE(df.file_size_in_bytes, + GetJsonValue(json, kFileSizeInBytes)); + + auto parse_int_map = [&](std::string_view key, + std::map& target) -> Status { + if (!json.contains(key) || json.at(key).is_null()) { + return {}; + } + ICEBERG_ASSIGN_OR_RAISE(auto map_json, GetJsonValue(json, key)); + ICEBERG_ASSIGN_OR_RAISE(auto keys, + GetTypedJsonValue>(map_json.at("keys"))); + ICEBERG_ASSIGN_OR_RAISE( + auto values, GetTypedJsonValue>(map_json.at("values"))); + if (keys.size() != values.size()) { + return JsonParseError("'{}' map keys and values have different lengths", key); + } + for (size_t i = 0; i < keys.size(); ++i) { + target[keys[i]] = values[i]; + } + return {}; + }; + + ICEBERG_RETURN_UNEXPECTED(parse_int_map(kColumnSizes, df.column_sizes)); + ICEBERG_RETURN_UNEXPECTED(parse_int_map(kValueCounts, df.value_counts)); + ICEBERG_RETURN_UNEXPECTED(parse_int_map(kNullValueCounts, df.null_value_counts)); + ICEBERG_RETURN_UNEXPECTED(parse_int_map(kNanValueCounts, df.nan_value_counts)); + + auto parse_binary_map = [&](std::string_view key, + std::map>& target) -> Status { + if (!json.contains(key) || json.at(key).is_null()) { + return {}; + } + ICEBERG_ASSIGN_OR_RAISE(auto map_json, GetJsonValue(json, key)); + ICEBERG_ASSIGN_OR_RAISE(auto keys, + GetJsonValue>(map_json, "keys")); + ICEBERG_ASSIGN_OR_RAISE( + auto values, GetJsonValue>>(map_json, "values")); + if (keys.size() != values.size()) { + return JsonParseError("'{}' binary map keys and values have different lengths", key); + } + for (size_t i = 0; i < keys.size(); ++i) { + target[keys[i]] = values[i]; + } + return {}; + }; + + ICEBERG_RETURN_UNEXPECTED(parse_binary_map(kLowerBounds, df.lower_bounds)); + ICEBERG_RETURN_UNEXPECTED(parse_binary_map(kUpperBounds, df.upper_bounds)); + + if (json.contains(kKeyMetadata) && !json.at(kKeyMetadata).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.key_metadata, + GetJsonValue>(json, kKeyMetadata)); + } + if (json.contains(kSplitOffsets) && !json.at(kSplitOffsets).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.split_offsets, + GetJsonValue>(json, kSplitOffsets)); + } + if (json.contains(kEqualityIds) && !json.at(kEqualityIds).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.equality_ids, + GetJsonValue>(json, kEqualityIds)); + } + if (json.contains(kSortOrderId) && !json.at(kSortOrderId).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.sort_order_id, GetJsonValue(json, kSortOrderId)); + } + if (json.contains(kFirstRowId) && !json.at(kFirstRowId).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.first_row_id, GetJsonValue(json, kFirstRowId)); + } + if (json.contains(kReferencedDataFile) && !json.at(kReferencedDataFile).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.referenced_data_file, + GetJsonValue(json, kReferencedDataFile)); + } + if (json.contains(kContentOffset) && !json.at(kContentOffset).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.content_offset, + GetJsonValue(json, kContentOffset)); + } + if (json.contains(kContentSizeInBytes) && !json.at(kContentSizeInBytes).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(df.content_size_in_bytes, + GetJsonValue(json, kContentSizeInBytes)); + } + + return df; +} + +Result>> FileScanTasksFromJson( + const nlohmann::json& json, + const std::vector>& delete_files, + const std::unordered_map>& partition_spec_by_id, + const Schema& schema) { + if (!json.is_array()) { + return JsonParseError("Cannot parse file scan tasks from non-array: {}", + SafeDumpJson(json)); + } + std::vector> file_scan_tasks; + for (const auto& task_json : json) { + if (!task_json.is_object()) { + return JsonParseError("Cannot parse file scan task from a non-object: {}", + SafeDumpJson(task_json)); + } + + ICEBERG_ASSIGN_OR_RAISE(auto data_file_json, + GetJsonValue(task_json, kDataFile)); + ICEBERG_ASSIGN_OR_RAISE(auto data_file, + DataFileFromJson(data_file_json, partition_spec_by_id, schema)); + + std::vector> task_delete_files; + if (task_json.contains(kDeleteFileReferences) && + !task_json.at(kDeleteFileReferences).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(auto refs, GetJsonValue>( + task_json, kDeleteFileReferences)); + for (int32_t ref : refs) { + if (ref < 0 || static_cast(ref) >= delete_files.size()) { + return JsonParseError( + "delete-file-references index {} is out of range (delete_files size: {})", + ref, delete_files.size()); + } + task_delete_files.push_back(delete_files[ref]); + } + } + + std::shared_ptr residual_filter; + if (task_json.contains(kResidualFilter) && !task_json.at(kResidualFilter).is_null()) { + ICEBERG_ASSIGN_OR_RAISE(auto filter_json, + GetJsonValue(task_json, kResidualFilter)); + ICEBERG_ASSIGN_OR_RAISE(residual_filter, ExpressionFromJson(filter_json)); + } + + file_scan_tasks.push_back( + std::make_shared(std::make_shared(std::move(data_file)), + std::move(task_delete_files), + std::move(residual_filter))); + } + return file_scan_tasks; +} + +namespace { Status BaseScanTaskResponseFromJson( const nlohmann::json& json, BaseScanTaskResponse* response, diff --git a/src/iceberg/catalog/rest/json_serde_internal.h b/src/iceberg/catalog/rest/json_serde_internal.h index cdd2dab4d..c7e42e479 100644 --- a/src/iceberg/catalog/rest/json_serde_internal.h +++ b/src/iceberg/catalog/rest/json_serde_internal.h @@ -19,15 +19,19 @@ #pragma once +#include #include +#include #include #include "iceberg/catalog/rest/iceberg_rest_export.h" #include "iceberg/catalog/rest/types.h" +#include "iceberg/manifest/manifest_entry.h" #include "iceberg/partition_spec.h" #include "iceberg/result.h" #include "iceberg/schema.h" +#include "iceberg/table_scan.h" /// \file iceberg/catalog/rest/json_serde_internal.h /// JSON serialization and deserialization for Iceberg REST Catalog API types. @@ -88,4 +92,15 @@ ICEBERG_REST_EXPORT Result FetchScanTasksResponseFromJso ICEBERG_REST_EXPORT Result ToJson(const PlanTableScanRequest& request); ICEBERG_REST_EXPORT nlohmann::json ToJson(const FetchScanTasksRequest& request); +ICEBERG_REST_EXPORT Result DataFileFromJson( + const nlohmann::json& json, + const std::unordered_map>& partition_spec_by_id, + const Schema& schema); + +ICEBERG_REST_EXPORT Result>> FileScanTasksFromJson( + const nlohmann::json& json, + const std::vector>& delete_files, + const std::unordered_map>& partition_spec_by_id, + const Schema& schema); + } // namespace iceberg::rest diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 2271f4e26..a88cf003c 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -1751,198 +1751,4 @@ Result> TableRequirementFromJson( return JsonParseError("Unknown table requirement type: {}", type); } -Result DataFileFromJson( - const nlohmann::json& json, - const std::unordered_map>& partition_spec_by_id, - const Schema& schema) { - if (!json.is_object()) { - return JsonParseError("DataFile must be a JSON object: {}", SafeDumpJson(json)); - } - DataFile df; - - ICEBERG_ASSIGN_OR_RAISE(auto content_str, GetJsonValue(json, kContent)); - if (content_str == ToString(DataFile::Content::kData)) { - df.content = DataFile::Content::kData; - } else if (content_str == ToString(DataFile::Content::kPositionDeletes)) { - df.content = DataFile::Content::kPositionDeletes; - } else if (content_str == ToString(DataFile::Content::kEqualityDeletes)) { - df.content = DataFile::Content::kEqualityDeletes; - } else { - return JsonParseError("Unknown data file content: {}", content_str); - } - - ICEBERG_ASSIGN_OR_RAISE(df.file_path, GetJsonValue(json, kFilePath)); - ICEBERG_ASSIGN_OR_RAISE(auto format_str, GetJsonValue(json, kFileFormat)); - ICEBERG_ASSIGN_OR_RAISE(df.file_format, FileFormatTypeFromString(format_str)); - - if (json.contains(kSpecId) && !json.at(kSpecId).is_null()) { - ICEBERG_ASSIGN_OR_RAISE(auto spec_id, GetJsonValue(json, kSpecId)); - df.partition_spec_id = spec_id; - } - - if (json.contains(kPartition)) { - ICEBERG_ASSIGN_OR_RAISE(auto partition_vals, - GetJsonValue(json, kPartition)); - if (!partition_vals.is_array()) { - return JsonParseError("PartitionValues must be a JSON array: {}", - SafeDumpJson(partition_vals)); - } - std::vector literals; - auto it = partition_spec_by_id.find(df.partition_spec_id.value_or(-1)); - if (it == partition_spec_by_id.end()) { - return JsonParseError("Invalid partition spec id: {}", - df.partition_spec_id.value_or(-1)); - } - ICEBERG_ASSIGN_OR_RAISE(auto struct_type, it->second->PartitionType(schema)); - auto fields = struct_type->fields(); - if (partition_vals.size() != fields.size()) { - return JsonParseError("Invalid partition data size: expected = {}, actual = {}", - fields.size(), partition_vals.size()); - } - for (size_t pos = 0; pos < fields.size(); ++pos) { - ICEBERG_ASSIGN_OR_RAISE( - auto literal, LiteralFromJson(partition_vals[pos], fields[pos].type().get())); - literals.push_back(std::move(literal)); - } - df.partition = PartitionValues(std::move(literals)); - } - - ICEBERG_ASSIGN_OR_RAISE(df.record_count, GetJsonValue(json, kRecordCount)); - ICEBERG_ASSIGN_OR_RAISE(df.file_size_in_bytes, - GetJsonValue(json, kFileSizeInBytes)); - - // Parse CountMap: {"keys": [int, ...], "values": [long, ...]} - auto parse_int_map = [&](std::string_view key, - std::map& target) -> Status { - if (!json.contains(key) || json.at(key).is_null()) { - return {}; - } - ICEBERG_ASSIGN_OR_RAISE(auto map_json, GetJsonValue(json, key)); - ICEBERG_ASSIGN_OR_RAISE(auto keys, - GetTypedJsonValue>(map_json.at("keys"))); - ICEBERG_ASSIGN_OR_RAISE( - auto values, GetTypedJsonValue>(map_json.at("values"))); - if (keys.size() != values.size()) { - return JsonParseError("'{}' map keys and values have different lengths", key); - } - for (size_t i = 0; i < keys.size(); ++i) { - target[keys[i]] = values[i]; - } - return {}; - }; - - ICEBERG_RETURN_UNEXPECTED(parse_int_map(kColumnSizes, df.column_sizes)); - ICEBERG_RETURN_UNEXPECTED(parse_int_map(kValueCounts, df.value_counts)); - ICEBERG_RETURN_UNEXPECTED(parse_int_map(kNullValueCounts, df.null_value_counts)); - ICEBERG_RETURN_UNEXPECTED(parse_int_map(kNanValueCounts, df.nan_value_counts)); - - // Parse BinaryMap: {"keys": [int, ...], "values": [...]} - auto parse_binary_map = [&](std::string_view key, - std::map>& target) -> Status { - if (!json.contains(key) || json.at(key).is_null()) { - return {}; - } - ICEBERG_ASSIGN_OR_RAISE(auto map_json, GetJsonValue(json, key)); - ICEBERG_ASSIGN_OR_RAISE(auto keys, - GetJsonValue>(map_json, "keys")); - ICEBERG_ASSIGN_OR_RAISE( - auto values, GetJsonValue>>(map_json, "values")); - if (keys.size() != values.size()) { - return JsonParseError("'{}' binary map keys and values have different lengths", - key); - } - for (size_t i = 0; i < keys.size(); ++i) { - target[keys[i]] = values[i]; - } - return {}; - }; - - ICEBERG_RETURN_UNEXPECTED(parse_binary_map(kLowerBounds, df.lower_bounds)); - ICEBERG_RETURN_UNEXPECTED(parse_binary_map(kUpperBounds, df.upper_bounds)); - - if (json.contains(kKeyMetadata) && !json.at(kKeyMetadata).is_null()) { - ICEBERG_ASSIGN_OR_RAISE(df.key_metadata, - GetJsonValue>(json, kKeyMetadata)); - } - if (json.contains(kSplitOffsets) && !json.at(kSplitOffsets).is_null()) { - ICEBERG_ASSIGN_OR_RAISE(df.split_offsets, - GetJsonValue>(json, kSplitOffsets)); - } - if (json.contains(kEqualityIds) && !json.at(kEqualityIds).is_null()) { - ICEBERG_ASSIGN_OR_RAISE(df.equality_ids, - GetJsonValue>(json, kEqualityIds)); - } - if (json.contains(kSortOrderId) && !json.at(kSortOrderId).is_null()) { - ICEBERG_ASSIGN_OR_RAISE(df.sort_order_id, GetJsonValue(json, kSortOrderId)); - } - if (json.contains(kFirstRowId) && !json.at(kFirstRowId).is_null()) { - ICEBERG_ASSIGN_OR_RAISE(df.first_row_id, GetJsonValue(json, kFirstRowId)); - } - if (json.contains(kReferencedDataFile) && !json.at(kReferencedDataFile).is_null()) { - ICEBERG_ASSIGN_OR_RAISE(df.referenced_data_file, - GetJsonValue(json, kReferencedDataFile)); - } - if (json.contains(kContentOffset) && !json.at(kContentOffset).is_null()) { - ICEBERG_ASSIGN_OR_RAISE(df.content_offset, - GetJsonValue(json, kContentOffset)); - } - if (json.contains(kContentSizeInBytes) && !json.at(kContentSizeInBytes).is_null()) { - ICEBERG_ASSIGN_OR_RAISE(df.content_size_in_bytes, - GetJsonValue(json, kContentSizeInBytes)); - } - - return df; -} - -Result>> FileScanTasksFromJson( - const nlohmann::json& json, - const std::vector>& delete_files, - const std::unordered_map>& partition_spec_by_id, - const Schema& schema) { - if (!json.is_array()) { - return JsonParseError("Cannot parse file scan tasks from non-array: {}", - SafeDumpJson(json)); - } - std::vector> file_scan_tasks; - for (const auto& task_json : json) { - if (!task_json.is_object()) { - return JsonParseError("Cannot parse file scan task from a non-object: {}", - SafeDumpJson(task_json)); - } - - ICEBERG_ASSIGN_OR_RAISE(auto data_file_json, - GetJsonValue(task_json, kDataFile)); - ICEBERG_ASSIGN_OR_RAISE(auto data_file, - DataFileFromJson(data_file_json, partition_spec_by_id, schema)); - - std::vector> task_delete_files; - if (task_json.contains(kDeleteFileReferences) && - !task_json.at(kDeleteFileReferences).is_null()) { - ICEBERG_ASSIGN_OR_RAISE(auto refs, GetJsonValue>( - task_json, kDeleteFileReferences)); - for (int32_t ref : refs) { - if (ref < 0 || static_cast(ref) >= delete_files.size()) { - return JsonParseError( - "delete-file-references index {} is out of range (delete_files size: {})", - ref, delete_files.size()); - } - task_delete_files.push_back(delete_files[ref]); - } - } - - std::shared_ptr residual_filter; - if (task_json.contains(kResidualFilter) && !task_json.at(kResidualFilter).is_null()) { - ICEBERG_ASSIGN_OR_RAISE(auto filter_json, - GetJsonValue(task_json, kResidualFilter)); - ICEBERG_ASSIGN_OR_RAISE(residual_filter, ExpressionFromJson(filter_json)); - } - - file_scan_tasks.push_back( - std::make_shared(std::make_shared(std::move(data_file)), - std::move(task_delete_files), - std::move(residual_filter))); - } - return file_scan_tasks; -} - } // namespace iceberg diff --git a/src/iceberg/json_serde_internal.h b/src/iceberg/json_serde_internal.h index a96243765..d67bb61eb 100644 --- a/src/iceberg/json_serde_internal.h +++ b/src/iceberg/json_serde_internal.h @@ -417,40 +417,4 @@ ICEBERG_EXPORT nlohmann::json ToJson(const TableRequirement& requirement); ICEBERG_EXPORT Result> TableRequirementFromJson( const nlohmann::json& json); -/// \brief Deserializes a JSON object into a `DataFile` object. -/// -/// Parses a DataFile from the REST Catalog JSON format. Maps (column-sizes, -/// value-counts, etc.) use the CountMap/BinaryMap parallel arrays format. -/// Binary fields (lower-bounds, upper-bounds, key-metadata) are base64-encoded. -/// -/// \param json The JSON object representing a `DataFile`. -/// \param partition_spec_by_id Map from spec ID to PartitionSpec for type-aware partition -/// parsing. -/// \param schema The table schema, used with partition_spec_by_id to resolve partition -/// types. -/// \return A `DataFile` object or an error if the conversion fails. -ICEBERG_EXPORT Result DataFileFromJson(const nlohmann::json& json); - -ICEBERG_EXPORT Result DataFileFromJson( - const nlohmann::json& json, - const std::unordered_map>& partition_spec_by_id, - const Schema& schema); - -/// \brief Deserializes a JSON array of file scan tasks. -/// -/// Each task may reference delete files by index into \p delete_files. -/// -/// \param json The JSON array of file scan task objects. -/// \param delete_files Delete files indexed by the tasks' delete-file-references. -/// \param partition_spec_by_id Map from spec ID to PartitionSpec for type-aware partition -/// parsing. -/// \param schema The table schema, used with partition_spec_by_id to resolve partition -/// types. -/// \return A vector of `FileScanTask` objects or an error if the conversion fails. -ICEBERG_EXPORT Result>> FileScanTasksFromJson( - const nlohmann::json& json, - const std::vector>& delete_files, - const std::unordered_map>& partition_spec_by_id, - const Schema& schema); - } // namespace iceberg diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index 6fc7aaa07..f019375d3 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -23,9 +23,7 @@ #include #include -#include "iceberg/file_format.h" #include "iceberg/json_serde_internal.h" -#include "iceberg/manifest/manifest_entry.h" #include "iceberg/name_mapping.h" #include "iceberg/partition_spec.h" #include "iceberg/schema.h" @@ -34,7 +32,6 @@ #include "iceberg/sort_order.h" #include "iceberg/statistics_file.h" #include "iceberg/table_requirement.h" -#include "iceberg/table_scan.h" #include "iceberg/table_update.h" #include "iceberg/test/matchers.h" #include "iceberg/transform.h" @@ -775,229 +772,4 @@ TEST(TableRequirementJsonTest, TableRequirementUnknownType) { EXPECT_THAT(result, HasErrorMessage("Unknown table requirement type")); } -// ---- DataFileFromJson tests ---- - -TEST(DataFileFromJsonTest, RequiredFieldsOnly) { - auto json = R"({ - "content": "data", - "file-path": "s3://bucket/data/file.parquet", - "file-format": "PARQUET", - "file-size-in-bytes": 12345, - "record-count": 100 - })"_json; - - auto result = DataFileFromJson(json, {}, Schema({}, 0)); - ASSERT_THAT(result, IsOk()); - const auto& df = result.value(); - EXPECT_EQ(df.content, DataFile::Content::kData); - EXPECT_EQ(df.file_path, "s3://bucket/data/file.parquet"); - EXPECT_EQ(df.file_format, FileFormatType::kParquet); - EXPECT_EQ(df.file_size_in_bytes, 12345); - EXPECT_EQ(df.record_count, 100); - EXPECT_TRUE(df.column_sizes.empty()); - EXPECT_FALSE(df.sort_order_id.has_value()); - EXPECT_FALSE(df.partition_spec_id.has_value()); -} - -TEST(DataFileFromJsonTest, LowercaseFormat) { - auto json = R"({ - "content": "data", - "file-path": "s3://bucket/data/file.avro", - "file-format": "avro", - "file-size-in-bytes": 500, - "record-count": 10 - })"_json; - - auto result = DataFileFromJson(json, {}, Schema({}, 0)); - ASSERT_THAT(result, IsOk()); - EXPECT_EQ(result.value().content, DataFile::Content::kData); - EXPECT_EQ(result.value().file_format, FileFormatType::kAvro); -} - -TEST(DataFileFromJsonTest, WithOptionalFields) { - auto json = R"({ - "content": "data", - "file-path": "s3://bucket/data/file.parquet", - "file-format": "PARQUET", - "spec-id": 1, - "file-size-in-bytes": 12345, - "record-count": 100, - "column-sizes": {"keys": [1, 2], "values": [1000, 2000]}, - "value-counts": {"keys": [1, 2], "values": [100, 100]}, - "null-value-counts": {"keys": [1], "values": [0]}, - "nan-value-counts": {"keys": [2], "values": [5]}, - "split-offsets": [0, 4096], - "sort-order-id": 0 - })"_json; - - auto result = DataFileFromJson(json, {}, Schema({}, 0)); - ASSERT_THAT(result, IsOk()); - const auto& df = result.value(); - EXPECT_EQ(df.partition_spec_id, 1); - ASSERT_EQ(df.column_sizes.size(), 2U); - EXPECT_EQ(df.column_sizes.at(1), 1000); - EXPECT_EQ(df.column_sizes.at(2), 2000); - ASSERT_EQ(df.value_counts.size(), 2U); - EXPECT_EQ(df.value_counts.at(1), 100); - ASSERT_EQ(df.null_value_counts.size(), 1U); - EXPECT_EQ(df.null_value_counts.at(1), 0); - ASSERT_EQ(df.nan_value_counts.size(), 1U); - EXPECT_EQ(df.nan_value_counts.at(2), 5); - ASSERT_EQ(df.split_offsets.size(), 2U); - EXPECT_EQ(df.split_offsets[0], 0); - EXPECT_EQ(df.split_offsets[1], 4096); - EXPECT_EQ(df.sort_order_id, 0); -} - -TEST(DataFileFromJsonTest, EqualityDeleteFile) { - auto json = R"({ - "content": "equality_deletes", - "file-path": "s3://bucket/deletes/eq_delete.parquet", - "file-format": "PARQUET", - "file-size-in-bytes": 5000, - "record-count": 50, - "equality-ids": [1, 2] - })"_json; - - auto result = DataFileFromJson(json, {}, Schema({}, 0)); - ASSERT_THAT(result, IsOk()); - const auto& df = result.value(); - EXPECT_EQ(df.content, DataFile::Content::kEqualityDeletes); - ASSERT_EQ(df.equality_ids.size(), 2U); - EXPECT_EQ(df.equality_ids[0], 1); - EXPECT_EQ(df.equality_ids[1], 2); -} - -TEST(DataFileFromJsonTest, PositionDeleteFileWithReferencedDataFile) { - auto json = R"({ - "content": "position_deletes", - "file-path": "s3://bucket/deletes/pos_delete.parquet", - "file-format": "PARQUET", - "file-size-in-bytes": 3000, - "record-count": 20, - "referenced-data-file": "s3://bucket/data/file.parquet" - })"_json; - - auto result = DataFileFromJson(json, {}, Schema({}, 0)); - ASSERT_THAT(result, IsOk()); - const auto& df = result.value(); - EXPECT_EQ(df.content, DataFile::Content::kPositionDeletes); - ASSERT_TRUE(df.referenced_data_file.has_value()); - EXPECT_EQ(df.referenced_data_file.value(), "s3://bucket/data/file.parquet"); -} - -TEST(DataFileFromJsonTest, InvalidContentType) { - auto json = R"({ - "content": "UNKNOWN", - "file-path": "s3://bucket/file.parquet", - "file-format": "PARQUET", - "file-size-in-bytes": 100, - "record-count": 10 - })"_json; - - auto result = DataFileFromJson(json, {}, Schema({}, 0)); - EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); - EXPECT_THAT(result, HasErrorMessage("Unknown data file content")); -} - -TEST(DataFileFromJsonTest, MissingRequiredField) { - // Missing "file-path" - auto json = R"({ - "content": "data", - "file-format": "PARQUET", - "file-size-in-bytes": 100, - "record-count": 10 - })"_json; - - auto result = DataFileFromJson(json, {}, Schema({}, 0)); - EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); -} - -TEST(DataFileFromJsonTest, NotAnObject) { - auto result = DataFileFromJson(nlohmann::json::array(), {}, Schema({}, 0)); - EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); - EXPECT_THAT(result, HasErrorMessage("DataFile must be a JSON object")); -} - -// ---- FileScanTasksFromJson tests ---- - -TEST(FileScanTasksFromJsonTest, EmptyArray) { - auto result = FileScanTasksFromJson(nlohmann::json::array(), {}, {}, Schema({}, 0)); - ASSERT_THAT(result, IsOk()); - EXPECT_TRUE(result.value().empty()); -} - -TEST(FileScanTasksFromJsonTest, SingleTaskNoDeleteFiles) { - auto json = R"([{ - "data-file": { - "content": "data", - "file-path": "s3://bucket/data/file.parquet", - "file-format": "PARQUET", - "file-size-in-bytes": 12345, - "record-count": 100 - } - }])"_json; - - auto result = FileScanTasksFromJson(json, {}, {}, Schema({}, 0)); - ASSERT_THAT(result, IsOk()); - ASSERT_EQ(result.value().size(), 1U); - const auto& task = result.value()[0]; - ASSERT_NE(task->data_file(), nullptr); - EXPECT_EQ(task->data_file()->file_path, "s3://bucket/data/file.parquet"); - EXPECT_TRUE(task->delete_files().empty()); - EXPECT_EQ(task->residual_filter(), nullptr); -} - -TEST(FileScanTasksFromJsonTest, TaskWithDeleteFileReferences) { - DataFile delete_file; - delete_file.content = DataFile::Content::kPositionDeletes; - delete_file.file_path = "s3://bucket/deletes/pos_delete.parquet"; - delete_file.file_format = FileFormatType::kParquet; - delete_file.file_size_in_bytes = 1000; - delete_file.record_count = 5; - - auto json = R"([{ - "data-file": { - "content": "data", - "file-path": "s3://bucket/data/file.parquet", - "file-format": "PARQUET", - "file-size-in-bytes": 12345, - "record-count": 100 - }, - "delete-file-references": [0] - }])"_json; - - auto result = - FileScanTasksFromJson(json, {std::make_shared(delete_file)}, {}, Schema({}, 0)); - ASSERT_THAT(result, IsOk()); - ASSERT_EQ(result.value().size(), 1U); - const auto& task = result.value()[0]; - ASSERT_EQ(task->delete_files().size(), 1U); - EXPECT_EQ(task->delete_files()[0]->file_path, "s3://bucket/deletes/pos_delete.parquet"); -} - -TEST(FileScanTasksFromJsonTest, DeleteFileReferenceOutOfRange) { - auto json = R"([{ - "data-file": { - "content": "data", - "file-path": "s3://bucket/data/file.parquet", - "file-format": "PARQUET", - "file-size-in-bytes": 100, - "record-count": 10 - }, - "delete-file-references": [5] - }])"_json; - - // No delete files provided, so index 5 is out of range - auto result = FileScanTasksFromJson(json, {}, {}, Schema({}, 0)); - EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); - EXPECT_THAT(result, HasErrorMessage("out of range")); -} - -TEST(FileScanTasksFromJsonTest, NotAnArray) { - auto result = FileScanTasksFromJson(nlohmann::json::object(), {}, {}, Schema({}, 0)); - EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); - EXPECT_THAT(result, HasErrorMessage("non-array")); -} - } // namespace iceberg diff --git a/src/iceberg/test/rest_json_serde_test.cc b/src/iceberg/test/rest_json_serde_test.cc index 72df61425..c3038b7ac 100644 --- a/src/iceberg/test/rest_json_serde_test.cc +++ b/src/iceberg/test/rest_json_serde_test.cc @@ -25,6 +25,8 @@ #include "iceberg/catalog/rest/json_serde_internal.h" #include "iceberg/catalog/rest/types.h" +#include "iceberg/file_format.h" +#include "iceberg/manifest/manifest_entry.h" #include "iceberg/partition_spec.h" #include "iceberg/result.h" #include "iceberg/schema.h" @@ -32,6 +34,7 @@ #include "iceberg/table_identifier.h" #include "iceberg/table_metadata.h" #include "iceberg/table_requirement.h" +#include "iceberg/table_scan.h" #include "iceberg/table_update.h" #include "iceberg/test/matchers.h" @@ -1516,4 +1519,227 @@ TEST(FetchScanTasksResponseFromJsonTest, AllFieldsMissing) { ASSERT_THAT(result, IsError(ErrorKind::kValidationFailed)); } +// --- DataFileFromJson --- + +TEST(DataFileFromJsonTest, RequiredFieldsOnly) { + auto json = R"({ + "content": "data", + "file-path": "s3://bucket/data/file.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 12345, + "record-count": 100 + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + const auto& df = result.value(); + EXPECT_EQ(df.content, DataFile::Content::kData); + EXPECT_EQ(df.file_path, "s3://bucket/data/file.parquet"); + EXPECT_EQ(df.file_format, FileFormatType::kParquet); + EXPECT_EQ(df.file_size_in_bytes, 12345); + EXPECT_EQ(df.record_count, 100); + EXPECT_TRUE(df.column_sizes.empty()); + EXPECT_FALSE(df.sort_order_id.has_value()); + EXPECT_FALSE(df.partition_spec_id.has_value()); +} + +TEST(DataFileFromJsonTest, LowercaseFormat) { + auto json = R"({ + "content": "data", + "file-path": "s3://bucket/data/file.avro", + "file-format": "avro", + "file-size-in-bytes": 500, + "record-count": 10 + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result.value().content, DataFile::Content::kData); + EXPECT_EQ(result.value().file_format, FileFormatType::kAvro); +} + +TEST(DataFileFromJsonTest, WithOptionalFields) { + auto json = R"({ + "content": "data", + "file-path": "s3://bucket/data/file.parquet", + "file-format": "PARQUET", + "spec-id": 1, + "file-size-in-bytes": 12345, + "record-count": 100, + "column-sizes": {"keys": [1, 2], "values": [1000, 2000]}, + "value-counts": {"keys": [1, 2], "values": [100, 100]}, + "null-value-counts": {"keys": [1], "values": [0]}, + "nan-value-counts": {"keys": [2], "values": [5]}, + "split-offsets": [0, 4096], + "sort-order-id": 0 + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + const auto& df = result.value(); + EXPECT_EQ(df.partition_spec_id, 1); + ASSERT_EQ(df.column_sizes.size(), 2U); + EXPECT_EQ(df.column_sizes.at(1), 1000); + EXPECT_EQ(df.column_sizes.at(2), 2000); + ASSERT_EQ(df.value_counts.size(), 2U); + EXPECT_EQ(df.value_counts.at(1), 100); + ASSERT_EQ(df.null_value_counts.size(), 1U); + EXPECT_EQ(df.null_value_counts.at(1), 0); + ASSERT_EQ(df.nan_value_counts.size(), 1U); + EXPECT_EQ(df.nan_value_counts.at(2), 5); + ASSERT_EQ(df.split_offsets.size(), 2U); + EXPECT_EQ(df.split_offsets[0], 0); + EXPECT_EQ(df.split_offsets[1], 4096); + EXPECT_EQ(df.sort_order_id, 0); +} + +TEST(DataFileFromJsonTest, EqualityDeleteFile) { + auto json = R"({ + "content": "equality_deletes", + "file-path": "s3://bucket/deletes/eq_delete.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 5000, + "record-count": 50, + "equality-ids": [1, 2] + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + const auto& df = result.value(); + EXPECT_EQ(df.content, DataFile::Content::kEqualityDeletes); + ASSERT_EQ(df.equality_ids.size(), 2U); + EXPECT_EQ(df.equality_ids[0], 1); + EXPECT_EQ(df.equality_ids[1], 2); +} + +TEST(DataFileFromJsonTest, PositionDeleteFileWithReferencedDataFile) { + auto json = R"({ + "content": "position_deletes", + "file-path": "s3://bucket/deletes/pos_delete.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 3000, + "record-count": 20, + "referenced-data-file": "s3://bucket/data/file.parquet" + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + const auto& df = result.value(); + EXPECT_EQ(df.content, DataFile::Content::kPositionDeletes); + ASSERT_TRUE(df.referenced_data_file.has_value()); + EXPECT_EQ(df.referenced_data_file.value(), "s3://bucket/data/file.parquet"); +} + +TEST(DataFileFromJsonTest, InvalidContentType) { + auto json = R"({ + "content": "UNKNOWN", + "file-path": "s3://bucket/file.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 100, + "record-count": 10 + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); + EXPECT_THAT(result, HasErrorMessage("Unknown data file content")); +} + +TEST(DataFileFromJsonTest, MissingRequiredField) { + auto json = R"({ + "content": "data", + "file-format": "PARQUET", + "file-size-in-bytes": 100, + "record-count": 10 + })"_json; + + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); +} + +TEST(DataFileFromJsonTest, NotAnObject) { + auto result = DataFileFromJson(nlohmann::json::array(), {}, Schema({}, 0)); + EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); + EXPECT_THAT(result, HasErrorMessage("DataFile must be a JSON object")); +} + +// --- FileScanTasksFromJson --- + +TEST(FileScanTasksFromJsonTest, EmptyArray) { + auto result = FileScanTasksFromJson(nlohmann::json::array(), {}, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + EXPECT_TRUE(result.value().empty()); +} + +TEST(FileScanTasksFromJsonTest, SingleTaskNoDeleteFiles) { + auto json = R"([{ + "data-file": { + "content": "data", + "file-path": "s3://bucket/data/file.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 12345, + "record-count": 100 + } + }])"_json; + + auto result = FileScanTasksFromJson(json, {}, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + ASSERT_EQ(result.value().size(), 1U); + const auto& task = result.value()[0]; + ASSERT_NE(task->data_file(), nullptr); + EXPECT_EQ(task->data_file()->file_path, "s3://bucket/data/file.parquet"); + EXPECT_TRUE(task->delete_files().empty()); + EXPECT_EQ(task->residual_filter(), nullptr); +} + +TEST(FileScanTasksFromJsonTest, TaskWithDeleteFileReferences) { + DataFile delete_file; + delete_file.content = DataFile::Content::kPositionDeletes; + delete_file.file_path = "s3://bucket/deletes/pos_delete.parquet"; + delete_file.file_format = FileFormatType::kParquet; + delete_file.file_size_in_bytes = 1000; + delete_file.record_count = 5; + + auto json = R"([{ + "data-file": { + "content": "data", + "file-path": "s3://bucket/data/file.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 12345, + "record-count": 100 + }, + "delete-file-references": [0] + }])"_json; + + auto result = + FileScanTasksFromJson(json, {std::make_shared(delete_file)}, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + ASSERT_EQ(result.value().size(), 1U); + const auto& task = result.value()[0]; + ASSERT_EQ(task->delete_files().size(), 1U); + EXPECT_EQ(task->delete_files()[0]->file_path, "s3://bucket/deletes/pos_delete.parquet"); +} + +TEST(FileScanTasksFromJsonTest, DeleteFileReferenceOutOfRange) { + auto json = R"([{ + "data-file": { + "content": "data", + "file-path": "s3://bucket/data/file.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 100, + "record-count": 10 + }, + "delete-file-references": [5] + }])"_json; + + auto result = FileScanTasksFromJson(json, {}, {}, Schema({}, 0)); + EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); + EXPECT_THAT(result, HasErrorMessage("out of range")); +} + +TEST(FileScanTasksFromJsonTest, NotAnArray) { + auto result = FileScanTasksFromJson(nlohmann::json::object(), {}, {}, Schema({}, 0)); + EXPECT_THAT(result, IsError(ErrorKind::kJsonParseError)); + EXPECT_THAT(result, HasErrorMessage("non-array")); +} + } // namespace iceberg::rest From 2ca9bb2de1ea5f0ba027fe138ed5867972d08926 Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sun, 17 May 2026 15:26:16 -0700 Subject: [PATCH 16/17] feat(rest): add ToJson for DataFile and scan task responses with roundtrip tests Add ToJson(DataFile) and ToJson for PlanTableScanResponse, FetchPlanningResultResponse, and FetchScanTasksResponse to support both roundtrip test coverage and server-side use cases. The response serializers use an indexed delete-files structure matching the REST Catalog wire format. --- src/iceberg/catalog/rest/json_serde.cc | 142 ++++++++++++++++++ .../catalog/rest/json_serde_internal.h | 6 + src/iceberg/test/rest_json_serde_test.cc | 97 ++++++++++++ 3 files changed, 245 insertions(+) diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index c384b4dbd..552435191 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -17,8 +17,10 @@ * under the License. */ +#include #include #include +#include #include #include @@ -314,8 +316,129 @@ Result>> FileScanTasksFromJson( return file_scan_tasks; } +nlohmann::json ToJson(const DataFile& df) { + nlohmann::json json; + json[kContent] = ToString(df.content); + json[kFilePath] = df.file_path; + json[kFileFormat] = ToString(df.file_format); + + if (df.partition_spec_id.has_value()) { + json[kSpecId] = df.partition_spec_id.value(); + } + + json[kRecordCount] = df.record_count; + json[kFileSizeInBytes] = df.file_size_in_bytes; + + auto write_int_map = [&](std::string_view key, + const std::map& m) { + if (!m.empty()) { + std::vector keys; + std::vector values; + for (const auto& [k, v] : m) { + keys.push_back(k); + values.push_back(v); + } + json[key] = {{"keys", std::move(keys)}, {"values", std::move(values)}}; + } + }; + + write_int_map(kColumnSizes, df.column_sizes); + write_int_map(kValueCounts, df.value_counts); + write_int_map(kNullValueCounts, df.null_value_counts); + write_int_map(kNanValueCounts, df.nan_value_counts); + + auto write_binary_map = [&](std::string_view key, + const std::map>& m) { + if (!m.empty()) { + std::vector keys; + std::vector> values; + for (const auto& [k, v] : m) { + keys.push_back(k); + values.push_back(v); + } + json[key] = {{"keys", std::move(keys)}, {"values", std::move(values)}}; + } + }; + + write_binary_map(kLowerBounds, df.lower_bounds); + write_binary_map(kUpperBounds, df.upper_bounds); + + if (!df.key_metadata.empty()) { + json[kKeyMetadata] = df.key_metadata; + } + if (!df.split_offsets.empty()) { + json[kSplitOffsets] = df.split_offsets; + } + if (!df.equality_ids.empty()) { + json[kEqualityIds] = df.equality_ids; + } + if (df.sort_order_id.has_value()) { + json[kSortOrderId] = df.sort_order_id.value(); + } + if (df.first_row_id.has_value()) { + json[kFirstRowId] = df.first_row_id.value(); + } + if (df.referenced_data_file.has_value()) { + json[kReferencedDataFile] = df.referenced_data_file.value(); + } + if (df.content_offset.has_value()) { + json[kContentOffset] = df.content_offset.value(); + } + if (df.content_size_in_bytes.has_value()) { + json[kContentSizeInBytes] = df.content_size_in_bytes.value(); + } + + return json; +} + namespace { +nlohmann::json BaseScanTaskResponseToJson(const BaseScanTaskResponse& response) { + nlohmann::json json; + + SetContainerField(json, kPlanTasks, response.plan_tasks); + + // Build delete_files array and a pointer-to-index map for reference lookup. + std::unordered_map delete_file_index; + nlohmann::json delete_files_json = nlohmann::json::array(); + for (size_t i = 0; i < response.delete_files.size(); ++i) { + if (response.delete_files[i]) { + delete_files_json.push_back(ToJson(*response.delete_files[i])); + delete_file_index[response.delete_files[i].get()] = static_cast(i); + } + } + if (!delete_files_json.empty()) { + json[kDeleteFiles] = std::move(delete_files_json); + } + + nlohmann::json tasks_json = nlohmann::json::array(); + for (const auto& task : response.file_scan_tasks) { + if (!task) continue; + nlohmann::json task_json; + if (task->data_file()) { + task_json[kDataFile] = ToJson(*task->data_file()); + } + if (!task->delete_files().empty()) { + std::vector refs; + for (const auto& df : task->delete_files()) { + auto it = delete_file_index.find(df.get()); + if (it != delete_file_index.end()) { + refs.push_back(it->second); + } + } + if (!refs.empty()) { + task_json[kDeleteFileReferences] = std::move(refs); + } + } + tasks_json.push_back(std::move(task_json)); + } + if (!tasks_json.empty()) { + json[kFileScanTasks] = std::move(tasks_json); + } + + return json; +} + Status BaseScanTaskResponseFromJson( const nlohmann::json& json, BaseScanTaskResponse* response, const std::unordered_map>& @@ -853,6 +976,25 @@ Result FetchScanTasksResponseFromJson( return response; } +nlohmann::json ToJson(const PlanTableScanResponse& response) { + nlohmann::json json = BaseScanTaskResponseToJson(response); + json[kPlanStatus] = ToString(response.plan_status); + if (!response.plan_id.empty()) { + json[kPlanId] = response.plan_id; + } + return json; +} + +nlohmann::json ToJson(const FetchPlanningResultResponse& response) { + nlohmann::json json = BaseScanTaskResponseToJson(response); + json[kPlanStatus] = ToString(response.plan_status); + return json; +} + +nlohmann::json ToJson(const FetchScanTasksResponse& response) { + return BaseScanTaskResponseToJson(response); +} + #define ICEBERG_DEFINE_FROM_JSON(Model) \ template <> \ Result FromJson(const nlohmann::json& json) { \ diff --git a/src/iceberg/catalog/rest/json_serde_internal.h b/src/iceberg/catalog/rest/json_serde_internal.h index c7e42e479..0726d0afc 100644 --- a/src/iceberg/catalog/rest/json_serde_internal.h +++ b/src/iceberg/catalog/rest/json_serde_internal.h @@ -92,6 +92,8 @@ ICEBERG_REST_EXPORT Result FetchScanTasksResponseFromJso ICEBERG_REST_EXPORT Result ToJson(const PlanTableScanRequest& request); ICEBERG_REST_EXPORT nlohmann::json ToJson(const FetchScanTasksRequest& request); +ICEBERG_REST_EXPORT nlohmann::json ToJson(const DataFile& df); + ICEBERG_REST_EXPORT Result DataFileFromJson( const nlohmann::json& json, const std::unordered_map>& partition_spec_by_id, @@ -103,4 +105,8 @@ ICEBERG_REST_EXPORT Result>> FileScanT const std::unordered_map>& partition_spec_by_id, const Schema& schema); +ICEBERG_REST_EXPORT nlohmann::json ToJson(const PlanTableScanResponse& response); +ICEBERG_REST_EXPORT nlohmann::json ToJson(const FetchPlanningResultResponse& response); +ICEBERG_REST_EXPORT nlohmann::json ToJson(const FetchScanTasksResponse& response); + } // namespace iceberg::rest diff --git a/src/iceberg/test/rest_json_serde_test.cc b/src/iceberg/test/rest_json_serde_test.cc index c3038b7ac..4bc650e6f 100644 --- a/src/iceberg/test/rest_json_serde_test.cc +++ b/src/iceberg/test/rest_json_serde_test.cc @@ -1742,4 +1742,101 @@ TEST(FileScanTasksFromJsonTest, NotAnArray) { EXPECT_THAT(result, HasErrorMessage("non-array")); } +// --- Roundtrip tests --- + +TEST(DataFileRoundtripTest, RequiredFieldsOnly) { + DataFile df; + df.content = DataFile::Content::kData; + df.file_path = "s3://bucket/data/file.parquet"; + df.file_format = FileFormatType::kParquet; + df.file_size_in_bytes = 12345; + df.record_count = 100; + + auto json = ToJson(df); + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result.value(), df); +} + +TEST(DataFileRoundtripTest, WithOptionalFields) { + DataFile df; + df.content = DataFile::Content::kPositionDeletes; + df.file_path = "s3://bucket/deletes/pos.parquet"; + df.file_format = FileFormatType::kParquet; + df.file_size_in_bytes = 5000; + df.record_count = 50; + df.partition_spec_id = 1; + df.column_sizes = {{1, 1000}, {2, 2000}}; + df.value_counts = {{1, 100}, {2, 100}}; + df.null_value_counts = {{1, 0}}; + df.nan_value_counts = {{2, 5}}; + df.split_offsets = {0, 4096}; + df.sort_order_id = 0; + df.referenced_data_file = "s3://bucket/data/file.parquet"; + + auto json = ToJson(df); + auto result = DataFileFromJson(json, {}, Schema({}, 0)); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result.value(), df); +} + +TEST(FetchScanTasksResponseRoundtripTest, WithFileScanTasksAndDeleteFiles) { + auto json = nlohmann::json::parse(R"({ + "plan-tasks": [], + "delete-files": [ + { + "content": "position_deletes", + "file-path": "s3://bucket/deletes/delete.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 512, + "record-count": 5 + } + ], + "file-scan-tasks": [ + { + "data-file": { + "content": "data", + "file-path": "s3://bucket/data/file.parquet", + "file-format": "PARQUET", + "file-size-in-bytes": 12345, + "record-count": 100 + }, + "delete-file-references": [0] + } + ] + })"); + + auto result = FetchScanTasksResponseFromJson(json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result, IsOk()); + + auto roundtrip_json = ToJson(*result); + auto result2 = FetchScanTasksResponseFromJson(roundtrip_json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result2, IsOk()); + EXPECT_EQ(*result, *result2); +} + +TEST(PlanTableScanResponseRoundtripTest, SubmittedStatus) { + auto json = nlohmann::json::parse(R"({"status": "submitted", "plan-id": "abc-123"})"); + auto result = PlanTableScanResponseFromJson(json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result, IsOk()); + + auto roundtrip_json = ToJson(*result); + auto result2 = PlanTableScanResponseFromJson(roundtrip_json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result2, IsOk()); + EXPECT_EQ(*result, *result2); +} + +TEST(FetchPlanningResultResponseRoundtripTest, CompletedWithPlanTasks) { + auto json = + nlohmann::json::parse(R"({"status": "completed", "plan-tasks": ["task-1", "task-2"]})"); + auto result = FetchPlanningResultResponseFromJson(json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result, IsOk()); + + auto roundtrip_json = ToJson(*result); + auto result2 = + FetchPlanningResultResponseFromJson(roundtrip_json, EmptySpecs(), EmptySchema()); + ASSERT_THAT(result2, IsOk()); + EXPECT_EQ(*result, *result2); +} + } // namespace iceberg::rest From 5f7c8f8eb933a2948b3613d78e2d35664b95f9d8 Mon Sep 17 00:00:00 2001 From: Sandeep Gottimukkala Date: Sun, 17 May 2026 15:37:39 -0700 Subject: [PATCH 17/17] revert: remove leftover includes and constants from json_serde_internal.h and json_serde.cc --- src/iceberg/json_serde.cc | 28 ---------------------------- src/iceberg/json_serde_internal.h | 4 ---- 2 files changed, 32 deletions(-) diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index a88cf003c..2d8c22255 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -27,10 +27,7 @@ #include #include "iceberg/constants.h" -#include "iceberg/expression/json_serde_internal.h" -#include "iceberg/file_format.h" #include "iceberg/json_serde_internal.h" -#include "iceberg/manifest/manifest_entry.h" #include "iceberg/name_mapping.h" #include "iceberg/partition_field.h" #include "iceberg/partition_spec.h" @@ -43,7 +40,6 @@ #include "iceberg/table_metadata.h" #include "iceberg/table_properties.h" #include "iceberg/table_requirement.h" -#include "iceberg/table_scan.h" #include "iceberg/table_update.h" #include "iceberg/transform.h" #include "iceberg/type.h" @@ -229,30 +225,6 @@ constexpr std::string_view kRequirementAssertDefaultSortOrderID = constexpr std::string_view kLastAssignedFieldId = "last-assigned-field-id"; constexpr std::string_view kLastAssignedPartitionId = "last-assigned-partition-id"; -// FileScanTask / DataFile constants (kSpecId, kSortOrderId, and kFileSizeInBytes -// are already defined above) -constexpr std::string_view kDataFile = "data-file"; -constexpr std::string_view kDeleteFileReferences = "delete-file-references"; -constexpr std::string_view kResidualFilter = "residual-filter"; -constexpr std::string_view kContent = "content"; -constexpr std::string_view kFilePath = "file-path"; -constexpr std::string_view kFileFormat = "file-format"; -constexpr std::string_view kPartition = "partition"; -constexpr std::string_view kRecordCount = "record-count"; -constexpr std::string_view kColumnSizes = "column-sizes"; -constexpr std::string_view kValueCounts = "value-counts"; -constexpr std::string_view kNullValueCounts = "null-value-counts"; -constexpr std::string_view kNanValueCounts = "nan-value-counts"; -constexpr std::string_view kLowerBounds = "lower-bounds"; -constexpr std::string_view kUpperBounds = "upper-bounds"; -constexpr std::string_view kKeyMetadata = "key-metadata"; -constexpr std::string_view kSplitOffsets = "split-offsets"; -constexpr std::string_view kEqualityIds = "equality-ids"; -constexpr std::string_view kFirstRowId = "first-row-id"; -constexpr std::string_view kReferencedDataFile = "referenced-data-file"; -constexpr std::string_view kContentOffset = "content-offset"; -constexpr std::string_view kContentSizeInBytes = "content-size-in-bytes"; - } // namespace nlohmann::json ToJson(const SortField& sort_field) { diff --git a/src/iceberg/json_serde_internal.h b/src/iceberg/json_serde_internal.h index d67bb61eb..8699e3dd1 100644 --- a/src/iceberg/json_serde_internal.h +++ b/src/iceberg/json_serde_internal.h @@ -28,13 +28,9 @@ #include -#include "iceberg/manifest/manifest_entry.h" -#include "iceberg/partition_spec.h" #include "iceberg/result.h" -#include "iceberg/schema.h" #include "iceberg/statistics_file.h" #include "iceberg/table_metadata.h" -#include "iceberg/table_scan.h" #include "iceberg/type_fwd.h" namespace iceberg {