From 777ea8afe73f5a4ad3a14bce9c781e4653d8d0e4 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Mon, 10 Nov 2025 15:00:26 +0100 Subject: [PATCH 01/24] Add integration tests --- .github/workflows/integration_tests.yaml | 55 +++ CMakeLists.txt | 10 + ci/docker/integration.dockerfile | 42 ++ integration_tests/CMakeLists.txt | 130 ++++++ integration_tests/file_to_stream.cpp | 116 +++++ integration_tests/main.cpp | 2 + integration_tests/stream_to_file.cpp | 110 +++++ integration_tests/test_integration_tools.cpp | 455 +++++++++++++++++++ 8 files changed, 920 insertions(+) create mode 100644 .github/workflows/integration_tests.yaml create mode 100644 ci/docker/integration.dockerfile create mode 100644 integration_tests/CMakeLists.txt create mode 100644 integration_tests/file_to_stream.cpp create mode 100644 integration_tests/main.cpp create mode 100644 integration_tests/stream_to_file.cpp create mode 100644 integration_tests/test_integration_tools.cpp diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml new file mode 100644 index 0000000..23d75cc --- /dev/null +++ b/.github/workflows/integration_tests.yaml @@ -0,0 +1,55 @@ +name: Integration tests + +on: + workflow_dispatch: + pull_request: + push: + branches: [main] + +jobs: + build_integration_container_and_run_tests: + runs-on: ubuntu-22.04 + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Run sccache-cache + uses: mozilla-actions/sccache-action@v0.0.9 + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y libpthread-stubs0-dev libboost-thread-dev doctest-dev + + - name: Install specific version of tzdata + run: sudo apt-get install tzdata + + - name: Configure using CMake + run: | + cmake -G Ninja \ + -Bbuild \ + -DCMAKE_BUILD_TYPE:STRING=RELEASE \ + -DSPARROW_IPC_BUILD_INTEGRATION_TESTS=ON \ + -DFETCH_DEPENDENCIES_WITH_CMAKE=MISSING \ + -DSPARROW_IPC_BUILD_SHARED=ON + + - name: Build file_to_stream target + working-directory: build + run: cmake --build . --config Release --target file_to_stream + + - name: Build stream_to_file target + working-directory: build + run: cmake --build . --config Release --target stream_to_file + + - name: Build Docker image + run: docker build -t sparrow/integration-tests -f ci/docker/integration.dockerfile . 
+ + - name: Run Integration tests + run: | + docker run --rm \ + -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/RELEASE/ \ + -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_PRODUCER=true \ + -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_CONSUMER=true \ + -v ${{ github.workspace }}:/workspace \ + -w /arrow-integration sparrow/integration-tests \ + "/arrow-integration/ci/scripts/integration_arrow.sh /arrow-integration /build" diff --git a/CMakeLists.txt b/CMakeLists.txt index 47fab25..0dca62c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,6 +85,9 @@ MESSAGE(STATUS "šŸ”§ Build docs: ${SPARROW_IPC_BUILD_DOCS}") OPTION(SPARROW_IPC_BUILD_EXAMPLES "Build sparrow-ipc examples" OFF) MESSAGE(STATUS "šŸ”§ Build examples: ${SPARROW_IPC_BUILD_EXAMPLES}") +OPTION(SPARROW_IPC_BUILD_INTEGRATION_TESTS "Build sparrow-ipc integration tests" OFF) +MESSAGE(STATUS "šŸ”§ Build integration tests: ${SPARROW_IPC_BUILD_INTEGRATION_TESTS}") + # Code coverage # ============= OPTION(SPARROW_IPC_ENABLE_COVERAGE "Enable sparrow-ipc test coverage" OFF) @@ -284,6 +287,13 @@ if(SPARROW_IPC_BUILD_EXAMPLES) add_subdirectory(examples) endif() +# Integration tests +# ================= +if(SPARROW_IPC_BUILD_INTEGRATION_TESTS) + message(STATUS "šŸ”Ø Create integration tests targets") + add_subdirectory(integration_tests) +endif() + # Installation # ============ include(GNUInstallDirs) diff --git a/ci/docker/integration.dockerfile b/ci/docker/integration.dockerfile new file mode 100644 index 0000000..67e9061 --- /dev/null +++ b/ci/docker/integration.dockerfile @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
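# Editorial note: this image provides the Apache Arrow "archery" integration
# harness. The sparrow-ipc tools are built on the host by the workflow above
# and mounted into the container at /workspace when the tests run, which is
# why nothing from this repository is compiled inside the image itself.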

FROM apache/arrow-dev:amd64-conda-integration

ENV ARROW_USE_CCACHE=OFF \
    ARROW_CPP_EXE_PATH=/build/cpp/debug \
    BUILD_DOCS_CPP=OFF \
    ARROW_INTEGRATION_CPP=ON \
    ARROW_INTEGRATION_CSHARP=OFF \
    ARROW_INTEGRATION_GO=OFF \
    ARROW_INTEGRATION_JAVA=OFF \
    ARROW_INTEGRATION_JS=OFF \
    ARCHERY_INTEGRATION_WITH_NANOARROW="0" \
    ARCHERY_INTEGRATION_WITH_RUST="0"

RUN apt update

RUN apt install build-essential git -y

# Clone the arrow monorepo // TODO: change to the official repo
RUN git clone --depth 1 --branch archery_supports_external_libraries https://github.com/Alex-PLACET/arrow.git /arrow-integration --recurse-submodules

# Build all the integrations
RUN conda run --no-capture-output \
    /arrow-integration/ci/scripts/integration_arrow_build.sh \
    /arrow-integration \
    /build
diff --git a/integration_tests/CMakeLists.txt b/integration_tests/CMakeLists.txt
new file mode 100644
index 0000000..fb08d24
--- /dev/null
+++ b/integration_tests/CMakeLists.txt
@@ -0,0 +1,130 @@
cmake_minimum_required(VERSION 3.28)

# Create executable for file_to_stream integration test
add_executable(file_to_stream file_to_stream.cpp)

target_link_libraries(file_to_stream
    PRIVATE
        sparrow-ipc
        sparrow::sparrow
        sparrow::json_reader
)

set_target_properties(file_to_stream
    PROPERTIES
        CXX_STANDARD 20
        CXX_STANDARD_REQUIRED ON
        CXX_EXTENSIONS OFF
)

target_include_directories(file_to_stream
    PRIVATE
        ${CMAKE_SOURCE_DIR}/include
        ${CMAKE_BINARY_DIR}/generated
)

add_dependencies(file_to_stream generate_flatbuffers_headers)

# Create executable for stream_to_file integration test
add_executable(stream_to_file stream_to_file.cpp)

target_link_libraries(stream_to_file
    PRIVATE
        sparrow-ipc
        sparrow::sparrow
)

set_target_properties(stream_to_file
    PROPERTIES
        CXX_STANDARD 20
        CXX_STANDARD_REQUIRED ON
        CXX_EXTENSIONS OFF
)

target_include_directories(stream_to_file
    PRIVATE
        ${CMAKE_SOURCE_DIR}/include
        ${CMAKE_BINARY_DIR}/generated
)

add_dependencies(stream_to_file generate_flatbuffers_headers)

# Create test executable for integration tools
add_executable(test_integration_tools main.cpp test_integration_tools.cpp)

target_link_libraries(test_integration_tools
    PRIVATE
        sparrow-ipc
        sparrow::sparrow
        sparrow::json_reader
        doctest::doctest
        arrow-testing-data
)

target_compile_definitions(test_integration_tools
    PRIVATE
        INTEGRATION_TOOLS_DIR="${CMAKE_CURRENT_BINARY_DIR}"
)

set_target_properties(test_integration_tools
    PROPERTIES
        CXX_STANDARD 20
        CXX_STANDARD_REQUIRED ON
        CXX_EXTENSIONS OFF
)

target_include_directories(test_integration_tools
    PRIVATE
        ${CMAKE_SOURCE_DIR}/include
        ${CMAKE_BINARY_DIR}/generated
)

add_dependencies(test_integration_tools generate_flatbuffers_headers file_to_stream stream_to_file)

# Register with CTest
enable_testing()
add_test(NAME integration_tools_test COMMAND test_integration_tools)

# On Windows, copy required DLLs
if(WIN32)
    add_custom_command(
        TARGET file_to_stream POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
            "$<TARGET_FILE:sparrow-ipc>"
            "$<TARGET_FILE_DIR:file_to_stream>"
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
            "$<TARGET_FILE:sparrow::sparrow>"
            "$<TARGET_FILE_DIR:file_to_stream>"
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
            "$<TARGET_FILE:sparrow::json_reader>"
            "$<TARGET_FILE_DIR:file_to_stream>"
        COMMENT "Copying DLLs to file_to_stream executable directory"
    )

    add_custom_command(
        TARGET stream_to_file POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
            "$<TARGET_FILE:sparrow-ipc>"
            "$<TARGET_FILE_DIR:stream_to_file>"
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
            "$<TARGET_FILE:sparrow::sparrow>"
            "$<TARGET_FILE_DIR:stream_to_file>"
        COMMENT "Copying DLLs to stream_to_file executable directory"
    )

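    # Note: the copy commands rely on CMake generator expressions —
    # "$<TARGET_FILE:tgt>" resolves to the built DLL of a link dependency and
    # "$<TARGET_FILE_DIR:tgt>" to the consuming executable's output directory.
    # The exact expressions were garbled in this patch's transcription and are
    # reconstructed here to mirror each target's PRIVATE link libraries.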
    add_custom_command(
        TARGET test_integration_tools POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
            "$<TARGET_FILE:sparrow-ipc>"
            "$<TARGET_FILE_DIR:test_integration_tools>"
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
            "$<TARGET_FILE:sparrow::sparrow>"
            "$<TARGET_FILE_DIR:test_integration_tools>"
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
            "$<TARGET_FILE:sparrow::json_reader>"
            "$<TARGET_FILE_DIR:test_integration_tools>"
        COMMENT "Copying DLLs to test_integration_tools executable directory"
    )
endif()

set_target_properties(file_to_stream stream_to_file test_integration_tools PROPERTIES FOLDER "Integration Tests")
diff --git a/integration_tests/file_to_stream.cpp b/integration_tests/file_to_stream.cpp
new file mode 100644
index 0000000..5a4579a
--- /dev/null
+++ b/integration_tests/file_to_stream.cpp
@@ -0,0 +1,116 @@
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <vector>

#include <nlohmann/json.hpp>
#include <sparrow/record_batch.hpp>

#include "sparrow/json_reader/json_parser.hpp"

#include <sparrow_ipc/memory_output_stream.hpp>
#include <sparrow_ipc/serializer.hpp>

/**
 * @brief Reads a JSON file containing record batches and outputs the serialized Arrow IPC stream to stdout.
 *
 * This program takes a JSON file path as a command-line argument, parses the record batches
 * from the JSON data, serializes them into Arrow IPC stream format, and writes the binary
 * stream to stdout. The output can be redirected to a file or piped to another program.
 *
 * Usage: file_to_stream <json_file>
 *
 * @param argc Number of command-line arguments
 * @param argv Array of command-line arguments
 * @return EXIT_SUCCESS on success, EXIT_FAILURE on error
 */
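// Example invocation (illustrative only; the file names are hypothetical):
//
//   file_to_stream generated_primitive.json > generated_primitive.stream
//
// The stream is written to stdout in binary form, so the caller is expected to
// redirect or pipe it; on Windows, stdout would additionally need to be put
// into binary mode to avoid newline translation corrupting the stream.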
int main(int argc, char* argv[])
{
    // Check command-line arguments
    if (argc != 2)
    {
        std::cerr << "Usage: " << argv[0] << " <json_file>\n";
        std::cerr << "Reads a JSON file and outputs the serialized Arrow IPC stream to stdout.\n";
        return EXIT_FAILURE;
    }

    const std::filesystem::path json_path(argv[1]);

    try
    {
        // Check if the JSON file exists
        if (!std::filesystem::exists(json_path))
        {
            std::cerr << "Error: File not found: " << json_path << "\n";
            return EXIT_FAILURE;
        }

        // Open and parse the JSON file
        std::ifstream json_file(json_path);
        if (!json_file.is_open())
        {
            std::cerr << "Error: Could not open file: " << json_path << "\n";
            return EXIT_FAILURE;
        }

        nlohmann::json json_data;
        try
        {
            json_data = nlohmann::json::parse(json_file);
        }
        catch (const nlohmann::json::parse_error& e)
        {
            std::cerr << "Error: Failed to parse JSON file: " << e.what() << "\n";
            return EXIT_FAILURE;
        }
        json_file.close();

        // Get the number of batches
        if (!json_data.contains("batches") || !json_data["batches"].is_array())
        {
            std::cerr << "Error: JSON file does not contain a 'batches' array.\n";
            return EXIT_FAILURE;
        }

        const size_t num_batches = json_data["batches"].size();

        // Parse all record batches from JSON
        std::vector<sparrow::record_batch> record_batches;
        record_batches.reserve(num_batches);

        for (size_t batch_idx = 0; batch_idx < num_batches; ++batch_idx)
        {
            try
            {
                record_batches.emplace_back(
                    sparrow::json_reader::build_record_batch_from_json(json_data, batch_idx)
                );
            }
            catch (const std::exception& e)
            {
                std::cerr << "Error: Failed to build record batch " << batch_idx << ": " << e.what()
                          << "\n";
                return EXIT_FAILURE;
            }
        }

        // Serialize record batches to Arrow IPC stream format
        std::vector<uint8_t> stream_data;
        sparrow_ipc::memory_output_stream stream(stream_data);
        sparrow_ipc::serializer serializer(stream);

        serializer << record_batches << sparrow_ipc::end_stream;

        // Write the binary stream to stdout
        std::cout.write(reinterpret_cast<const char*>(stream_data.data()), stream_data.size());
        std::cout.flush();

        return EXIT_SUCCESS;
    }
    catch (const std::exception& e)
    {
        std::cerr << "Error: " << e.what() << "\n";
        return EXIT_FAILURE;
    }
}
diff --git a/integration_tests/main.cpp b/integration_tests/main.cpp
new file mode 100644
index 0000000..9522fa7
--- /dev/null
+++ b/integration_tests/main.cpp
@@ -0,0 +1,2 @@
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include "doctest/doctest.h"
diff --git a/integration_tests/stream_to_file.cpp b/integration_tests/stream_to_file.cpp
new file mode 100644
index 0000000..fd84e56
--- /dev/null
+++ b/integration_tests/stream_to_file.cpp
@@ -0,0 +1,110 @@
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <span>
#include <vector>

#include <sparrow_ipc/deserialize.hpp>
#include <sparrow_ipc/memory_output_stream.hpp>
#include <sparrow_ipc/serializer.hpp>

/**
 * @brief Reads an Arrow IPC stream from a file and writes it to another file.
 *
 * This program reads a binary Arrow IPC stream from an input file, deserializes it
 * to verify its validity, then re-serializes it and writes the result to the specified
 * output file. This ensures the output file contains a valid Arrow IPC stream.
 *
 * Usage: stream_to_file <input_file> <output_file>
 *
 * @param argc Number of command-line arguments
 * @param argv Array of command-line arguments
 * @return EXIT_SUCCESS on success, EXIT_FAILURE on error
 */
int main(int argc, char* argv[])
{
    // Check command-line arguments
    if (argc != 3)
    {
        std::cerr << "Usage: " << argv[0] << " <input_file> <output_file>\n";
        std::cerr << "Reads an Arrow IPC stream from a file and writes it to another file.\n";
        return EXIT_FAILURE;
    }

    const std::filesystem::path input_path(argv[1]);
    const std::filesystem::path output_path(argv[2]);

    try
    {
        // Check if the input file exists
        if (!std::filesystem::exists(input_path))
        {
            std::cerr << "Error: Input file not found: " << input_path << "\n";
            return EXIT_FAILURE;
        }

        // Read the entire stream from the input file
        std::ifstream input_file(input_path, std::ios::in | std::ios::binary);
        if (!input_file.is_open())
        {
            std::cerr << "Error: Could not open input file: " << input_path << "\n";
            return EXIT_FAILURE;
        }

        std::vector<uint8_t> input_stream_data(
            (std::istreambuf_iterator<char>(input_file)),
            std::istreambuf_iterator<char>()
        );
        input_file.close();

        if (input_stream_data.empty())
        {
            std::cerr << "Error: Input file is empty: " << input_path << "\n";
            return EXIT_FAILURE;
        }

        // Deserialize the stream to validate it and extract record batches
        std::vector<sparrow::record_batch> record_batches;
        try
        {
            record_batches = sparrow_ipc::deserialize_stream(std::span(input_stream_data));
        }
        catch (const std::exception& e)
        {
            std::cerr << "Error: Failed to deserialize stream: " << e.what() << "\n";
            return EXIT_FAILURE;
        }

        // Re-serialize the record batches to ensure a valid output stream
        std::vector<uint8_t> output_stream_data;
        sparrow_ipc::memory_output_stream stream(output_stream_data);
        sparrow_ipc::serializer serializer(stream);

        serializer << record_batches << sparrow_ipc::end_stream;

        // Write the stream to the output file
        std::ofstream output_file(output_path, std::ios::out | std::ios::binary);
        if (!output_file.is_open())
        {
            std::cerr << "Error: Could not open output file: " << output_path << "\n";
            return EXIT_FAILURE;
        }

        output_file.write(reinterpret_cast<const char*>(output_stream_data.data()), output_stream_data.size());
        output_file.close();

        if (!output_file.good())
        {
            std::cerr << "Error: Failed to write to output file: " << output_path << "\n";
            return EXIT_FAILURE;
        }

        return EXIT_SUCCESS;
    }
    catch (const std::exception& e)
    {
        std::cerr << "Error: " << e.what() << "\n";
        return EXIT_FAILURE;
    }
}
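// Design note: stream_to_file round-trips the data through
// deserialize_stream/serializer rather than copying bytes, so the output is
// always a stream produced by sparrow-ipc itself; it may therefore differ
// byte-for-byte from the input while carrying identical record batches.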
diff --git a/integration_tests/test_integration_tools.cpp b/integration_tests/test_integration_tools.cpp
new file mode 100644
index 0000000..4056d95
--- /dev/null
+++ b/integration_tests/test_integration_tools.cpp
@@ -0,0 +1,455 @@
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

#include <nlohmann/json.hpp>

#include <sparrow/record_batch.hpp>

#include "sparrow/json_reader/json_parser.hpp"

#include "doctest/doctest.h"
#include "sparrow_ipc/deserialize.hpp"

// Helper function to execute a command and capture output
struct CommandResult
{
    int exit_code;
    std::string stdout_data;
    std::string stderr_data;
};

#ifdef _WIN32
#include <cstdio>

CommandResult execute_command(const std::string& command)
{
    CommandResult result;

    // Create temporary files for stdout and stderr
    const std::string stdout_file = std::tmpnam(nullptr);
    const std::string stderr_file = std::tmpnam(nullptr);

    const std::string full_command = command + " > " + stdout_file + " 2> " + stderr_file;

    result.exit_code = std::system(full_command.c_str());

    // Read stdout
    std::ifstream stdout_stream(stdout_file, std::ios::binary);
    if (stdout_stream)
    {
        std::ostringstream ss;
        ss << stdout_stream.rdbuf();
        result.stdout_data = ss.str();
    }

    // Read stderr
    std::ifstream stderr_stream(stderr_file, std::ios::binary);
    if (stderr_stream)
    {
        std::ostringstream ss;
        ss << stderr_stream.rdbuf();
        result.stderr_data = ss.str();
    }

    // Clean up
    std::filesystem::remove(stdout_file);
    std::filesystem::remove(stderr_file);

    return result;
}

#else
#include <ctime>
#include <sys/wait.h>

CommandResult execute_command(const std::string& command)
{
    CommandResult result;

    // Check if command already contains output redirection
    const bool has_redirection = (command.find('>') != std::string::npos);

    if (has_redirection)
    {
        // Command already has redirection, execute as-is
        // But we still want to capture stderr for error checking
        const std::filesystem::path stderr_file = std::filesystem::temp_directory_path() / ("stderr_" + std::to_string(std::time(nullptr)));
        const std::string full_command = command + " 2> " + stderr_file.string();
        result.exit_code = std::system(full_command.c_str());

        // Read stderr
        std::ifstream stderr_stream(stderr_file, std::ios::binary);
        if (stderr_stream)
        {
            std::ostringstream ss;
            ss << stderr_stream.rdbuf();
            result.stderr_data = ss.str();
        }

        // Clean up
        std::filesystem::remove(stderr_file);
    }
    else
    {
        // Create temporary files for stdout and stderr
        const std::filesystem::path stdout_file = std::filesystem::temp_directory_path() / ("stdout_" + std::to_string(std::time(nullptr)));
        const std::filesystem::path stderr_file = std::filesystem::temp_directory_path() / ("stderr_" + std::to_string(std::time(nullptr)));

        // The command string is already properly formed (executable path + args)
        // We need to redirect stdout and stderr to files
        const std::string full_command = command + " > " + stdout_file.string() + " 2> " + stderr_file.string();

        result.exit_code = std::system(full_command.c_str());

        // Read stdout
        std::ifstream stdout_stream(stdout_file, std::ios::binary);
        if (stdout_stream)
        {
            std::ostringstream ss;
            ss << stdout_stream.rdbuf();
            result.stdout_data = ss.str();
        }

        // Read stderr
        std::ifstream stderr_stream(stderr_file, std::ios::binary);
        if (stderr_stream)
        {
            std::ostringstream ss;
            ss << stderr_stream.rdbuf();
            result.stderr_data = ss.str();
        }

        // Clean up
        std::filesystem::remove(stdout_file);
        std::filesystem::remove(stderr_file);
    }

    return result;
}
#endif

// Helper to compare record batches
void compare_record_batches(
    const std::vector<sparrow::record_batch>& record_batches_1,
    const std::vector<sparrow::record_batch>& record_batches_2
)
{
    REQUIRE_EQ(record_batches_1.size(), record_batches_2.size());
    for (size_t i = 0; i < record_batches_1.size(); ++i)
    {
        REQUIRE_EQ(record_batches_1[i].nb_columns(), record_batches_2[i].nb_columns());
        for (size_t y = 0; y < record_batches_1[i].nb_columns(); y++)
        {
            const auto& column_1 = record_batches_1[i].get_column(y);
            const auto& column_2 = record_batches_2[i].get_column(y);
            REQUIRE_EQ(column_1.size(), column_2.size());
            CHECK_EQ(record_batches_1[i].names()[y], record_batches_2[i].names()[y]);
            for (size_t z = 0; z < column_1.size(); z++)
            {
                const auto col_name = column_1.name().value_or("NA");
                INFO("Comparing batch " << i << ", column " << y << " named: " << col_name << ", row " << z);
                REQUIRE_EQ(column_1.data_type(), column_2.data_type());
                CHECK_EQ(column_1[z], column_2[z]);
            }
        }
    }
}

TEST_SUITE("Integration Tools Tests")
{
    // Get paths to test data
    const std::filesystem::path arrow_testing_data_dir = ARROW_TESTING_DATA_DIR;
    const std::filesystem::path tests_resources_files_path =
        arrow_testing_data_dir / "data" / "arrow-ipc-stream" / "integration" / "cpp-21.0.0";

    // Paths to the executables - defined at compile time
    const std::filesystem::path exe_dir = INTEGRATION_TOOLS_DIR;
    const std::filesystem::path file_to_stream_exe = exe_dir / "file_to_stream";
    const std::filesystem::path stream_to_file_exe = exe_dir / "stream_to_file";

    // Helper to build command with properly quoted executable
    auto make_command = [](const std::filesystem::path& exe, const std::string& args = "") {
        std::string cmd = "\"" + exe.string() + "\"";
        if (!args.empty()) {
            cmd += " " + args;
        }
        return cmd;
    };

    TEST_CASE("file_to_stream - No arguments")
    {
        auto result = execute_command(make_command(file_to_stream_exe));
        CHECK_NE(result.exit_code, 0);
        CHECK(result.stderr_data.find("Usage:") != std::string::npos);
    }

    TEST_CASE("file_to_stream - Non-existent file")
    {
        const std::string non_existent = "non_existent_file_12345.json";
        auto result = execute_command(make_command(file_to_stream_exe, non_existent));
        CHECK_NE(result.exit_code, 0);
        CHECK(result.stderr_data.find("not found") != std::string::npos);
    }

    TEST_CASE("stream_to_file - No arguments")
    {
        auto result = execute_command(make_command(stream_to_file_exe));
        CHECK_NE(result.exit_code, 0);
        CHECK(result.stderr_data.find("Usage:") != std::string::npos);
    }

    TEST_CASE("stream_to_file - Only one argument")
    {
        auto result = execute_command(make_command(stream_to_file_exe, "output.stream"));
        CHECK_NE(result.exit_code, 0);
        CHECK(result.stderr_data.find("Usage:") != std::string::npos);
    }

    TEST_CASE("stream_to_file - Non-existent input file")
    {
        const std::string non_existent = "non_existent_file_12345.stream";
        const std::string output_file = "output.stream";
        auto result = execute_command(make_command(stream_to_file_exe, non_existent + " " + output_file));
        CHECK_NE(result.exit_code, 0);
        CHECK(result.stderr_data.find("not found") != std::string::npos);
    }

    TEST_CASE("file_to_stream - Convert JSON to stream")
    {
        // Test with a known good JSON file
        const std::filesystem::path json_file = tests_resources_files_path / "generated_primitive.json";

        if (!std::filesystem::exists(json_file))
        {
            MESSAGE("Skipping test: test file not found at " << json_file);
            return;
        }

        const std::filesystem::path output_stream = std::filesystem::temp_directory_path() / "test_output.stream";

        // Execute file_to_stream
        const std::string command = "\"" + file_to_stream_exe.string() + "\" \"" + json_file.string() + "\" > \"" + output_stream.string() + "\"";
        auto result = execute_command(command);

        CHECK_EQ(result.exit_code, 0);
        CHECK(std::filesystem::exists(output_stream));
        CHECK_GT(std::filesystem::file_size(output_stream), 0);

        // Verify the output is a valid stream by deserializing it
        std::ifstream stream_file(output_stream, std::ios::binary);
        REQUIRE(stream_file.is_open());

        std::vector<uint8_t> stream_data(
            (std::istreambuf_iterator<char>(stream_file)),
            std::istreambuf_iterator<char>()
        );
        stream_file.close();

        // Should be able to deserialize without errors
        CHECK_NOTHROW(sparrow_ipc::deserialize_stream(std::span(stream_data)));

        // Clean up
        std::filesystem::remove(output_stream);
    }

    TEST_CASE("stream_to_file - Process stream file")
    {
        const std::filesystem::path input_stream = tests_resources_files_path / "generated_primitive.stream";

        if (!std::filesystem::exists(input_stream))
        {
            MESSAGE("Skipping test: test file not found at " << input_stream);
            return;
        }

        const std::filesystem::path output_stream = std::filesystem::temp_directory_path() / "test_stream_output.stream";

        // Execute stream_to_file
        const std::string command = "\"" + stream_to_file_exe.string() + "\" \"" + input_stream.string() + "\" \"" + output_stream.string() + "\"";
        auto result = execute_command(command);

        CHECK_EQ(result.exit_code, 0);
        CHECK(std::filesystem::exists(output_stream));
        CHECK_GT(std::filesystem::file_size(output_stream), 0);

        // Verify the output is a valid stream
        std::ifstream output_file(output_stream, std::ios::binary);
        REQUIRE(output_file.is_open());

        std::vector<uint8_t> output_data(
            (std::istreambuf_iterator<char>(output_file)),
            std::istreambuf_iterator<char>()
        );
        output_file.close();

        CHECK_NOTHROW(sparrow_ipc::deserialize_stream(std::span(output_data)));

        // Clean up
        std::filesystem::remove(output_stream);
    }

    TEST_CASE("Round-trip: JSON -> stream -> file -> deserialize")
    {
        const std::filesystem::path json_file = tests_resources_files_path / "generated_primitive.json";

        if (!std::filesystem::exists(json_file))
        {
            MESSAGE("Skipping test: test file not found at " << json_file);
            return;
        }

        const std::filesystem::path intermediate_stream = std::filesystem::temp_directory_path() / "intermediate.stream";
        const std::filesystem::path final_stream = std::filesystem::temp_directory_path() / "final.stream";

        // Step 1: JSON -> stream
        {
            const std::string command = "\"" + file_to_stream_exe.string() + "\" \"" + json_file.string() + "\" > \"" + intermediate_stream.string() + "\"";
            auto result = execute_command(command);
            REQUIRE_EQ(result.exit_code, 0);
            REQUIRE(std::filesystem::exists(intermediate_stream));
        }

        // Step 2: stream -> file
        {
            const std::string command = "\"" + stream_to_file_exe.string() + "\" \"" + intermediate_stream.string() + "\" \"" + final_stream.string() + "\"";
            auto result = execute_command(command);
            REQUIRE_EQ(result.exit_code, 0);
            REQUIRE(std::filesystem::exists(final_stream));
        }

        // Step 3: Compare the results
        // Load original JSON data
        std::ifstream json_input(json_file);
        REQUIRE(json_input.is_open());
        nlohmann::json json_data = nlohmann::json::parse(json_input);
        json_input.close();

        const size_t num_batches = json_data["batches"].size();
        std::vector<sparrow::record_batch> original_batches;
        for (size_t i = 0; i < num_batches; ++i)
        {
            original_batches.emplace_back(
                sparrow::json_reader::build_record_batch_from_json(json_data, i)
            );
        }

        // Load final stream
        std::ifstream final_file(final_stream, std::ios::binary);
        REQUIRE(final_file.is_open());
        std::vector<uint8_t> final_data(
            (std::istreambuf_iterator<char>(final_file)),
            std::istreambuf_iterator<char>()
        );
        final_file.close();

        auto final_batches = sparrow_ipc::deserialize_stream(std::span(final_data));

        // Compare
        compare_record_batches(original_batches, final_batches);

        // Clean up
        std::filesystem::remove(intermediate_stream);
        std::filesystem::remove(final_stream);
    }

    TEST_CASE("Paths with spaces")
    {
        const std::filesystem::path json_file = tests_resources_files_path / "generated_primitive.json";

        if (!std::filesystem::exists(json_file))
        {
            MESSAGE("Skipping test: test file not found at " << json_file);
            return;
        }

        // Create temporary directory with spaces in the name
        const std::filesystem::path temp_dir = std::filesystem::temp_directory_path() / "test dir with spaces";
        std::filesystem::create_directories(temp_dir);

        const std::filesystem::path output_stream = temp_dir / "output file.stream";
        const std::filesystem::path final_stream = temp_dir / "final output.stream";

        // Step 1: JSON -> stream with spaces in output path
        {
            const std::string command = "\"" + file_to_stream_exe.string() + "\" \"" + json_file.string() + "\" > \"" + output_stream.string() + "\"";
            auto result = execute_command(command);
            CHECK_EQ(result.exit_code, 0);
            CHECK(std::filesystem::exists(output_stream));
        }

        // Step 2: stream -> file with spaces in both paths
        {
            const std::string command = "\"" + stream_to_file_exe.string() + "\" \"" + output_stream.string() + "\" \"" + final_stream.string() + "\"";
            auto result = execute_command(command);
            CHECK_EQ(result.exit_code, 0);
            CHECK(std::filesystem::exists(final_stream));
        }

        // Verify the final output is valid
        std::ifstream final_file(final_stream, std::ios::binary);
        REQUIRE(final_file.is_open());
        std::vector<uint8_t> final_data(
            (std::istreambuf_iterator<char>(final_file)),
            std::istreambuf_iterator<char>()
        );
        final_file.close();

        CHECK_NOTHROW(sparrow_ipc::deserialize_stream(std::span(final_data)));

        // Clean up
        std::filesystem::remove_all(temp_dir);
    }

    TEST_CASE("Multiple test files")
    {
        const std::vector<std::string> test_files = {
            "generated_primitive",
            "generated_binary",
            "generated_primitive_zerolength",
            "generated_binary_zerolength"
        };

        for (const auto& test_file : test_files)
        {
            const std::filesystem::path json_file = tests_resources_files_path / (test_file + ".json");

            if (!std::filesystem::exists(json_file))
            {
                MESSAGE("Skipping test file: " << json_file);
                continue;
            }

            SUBCASE(test_file.c_str())
            {
                const std::filesystem::path output_stream = std::filesystem::temp_directory_path() / (test_file + "_output.stream");

                // Convert JSON to stream
                const std::string command = "\"" + file_to_stream_exe.string() + "\" \"" + json_file.string() + "\" > \"" + output_stream.string() + "\"";
                auto result = execute_command(command);

                CHECK_EQ(result.exit_code, 0);
                CHECK(std::filesystem::exists(output_stream));

                // Deserialize and verify
                std::ifstream stream_file(output_stream, std::ios::binary);
                if (stream_file.is_open())
                {
                    std::vector<uint8_t> stream_data(
                        (std::istreambuf_iterator<char>(stream_file)),
                        std::istreambuf_iterator<char>()
                    );
                    stream_file.close();

                    CHECK_NOTHROW(sparrow_ipc::deserialize_stream(std::span(stream_data)));
                }

                // Clean up
                std::filesystem::remove(output_stream);
            }
        }
    }
}

From d803f10854292ce30d1365482193ce33209aa36e Mon Sep 17 00:00:00 2001
From: Alexis Placet
Date: Mon, 10 Nov 2025 15:17:13 +0100
Subject: [PATCH 02/24] try fix

---
 .github/workflows/integration_tests.yaml | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml
index 23d75cc..efa933c 100644
--- a/.github/workflows/integration_tests.yaml
+++ b/.github/workflows/integration_tests.yaml
@@ -8,21 +8,17 @@ on:
 
 jobs:
   build_integration_container_and_run_tests:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     steps:
       - name: Checkout code
-        uses: actions/checkout@v2
-
-      - name: Run sccache-cache
-        uses: mozilla-actions/sccache-action@v0.0.9
-
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y libpthread-stubs0-dev libboost-thread-dev doctest-dev
-
-      - name: Install specific version of tzdata
-        run: sudo apt-get install tzdata
+        uses: actions/checkout@v5
+
+      - name: Create build environment
+        uses: mamba-org/setup-micromamba@v2
+        with:
+          environment-file: ./environment-dev.yml
+          environment-name: build_env
+          cache-environment: true
 
       - name: Configure using CMake
         run: |

From ec770f43c1c336ebb414e4788067e916124816eb Mon Sep 17 00:00:00 2001
From: Alexis Placet
Date: Mon, 10 Nov 2025 15:26:14 +0100
Subject: [PATCH 03/24] fix

---
 cmake/external_dependencies.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/external_dependencies.cmake b/cmake/external_dependencies.cmake
index 897b48f..1e3a2eb 100644
--- a/cmake/external_dependencies.cmake
+++ b/cmake/external_dependencies.cmake
@@ -68,7 +68,7 @@ function(find_package_or_fetch)
 endfunction()
 
 set(SPARROW_BUILD_SHARED ${SPARROW_IPC_BUILD_SHARED})
-if(${SPARROW_IPC_BUILD_TESTS})
+if(${SPARROW_IPC_BUILD_TESTS} OR ${SPARROW_IPC_BUILD_INTEGRATION_TESTS})
   set(CREATE_JSON_READER_TARGET ON)
 endif()
 find_package_or_fetch(
@@ -81,7 +81,7 @@ unset(CREATE_JSON_READER_TARGET)
 if(NOT TARGET sparrow::sparrow)
   add_library(sparrow::sparrow ALIAS sparrow)
 endif()
-if(${SPARROW_IPC_BUILD_TESTS})
+if(${SPARROW_IPC_BUILD_TESTS} OR ${SPARROW_IPC_BUILD_INTEGRATION_TESTS})
   find_package_or_fetch(
     PACKAGE_NAME sparrow-json-reader
   )

From 53bdd50e5301e36e653e747f0c07958df589136c Mon Sep 17 00:00:00 2001
From: Alexis Placet
Date: Mon, 10 Nov 2025 15:39:59 +0100
Subject: [PATCH 04/24] fix

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0dca62c..c4ddbd2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,8 +11,6 @@ include(CMakeDependentOption)
 list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
 message(DEBUG "CMake module path: ${CMAKE_MODULE_PATH}")
 
-include(external_dependencies)
-
 set(SPARROW_IPC_COMPILE_DEFINITIONS "" CACHE STRING "List of public compile definitions of the sparrow-ipc target")
 
 set(SPARROW_IPC_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
@@ -93,6 +91,8 @@ MESSAGE(STATUS "šŸ”§ Build integration tests: ${SPARROW_IPC_BUILD_INTEGRATION_TE
 OPTION(SPARROW_IPC_ENABLE_COVERAGE "Enable sparrow-ipc test coverage" OFF)
 MESSAGE(STATUS "šŸ”§ Enable coverage: ${SPARROW_IPC_ENABLE_COVERAGE}")
 
+include(external_dependencies)
+
 if(SPARROW_IPC_ENABLE_COVERAGE)
   include(code_coverage)
 endif()

From 
c5e56bcd6985979c6c8d6210cb89e70c85217412 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Mon, 10 Nov 2025 15:55:13 +0100 Subject: [PATCH 05/24] fix --- cmake/external_dependencies.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_dependencies.cmake b/cmake/external_dependencies.cmake index 1e3a2eb..56a4054 100644 --- a/cmake/external_dependencies.cmake +++ b/cmake/external_dependencies.cmake @@ -123,7 +123,7 @@ if(NOT TARGET lz4::lz4) add_library(lz4::lz4 ALIAS lz4) endif() -if(SPARROW_IPC_BUILD_TESTS) +if(${SPARROW_IPC_BUILD_TESTS} OR ${SPARROW_IPC_BUILD_INTEGRATION_TESTS}) find_package_or_fetch( PACKAGE_NAME doctest GIT_REPOSITORY https://github.com/doctest/doctest.git From a7c54a9d16d8c0db2cb885c2d4c3c61405e39057 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Wed, 12 Nov 2025 10:36:54 +0100 Subject: [PATCH 06/24] wip --- .github/workflows/integration_tests.yaml | 16 +- integration_tests/CMakeLists.txt | 198 +++++++--- ...to_stream.cpp => arrow_file_to_stream.cpp} | 2 +- integration_tests/arrow_json_to_file.cpp | 128 ++++++ ...m_to_file.cpp => arrow_stream_to_file.cpp} | 0 integration_tests/arrow_validate.cpp | 287 ++++++++++++++ integration_tests/test_integration_tools.cpp | 373 +++++++++++++++--- 7 files changed, 884 insertions(+), 120 deletions(-) rename integration_tests/{file_to_stream.cpp => arrow_file_to_stream.cpp} (98%) create mode 100644 integration_tests/arrow_json_to_file.cpp rename integration_tests/{stream_to_file.cpp => arrow_stream_to_file.cpp} (100%) create mode 100644 integration_tests/arrow_validate.cpp diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml index efa933c..4ae0132 100644 --- a/.github/workflows/integration_tests.yaml +++ b/.github/workflows/integration_tests.yaml @@ -29,13 +29,21 @@ jobs: -DFETCH_DEPENDENCIES_WITH_CMAKE=MISSING \ -DSPARROW_IPC_BUILD_SHARED=ON - - name: Build file_to_stream target + - name: Build arrow_file_to_stream target working-directory: build - run: cmake --build . --config Release --target file_to_stream + run: cmake --build . --config Release --target arrow_file_to_stream - - name: Build stream_to_file target + - name: Build arrow_stream_to_file target working-directory: build - run: cmake --build . --config Release --target stream_to_file + run: cmake --build . --config Release --target arrow_stream_to_file + + - name: Build arrow_json_to_file target + working-directory: build + run: cmake --build . --config Release --target arrow_json_to_file + + - name: Build arrow_validate target + working-directory: build + run: cmake --build . --config Release --target arrow_validate - name: Build Docker image run: docker build -t sparrow/integration-tests -f ci/docker/integration.dockerfile . 
diff --git a/integration_tests/CMakeLists.txt b/integration_tests/CMakeLists.txt index fb08d24..5555c4c 100644 --- a/integration_tests/CMakeLists.txt +++ b/integration_tests/CMakeLists.txt @@ -1,85 +1,135 @@ cmake_minimum_required(VERSION 3.28) -# Create executable for file_to_stream integration test -add_executable(file_to_stream file_to_stream.cpp) +# Create executable for arrow_file_to_stream integration test +add_executable(arrow_file_to_stream arrow_file_to_stream.cpp) -target_link_libraries(file_to_stream +target_link_libraries(arrow_file_to_stream PRIVATE - sparrow-ipc - sparrow::sparrow - sparrow::json_reader + sparrow-ipc + sparrow::sparrow + sparrow::json_reader ) -set_target_properties(file_to_stream +set_target_properties(arrow_file_to_stream PROPERTIES - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CXX_EXTENSIONS OFF + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF ) -target_include_directories(file_to_stream +target_include_directories(arrow_file_to_stream PRIVATE - ${CMAKE_SOURCE_DIR}/include - ${CMAKE_BINARY_DIR}/generated + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_BINARY_DIR}/generated ) -add_dependencies(file_to_stream generate_flatbuffers_headers) +add_dependencies(arrow_file_to_stream generate_flatbuffers_headers) -# Create executable for stream_to_file integration test -add_executable(stream_to_file stream_to_file.cpp) +# Create executable for arrow_stream_to_file integration test +add_executable(arrow_stream_to_file arrow_stream_to_file.cpp) -target_link_libraries(stream_to_file +target_link_libraries(arrow_stream_to_file PRIVATE - sparrow-ipc - sparrow::sparrow + sparrow-ipc + sparrow::sparrow ) -set_target_properties(stream_to_file +set_target_properties(arrow_stream_to_file PROPERTIES - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CXX_EXTENSIONS OFF + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF ) -target_include_directories(stream_to_file +target_include_directories(arrow_stream_to_file PRIVATE - ${CMAKE_SOURCE_DIR}/include - ${CMAKE_BINARY_DIR}/generated + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_BINARY_DIR}/generated ) -add_dependencies(stream_to_file generate_flatbuffers_headers) +add_dependencies(arrow_stream_to_file generate_flatbuffers_headers) + +# Create executable for arrow_json_to_file integration test +add_executable(arrow_json_to_file arrow_json_to_file.cpp) + +target_link_libraries(arrow_json_to_file + PRIVATE + sparrow-ipc + sparrow::sparrow + sparrow::json_reader +) + +set_target_properties(arrow_json_to_file + PROPERTIES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF +) + +target_include_directories(arrow_json_to_file + PRIVATE + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_BINARY_DIR}/generated +) + +add_dependencies(arrow_json_to_file generate_flatbuffers_headers) + +# Create executable for arrow_validate integration test +add_executable(arrow_validate arrow_validate.cpp) + +target_link_libraries(arrow_validate + PRIVATE + sparrow-ipc + sparrow::sparrow + sparrow::json_reader +) + +set_target_properties(arrow_validate + PROPERTIES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF +) + +target_include_directories(arrow_validate + PRIVATE + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_BINARY_DIR}/generated +) + +add_dependencies(arrow_validate generate_flatbuffers_headers) # Create test executable for integration tools add_executable(test_integration_tools main.cpp test_integration_tools.cpp) target_link_libraries(test_integration_tools PRIVATE - sparrow-ipc - sparrow::sparrow - 
sparrow::json_reader
-    doctest::doctest
-    arrow-testing-data
+        sparrow-ipc
+        sparrow::sparrow
+        sparrow::json_reader
+        doctest::doctest
+        arrow-testing-data
 )
 
 target_compile_definitions(test_integration_tools
     PRIVATE
-    INTEGRATION_TOOLS_DIR="${CMAKE_CURRENT_BINARY_DIR}"
+        INTEGRATION_TOOLS_DIR="${CMAKE_CURRENT_BINARY_DIR}"
 )
 
 set_target_properties(test_integration_tools
     PROPERTIES
-    CXX_STANDARD 20
-    CXX_STANDARD_REQUIRED ON
-    CXX_EXTENSIONS OFF
+        CXX_STANDARD 20
+        CXX_STANDARD_REQUIRED ON
+        CXX_EXTENSIONS OFF
 )
 
 target_include_directories(test_integration_tools
     PRIVATE
-    ${CMAKE_SOURCE_DIR}/include
-    ${CMAKE_BINARY_DIR}/generated
+        ${CMAKE_SOURCE_DIR}/include
+        ${CMAKE_BINARY_DIR}/generated
 )
 
-add_dependencies(test_integration_tools generate_flatbuffers_headers file_to_stream stream_to_file)
+add_dependencies(test_integration_tools generate_flatbuffers_headers arrow_file_to_stream arrow_stream_to_file arrow_json_to_file arrow_validate)
 
 # Register with CTest
 enable_testing()
@@ -88,43 +138,71 @@ add_test(NAME integration_tools_test COMMAND test_integration_tools)
 # On Windows, copy required DLLs
 if(WIN32)
     add_custom_command(
-        TARGET file_to_stream POST_BUILD
+        TARGET arrow_file_to_stream POST_BUILD
         COMMAND ${CMAKE_COMMAND} -E copy_if_different
-            "$<TARGET_FILE:sparrow-ipc>"
-            "$<TARGET_FILE_DIR:file_to_stream>"
+            "$<TARGET_FILE:sparrow-ipc>"
+            "$<TARGET_FILE_DIR:arrow_file_to_stream>"
         COMMAND ${CMAKE_COMMAND} -E copy_if_different
-            "$<TARGET_FILE:sparrow::sparrow>"
-            "$<TARGET_FILE_DIR:file_to_stream>"
+            "$<TARGET_FILE:sparrow::sparrow>"
+            "$<TARGET_FILE_DIR:arrow_file_to_stream>"
         COMMAND ${CMAKE_COMMAND} -E copy_if_different
-            "$<TARGET_FILE:sparrow::json_reader>"
-            "$<TARGET_FILE_DIR:file_to_stream>"
-        COMMENT "Copying DLLs to file_to_stream executable directory"
+            "$<TARGET_FILE:sparrow::json_reader>"
+            "$<TARGET_FILE_DIR:arrow_file_to_stream>"
+        COMMENT "Copying DLLs to arrow_file_to_stream executable directory"
     )
 
     add_custom_command(
-        TARGET stream_to_file POST_BUILD
+        TARGET arrow_stream_to_file POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+            "$<TARGET_FILE:sparrow-ipc>"
+            "$<TARGET_FILE_DIR:arrow_stream_to_file>"
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+            "$<TARGET_FILE:sparrow::sparrow>"
+            "$<TARGET_FILE_DIR:arrow_stream_to_file>"
+        COMMENT "Copying DLLs to arrow_stream_to_file executable directory"
+    )
+
+    add_custom_command(
+        TARGET arrow_json_to_file POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+            "$<TARGET_FILE:sparrow-ipc>"
+            "$<TARGET_FILE_DIR:arrow_json_to_file>"
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+            "$<TARGET_FILE:sparrow::sparrow>"
+            "$<TARGET_FILE_DIR:arrow_json_to_file>"
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+            "$<TARGET_FILE:sparrow::json_reader>"
+            "$<TARGET_FILE_DIR:arrow_json_to_file>"
+        COMMENT "Copying DLLs to arrow_json_to_file executable directory"
+    )
+
+    add_custom_command(
+        TARGET arrow_validate POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+            "$<TARGET_FILE:sparrow-ipc>"
+            "$<TARGET_FILE_DIR:arrow_validate>"
         COMMAND ${CMAKE_COMMAND} -E copy_if_different
-            "$<TARGET_FILE:sparrow-ipc>"
-            "$<TARGET_FILE_DIR:stream_to_file>"
+            "$<TARGET_FILE:sparrow::sparrow>"
+            "$<TARGET_FILE_DIR:arrow_validate>"
         COMMAND ${CMAKE_COMMAND} -E copy_if_different
-            "$<TARGET_FILE:sparrow::sparrow>"
-            "$<TARGET_FILE_DIR:stream_to_file>"
-        COMMENT "Copying DLLs to stream_to_file executable directory"
+            "$<TARGET_FILE:sparrow::json_reader>"
+            "$<TARGET_FILE_DIR:arrow_validate>"
+        COMMENT "Copying DLLs to arrow_validate executable directory"
     )
 
     add_custom_command(
         TARGET test_integration_tools POST_BUILD
         COMMAND ${CMAKE_COMMAND} -E copy_if_different
-            "$<TARGET_FILE:sparrow-ipc>"
-            "$<TARGET_FILE_DIR:test_integration_tools>"
+            "$<TARGET_FILE:sparrow-ipc>"
+            "$<TARGET_FILE_DIR:test_integration_tools>"
         COMMAND ${CMAKE_COMMAND} -E copy_if_different
-            "$<TARGET_FILE:sparrow::sparrow>"
-            "$<TARGET_FILE_DIR:test_integration_tools>"
+            "$<TARGET_FILE:sparrow::sparrow>"
+            "$<TARGET_FILE_DIR:test_integration_tools>"
         COMMAND ${CMAKE_COMMAND} -E copy_if_different
-            "$<TARGET_FILE:sparrow::json_reader>"
-            "$<TARGET_FILE_DIR:test_integration_tools>"
+            "$<TARGET_FILE:sparrow::json_reader>"
+            "$<TARGET_FILE_DIR:test_integration_tools>"
         COMMENT "Copying DLLs to test_integration_tools executable directory"
     )
 endif()
 
-set_target_properties(file_to_stream stream_to_file test_integration_tools PROPERTIES FOLDER "Integration Tests")
+set_target_properties(arrow_file_to_stream arrow_stream_to_file arrow_json_to_file arrow_validate test_integration_tools PROPERTIES FOLDER "Integration Tests")
diff --git a/integration_tests/file_to_stream.cpp b/integration_tests/arrow_file_to_stream.cpp
similarity index 98%
rename from integration_tests/file_to_stream.cpp
rename to integration_tests/arrow_file_to_stream.cpp
index 5a4579a..e321f59 100644
--- a/integration_tests/file_to_stream.cpp
+++ b/integration_tests/arrow_file_to_stream.cpp
@@ -19,7 +19,7 @@
  * from the JSON data, serializes them into Arrow IPC stream format, and writes the binary
  * stream to stdout. The output can be redirected to a file or piped to another program.
  *
- * Usage: file_to_stream <json_file>
+ * Usage: arrow_file_to_stream <json_file>
  *
  * @param argc Number of command-line arguments
  * @param argv Array of command-line arguments
diff --git a/integration_tests/arrow_json_to_file.cpp b/integration_tests/arrow_json_to_file.cpp
new file mode 100644
index 0000000..35f39a2
--- /dev/null
+++ b/integration_tests/arrow_json_to_file.cpp
@@ -0,0 +1,128 @@
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+#include <nlohmann/json.hpp>
+#include <sparrow/record_batch.hpp>
+#include <sparrow/json_reader/json_parser.hpp>
+
+#include <sparrow_ipc/memory_output_stream.hpp>
+#include <sparrow_ipc/serializer.hpp>
+
+/**
+ * @brief Reads a JSON file containing record batches and writes the serialized Arrow IPC stream to a file.
+ *
+ * This program takes a JSON file path and an output file path as command-line arguments,
+ * parses the record batches from the JSON data, serializes them into Arrow IPC stream format,
+ * and writes the binary stream to the specified output file.
+ *
+ * Usage: arrow_json_to_file <json_file> <output_file>
+ *
+ * @param argc Number of command-line arguments
+ * @param argv Array of command-line arguments
+ * @return EXIT_SUCCESS on success, EXIT_FAILURE on error
+ */
+int main(int argc, char* argv[])
+{
+    // Check command-line arguments
+    if (argc != 3)
+    {
+        std::cerr << "Usage: " << argv[0] << " <json_file> <output_file>\n";
+        std::cerr << "Reads a JSON file and writes the serialized Arrow IPC stream to a file.\n";
+        return EXIT_FAILURE;
+    }
+
+    const std::filesystem::path json_path(argv[1]);
+    const std::filesystem::path output_path(argv[2]);
+
+    try
+    {
+        // Check if the JSON file exists
+        if (!std::filesystem::exists(json_path))
+        {
+            std::cerr << "Error: Input file not found: " << json_path << "\n";
+            return EXIT_FAILURE;
+        }
+
+        // Open and parse the JSON file
+        std::ifstream json_file(json_path);
+        if (!json_file.is_open())
+        {
+            std::cerr << "Error: Could not open input file: " << json_path << "\n";
+            return EXIT_FAILURE;
+        }
+
+        nlohmann::json json_data;
+        try
+        {
+            json_data = nlohmann::json::parse(json_file);
+        }
+        catch (const nlohmann::json::parse_error& e)
+        {
+            std::cerr << "Error: Failed to parse JSON file: " << e.what() << "\n";
+            return EXIT_FAILURE;
+        }
+        json_file.close();
+
+        // Get the number of batches
+        if (!json_data.contains("batches") || !json_data["batches"].is_array())
+        {
+            std::cerr << "Error: JSON file does not contain a 'batches' array.\n";
+            return EXIT_FAILURE;
+        }
+
+        const size_t num_batches = json_data["batches"].size();
+
+        // Parse all record batches from JSON
+        std::vector<sparrow::record_batch> record_batches;
+        record_batches.reserve(num_batches);
+
+        for (size_t batch_idx = 0; batch_idx < num_batches; ++batch_idx)
+        {
+            try
+            {
+                record_batches.emplace_back(
+                    sparrow::json_reader::build_record_batch_from_json(json_data, batch_idx)
+                );
+            }
+            catch (const std::exception& e)
+            {
+                std::cerr << "Error: Failed to build record batch " << batch_idx << ": " << e.what() << "\n";
+                return EXIT_FAILURE;
+            }
+        }
+
+        // Serialize record batches to Arrow IPC stream format
+        std::vector<uint8_t> stream_data;
+        sparrow_ipc::memory_output_stream stream(stream_data);
+        sparrow_ipc::serializer serializer(stream);
+
+        serializer << record_batches << sparrow_ipc::end_stream;
+
+        // Write the binary stream to the output file
+        std::ofstream output_file(output_path, std::ios::out | std::ios::binary);
+        if (!output_file.is_open())
+        {
+            std::cerr << "Error: Could not open output file: " << output_path << "\n";
+            return EXIT_FAILURE;
+        }
+
+        output_file.write(reinterpret_cast<const char*>(stream_data.data()), stream_data.size());
+        output_file.close();
+
+        if (!output_file.good())
+        {
+            std::cerr << "Error: Failed to write to output file: " << output_path << "\n";
+            return EXIT_FAILURE;
+        }
+
+        return EXIT_SUCCESS;
+    }
+    catch (const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << "\n";
+        return EXIT_FAILURE;
+    }
+}
diff --git a/integration_tests/stream_to_file.cpp b/integration_tests/arrow_stream_to_file.cpp
similarity index 100%
rename from integration_tests/stream_to_file.cpp
rename to integration_tests/arrow_stream_to_file.cpp
diff --git a/integration_tests/arrow_validate.cpp b/integration_tests/arrow_validate.cpp
new file mode 100644
index 0000000..4292e91
--- /dev/null
+++ b/integration_tests/arrow_validate.cpp
@@ -0,0 +1,287 @@
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <nlohmann/json.hpp>
+#include <sparrow/record_batch.hpp>
+
+#include "sparrow/json_reader/json_parser.hpp"
+
+#include <sparrow_ipc/deserialize.hpp>
+
+/**
+ * @brief Helper function to compare two record batches for equality.
+ *
+ * Compares the structure and data of two record batches element-by-element.
+ * Reports detailed error messages for any mismatches found.
+ *
+ * @param rb1 The first record batch to compare
+ * @param rb2 The second record batch to compare
+ * @param batch_idx The index of the batch being compared (for error reporting)
+ * @return true if the batches are identical, false otherwise
+ */
+bool compare_record_batch(
+    const sparrow::record_batch& rb1,
+    const sparrow::record_batch& rb2,
+    size_t batch_idx
+)
+{
+    bool all_match = true;
+
+    // Check number of columns
+    if (rb1.nb_columns() != rb2.nb_columns())
+    {
+        std::cerr << "Error: Batch " << batch_idx << " has different number of columns: " << rb1.nb_columns()
+                  << " vs " << rb2.nb_columns() << "\n";
+        return false;
+    }
+
+    // Check number of rows
+    if (rb1.nb_rows() != rb2.nb_rows())
+    {
+        std::cerr << "Error: Batch " << batch_idx << " has different number of rows: " << rb1.nb_rows()
+                  << " vs " << rb2.nb_rows() << "\n";
+        return false;
+    }
+
+    // Check column names
+    const auto& names1 = rb1.names();
+    const auto& names2 = rb2.names();
+    if (names1.size() != names2.size())
+    {
+        std::cerr << "Error: Batch " << batch_idx << " has different number of column names\n";
+        all_match = false;
+    }
+    else
+    {
+        for (size_t i = 0; i < names1.size(); ++i)
+        {
+            if (names1[i] != names2[i])
+            {
+                std::cerr << "Error: Batch " << batch_idx << " column " << i << " has different name: '"
+                          << names1[i] << "' vs '" << names2[i] << "'\n";
+                all_match = false;
+            }
+        }
+    }
+
+    // Check each column
+    for (size_t col_idx = 0; col_idx < rb1.nb_columns(); ++col_idx)
+    {
+        const auto& col1 = rb1.get_column(col_idx);
+        const auto& col2 = rb2.get_column(col_idx);
+
+        // Check column size
+        if (col1.size() != col2.size())
+        {
+            std::cerr << "Error: Batch " << batch_idx << ", column " << col_idx << " has different size: "
+                      << col1.size() << " vs " << col2.size() << "\n";
+            all_match = false;
+            continue;
+        }
+
+        // Check column data type
+        if (col1.data_type() != col2.data_type())
+        {
+            std::cerr << "Error: Batch " << batch_idx << ", column " << col_idx
+                      << " has different data type\n";
+            all_match = false;
+            continue;
+        }
+
+        // Check column name
+        const auto col_name1 = col1.name();
+        const auto col_name2 = col2.name();
+        if (col_name1 != col_name2)
+        {
+            std::cerr << "Warning: Batch " << batch_idx << ", column " << col_idx
+                      << " has different name in column metadata\n";
+        }
+
+        // Check each value in the column
+        for (size_t row_idx = 0; row_idx < col1.size(); ++row_idx)
+        {
+            if (col1[row_idx] != col2[row_idx])
+            {
+                std::cerr << "Error: Batch " << batch_idx << ", column " << col_idx << " ('"
+                          << col_name1.value_or("unnamed") << "'), row " << row_idx
+                          << " has different value\n";
+                std::cerr << "  JSON value:   " << col1[row_idx] << "\n";
+                std::cerr << "  Stream value: " << col2[row_idx] << "\n";
+                all_match = false;
+            }
+        }
+    }
+
+    return all_match;
+}
+
+/**
+ * @brief Validates that a JSON file and an Arrow stream file contain identical data.
+ *
+ * This program reads a JSON file containing Arrow record batches and an Arrow IPC
+ * stream file, converts both to vectors of record batches, and compares them
+ * element-by-element to ensure they are identical.
+ *
+ * Usage: arrow_validate <json_file> <stream_file>
+ *
+ * @param argc Number of command-line arguments
+ * @param argv Array of command-line arguments
+ * @return EXIT_SUCCESS if the files match, EXIT_FAILURE on error or mismatch
+ */
+int main(int argc, char* argv[])
+{
+    // Check command-line arguments
+    if (argc != 3)
+    {
+        std::cerr << "Usage: " << argv[0] << " <json_file> <stream_file>\n";
+        std::cerr << "Validates that a JSON file and an Arrow stream file contain identical data.\n";
+        return EXIT_FAILURE;
+    }
+
+    const std::filesystem::path json_path(argv[1]);
+    const std::filesystem::path stream_path(argv[2]);
+
+    try
+    {
+        // Check if the JSON file exists
+        if (!std::filesystem::exists(json_path))
+        {
+            std::cerr << "Error: JSON file not found: " << json_path << "\n";
+            return EXIT_FAILURE;
+        }
+
+        // Check if the stream file exists
+        if (!std::filesystem::exists(stream_path))
+        {
+            std::cerr << "Error: Stream file not found: " << stream_path << "\n";
+            return EXIT_FAILURE;
+        }
+
+        // Load and parse the JSON file
+        std::cout << "Loading JSON file: " << json_path << "\n";
+        std::ifstream json_file(json_path);
+        if (!json_file.is_open())
+        {
+            std::cerr << "Error: Could not open JSON file: " << json_path << "\n";
+            return EXIT_FAILURE;
+        }
+
+        nlohmann::json json_data;
+        try
+        {
+            json_data = nlohmann::json::parse(json_file);
+        }
+        catch (const nlohmann::json::parse_error& e)
+        {
+            std::cerr << "Error: Failed to parse JSON file: " << e.what() << "\n";
+            return EXIT_FAILURE;
+        }
+        json_file.close();
+
+        // Check for batches in JSON
+        if (!json_data.contains("batches") || !json_data["batches"].is_array())
+        {
+            std::cerr << "Error: JSON file does not contain a 'batches' array.\n";
+            return EXIT_FAILURE;
+        }
+
+        const size_t num_batches = json_data["batches"].size();
+        std::cout << "JSON file contains " << num_batches << " batch(es)\n";
+
+        // Parse all record batches from JSON
+        std::vector<sparrow::record_batch> json_batches;
+        json_batches.reserve(num_batches);
+
+        for (size_t batch_idx = 0; batch_idx < num_batches; ++batch_idx)
+        {
+            try
+            {
+                json_batches.emplace_back(
+                    sparrow::json_reader::build_record_batch_from_json(json_data, batch_idx)
+                );
+            }
+            catch (const std::exception& e)
+            {
+                std::cerr << "Error: Failed to build record batch " << batch_idx << " from JSON: "
+                          << e.what() << "\n";
+                return EXIT_FAILURE;
+            }
+        }
+
+        // Load and deserialize the stream file
+        std::cout << "Loading stream file: " << stream_path << "\n";
+        std::ifstream stream_file(stream_path, std::ios::in | std::ios::binary);
+        if (!stream_file.is_open())
+        {
+            std::cerr << "Error: Could not open stream file: " << stream_path << "\n";
+            return EXIT_FAILURE;
+        }
+
+        std::vector<uint8_t> stream_data(
+            (std::istreambuf_iterator<char>(stream_file)),
+            std::istreambuf_iterator<char>()
+        );
+        stream_file.close();
+
+        if (stream_data.empty())
+        {
+            std::cerr << "Error: Stream file is empty.\n";
+            return EXIT_FAILURE;
+        }
+
+        // Deserialize the stream
+        std::vector<sparrow::record_batch> stream_batches;
+        try
+        {
+            stream_batches = sparrow_ipc::deserialize_stream(std::span(stream_data));
+        }
+        catch (const std::exception& e)
+        {
+            std::cerr << "Error: Failed to deserialize stream: " << e.what() << "\n";
+            return EXIT_FAILURE;
+        }
+
+        std::cout << "Stream file contains " << stream_batches.size() << " batch(es)\n";
+
+        // Compare the number of batches
+        if (json_batches.size() != stream_batches.size())
+        {
+            std::cerr << "Error: Number of batches mismatch!\n";
+            std::cerr << "  JSON file:   " << json_batches.size() << " batch(es)\n";
+            std::cerr << "  Stream file: " << stream_batches.size() << " batch(es)\n";
+            return EXIT_FAILURE;
+        }
+
+        // Compare each batch
+        std::cout << "Comparing " << json_batches.size() << " batch(es)...\n";
+        bool all_match = true;
+        for (size_t batch_idx = 0; batch_idx < json_batches.size(); ++batch_idx)
+        {
+            std::cout << "  Comparing batch " << batch_idx << "...\n";
+            if (!compare_record_batch(json_batches[batch_idx], stream_batches[batch_idx], batch_idx))
+            {
+                all_match = false;
+            }
+        }
+
+        if (all_match)
+        {
+            std::cout << "\nāœ“ Validation successful: JSON and stream files contain identical data!\n";
+            return EXIT_SUCCESS;
+        }
+        else
+        {
+            std::cerr << "\nāœ— Validation failed: JSON and stream files contain different data.\n";
+            return EXIT_FAILURE;
+        }
+    }
+    catch (const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << "\n";
+        return EXIT_FAILURE;
+    }
+}
diff --git a/integration_tests/test_integration_tools.cpp b/integration_tests/test_integration_tools.cpp
index 4056d95..b0347fe 100644
--- a/integration_tests/test_integration_tools.cpp
+++ b/integration_tests/test_integration_tools.cpp
@@ -23,20 +23,20 @@ struct CommandResult
 };
 
 #ifdef _WIN32
-#include <cstdio>
+# include <cstdio>
 
 CommandResult execute_command(const std::string& command)
 {
     CommandResult result;
-    
+
     // Create temporary files for stdout and stderr
     const std::string stdout_file = std::tmpnam(nullptr);
     const std::string stderr_file = std::tmpnam(nullptr);
-    
+
     const std::string full_command = command + " > " + stdout_file + " 2> " + stderr_file;
-    
+
     result.exit_code = std::system(full_command.c_str());
-    
+
     // Read stdout
     std::ifstream stdout_stream(stdout_file, std::ios::binary);
     if (stdout_stream)
@@ -45,7 +45,7 @@ CommandResult execute_command(const std::string& command)
         ss << stdout_stream.rdbuf();
         result.stdout_data = ss.str();
     }
-    
+
     // Read stderr
     std::ifstream stderr_stream(stderr_file, std::ios::binary);
     if (stderr_stream)
@@ -54,33 +54,34 @@ CommandResult execute_command(const std::string& command)
         ss << stderr_stream.rdbuf();
         result.stderr_data = ss.str();
     }
-    
+
     // Clean up
     std::filesystem::remove(stdout_file);
    std::filesystem::remove(stderr_file);
-    
+
     return result;
 }
 
 #else
-#include <ctime>
-#include <sys/wait.h>
+# include <ctime>
+# include <sys/wait.h>
 
 CommandResult execute_command(const std::string& command)
 {
     CommandResult result;
-    
+
     // Check if command already contains output redirection
     const bool has_redirection = (command.find('>') != std::string::npos);
-    
+
     if (has_redirection)
     {
         // Command already has redirection, execute as-is
         // But we still want to capture stderr for error checking
-        const std::filesystem::path stderr_file = std::filesystem::temp_directory_path() / ("stderr_" + std::to_string(std::time(nullptr)));
+        const std::filesystem::path stderr_file = std::filesystem::temp_directory_path()
+                                                  / ("stderr_" + std::to_string(std::time(nullptr)));
         const std::string full_command = command + " 2> " + stderr_file.string();
         result.exit_code = std::system(full_command.c_str());
std::system(full_command.c_str()); - + // Read stderr std::ifstream stderr_stream(stderr_file, std::ios::binary); if (stderr_stream) @@ -89,22 +90,24 @@ CommandResult execute_command(const std::string& command) ss << stderr_stream.rdbuf(); result.stderr_data = ss.str(); } - + // Clean up std::filesystem::remove(stderr_file); } else { // Create temporary files for stdout and stderr - const std::filesystem::path stdout_file = std::filesystem::temp_directory_path() / ("stdout_" + std::to_string(std::time(nullptr))); - const std::filesystem::path stderr_file = std::filesystem::temp_directory_path() / ("stderr_" + std::to_string(std::time(nullptr))); - + const std::filesystem::path stdout_file = std::filesystem::temp_directory_path() + / ("stdout_" + std::to_string(std::time(nullptr))); + const std::filesystem::path stderr_file = std::filesystem::temp_directory_path() + / ("stderr_" + std::to_string(std::time(nullptr))); + // The command string is already properly formed (executable path + args) // We need to redirect stdout and stderr to files const std::string full_command = command + " > " + stdout_file.string() + " 2> " + stderr_file.string(); - + result.exit_code = std::system(full_command.c_str()); - + // Read stdout std::ifstream stdout_stream(stdout_file, std::ios::binary); if (stdout_stream) @@ -113,7 +116,7 @@ CommandResult execute_command(const std::string& command) ss << stdout_stream.rdbuf(); result.stdout_data = ss.str(); } - + // Read stderr std::ifstream stderr_stream(stderr_file, std::ios::binary); if (stderr_stream) @@ -122,12 +125,12 @@ CommandResult execute_command(const std::string& command) ss << stderr_stream.rdbuf(); result.stderr_data = ss.str(); } - + // Clean up std::filesystem::remove(stdout_file); std::filesystem::remove(stderr_file); } - + return result; } #endif @@ -163,18 +166,22 @@ TEST_SUITE("Integration Tools Tests") { // Get paths to test data const std::filesystem::path arrow_testing_data_dir = ARROW_TESTING_DATA_DIR; - const std::filesystem::path tests_resources_files_path = - arrow_testing_data_dir / "data" / "arrow-ipc-stream" / "integration" / "cpp-21.0.0"; - + const std::filesystem::path tests_resources_files_path = arrow_testing_data_dir / "data" / "arrow-ipc-stream" + / "integration" / "cpp-21.0.0"; + // Paths to the executables - defined at compile time const std::filesystem::path exe_dir = INTEGRATION_TOOLS_DIR; const std::filesystem::path file_to_stream_exe = exe_dir / "file_to_stream"; const std::filesystem::path stream_to_file_exe = exe_dir / "stream_to_file"; - + const std::filesystem::path json_to_file_exe = exe_dir / "json_to_file"; + const std::filesystem::path validate_exe = exe_dir / "validate"; + // Helper to build command with properly quoted executable - auto make_command = [](const std::filesystem::path& exe, const std::string& args = "") { + auto make_command = [](const std::filesystem::path& exe, const std::string& args = "") + { std::string cmd = "\"" + exe.string() + "\""; - if (!args.empty()) { + if (!args.empty()) + { cmd += " " + args; } return cmd; @@ -218,23 +225,95 @@ TEST_SUITE("Integration Tools Tests") CHECK(result.stderr_data.find("not found") != std::string::npos); } + TEST_CASE("json_to_file - No arguments") + { + auto result = execute_command(make_command(json_to_file_exe)); + CHECK_NE(result.exit_code, 0); + CHECK(result.stderr_data.find("Usage:") != std::string::npos); + } + + TEST_CASE("json_to_file - Only one argument") + { + const std::string json_file = "input.json"; + auto result = 
execute_command(make_command(json_to_file_exe, json_file)); + CHECK_NE(result.exit_code, 0); + CHECK(result.stderr_data.find("Usage:") != std::string::npos); + } + + TEST_CASE("json_to_file - Non-existent input file") + { + const std::string non_existent = "non_existent_file_12345.json"; + const std::string output_file = "output.stream"; + auto result = execute_command( + make_command(json_to_file_exe, "\"" + non_existent + "\" \"" + output_file + "\"") + ); + CHECK_NE(result.exit_code, 0); + CHECK(result.stderr_data.find("not found") != std::string::npos); + } + + TEST_CASE("validate - No arguments") + { + auto result = execute_command(make_command(validate_exe)); + CHECK_NE(result.exit_code, 0); + CHECK(result.stderr_data.find("Usage:") != std::string::npos); + } + + TEST_CASE("validate - Only one argument") + { + const std::string json_file = "input.json"; + auto result = execute_command(make_command(validate_exe, json_file)); + CHECK_NE(result.exit_code, 0); + CHECK(result.stderr_data.find("Usage:") != std::string::npos); + } + + TEST_CASE("validate - Non-existent JSON file") + { + const std::string non_existent_json = "non_existent_file_12345.json"; + const std::string stream_file = "existing.stream"; + auto result = execute_command( + make_command(validate_exe, "\"" + non_existent_json + "\" \"" + stream_file + "\"") + ); + CHECK_NE(result.exit_code, 0); + CHECK(result.stderr_data.find("not found") != std::string::npos); + } + + TEST_CASE("validate - Non-existent stream file") + { + const std::filesystem::path json_file = tests_resources_files_path / "generated_primitive.json"; + + if (!std::filesystem::exists(json_file)) + { + MESSAGE("Skipping test: test file not found at " << json_file); + return; + } + + const std::string non_existent_stream = "non_existent_file_12345.stream"; + auto result = execute_command( + make_command(validate_exe, "\"" + json_file.string() + "\" \"" + non_existent_stream + "\"") + ); + CHECK_NE(result.exit_code, 0); + CHECK(result.stderr_data.find("not found") != std::string::npos); + } + TEST_CASE("file_to_stream - Convert JSON to stream") { // Test with a known good JSON file const std::filesystem::path json_file = tests_resources_files_path / "generated_primitive.json"; - + if (!std::filesystem::exists(json_file)) { MESSAGE("Skipping test: test file not found at " << json_file); return; } - const std::filesystem::path output_stream = std::filesystem::temp_directory_path() / "test_output.stream"; - + const std::filesystem::path output_stream = std::filesystem::temp_directory_path() + / "test_output.stream"; + // Execute file_to_stream - const std::string command = "\"" + file_to_stream_exe.string() + "\" \"" + json_file.string() + "\" > \"" + output_stream.string() + "\""; + const std::string command = "\"" + file_to_stream_exe.string() + "\" \"" + json_file.string() + + "\" > \"" + output_stream.string() + "\""; auto result = execute_command(command); - + CHECK_EQ(result.exit_code, 0); CHECK(std::filesystem::exists(output_stream)); CHECK_GT(std::filesystem::file_size(output_stream), 0); @@ -242,7 +321,7 @@ TEST_SUITE("Integration Tools Tests") // Verify the output is a valid stream by deserializing it std::ifstream stream_file(output_stream, std::ios::binary); REQUIRE(stream_file.is_open()); - + std::vector stream_data( (std::istreambuf_iterator(stream_file)), std::istreambuf_iterator() @@ -259,19 +338,21 @@ TEST_SUITE("Integration Tools Tests") TEST_CASE("stream_to_file - Process stream file") { const std::filesystem::path input_stream = 
tests_resources_files_path / "generated_primitive.stream"; - + if (!std::filesystem::exists(input_stream)) { MESSAGE("Skipping test: test file not found at " << input_stream); return; } - const std::filesystem::path output_stream = std::filesystem::temp_directory_path() / "test_stream_output.stream"; - + const std::filesystem::path output_stream = std::filesystem::temp_directory_path() + / "test_stream_output.stream"; + // Execute stream_to_file - const std::string command = "\"" + stream_to_file_exe.string() + "\" \"" + input_stream.string() + "\" \"" + output_stream.string() + "\""; + const std::string command = "\"" + stream_to_file_exe.string() + "\" \"" + input_stream.string() + + "\" \"" + output_stream.string() + "\""; auto result = execute_command(command); - + CHECK_EQ(result.exit_code, 0); CHECK(std::filesystem::exists(output_stream)); CHECK_GT(std::filesystem::file_size(output_stream), 0); @@ -279,7 +360,7 @@ TEST_SUITE("Integration Tools Tests") // Verify the output is a valid stream std::ifstream output_file(output_stream, std::ios::binary); REQUIRE(output_file.is_open()); - + std::vector output_data( (std::istreambuf_iterator(output_file)), std::istreambuf_iterator() @@ -295,19 +376,21 @@ TEST_SUITE("Integration Tools Tests") TEST_CASE("Round-trip: JSON -> stream -> file -> deserialize") { const std::filesystem::path json_file = tests_resources_files_path / "generated_primitive.json"; - + if (!std::filesystem::exists(json_file)) { MESSAGE("Skipping test: test file not found at " << json_file); return; } - const std::filesystem::path intermediate_stream = std::filesystem::temp_directory_path() / "intermediate.stream"; + const std::filesystem::path intermediate_stream = std::filesystem::temp_directory_path() + / "intermediate.stream"; const std::filesystem::path final_stream = std::filesystem::temp_directory_path() / "final.stream"; // Step 1: JSON -> stream { - const std::string command = "\"" + file_to_stream_exe.string() + "\" \"" + json_file.string() + "\" > \"" + intermediate_stream.string() + "\""; + const std::string command = "\"" + file_to_stream_exe.string() + "\" \"" + json_file.string() + + "\" > \"" + intermediate_stream.string() + "\""; auto result = execute_command(command); REQUIRE_EQ(result.exit_code, 0); REQUIRE(std::filesystem::exists(intermediate_stream)); @@ -315,7 +398,8 @@ TEST_SUITE("Integration Tools Tests") // Step 2: stream -> file { - const std::string command = "\"" + stream_to_file_exe.string() + "\" \"" + intermediate_stream.string() + "\" \"" + final_stream.string() + "\""; + const std::string command = "\"" + stream_to_file_exe.string() + "\" \"" + + intermediate_stream.string() + "\" \"" + final_stream.string() + "\""; auto result = execute_command(command); REQUIRE_EQ(result.exit_code, 0); REQUIRE(std::filesystem::exists(final_stream)); @@ -332,9 +416,7 @@ TEST_SUITE("Integration Tools Tests") std::vector original_batches; for (size_t i = 0; i < num_batches; ++i) { - original_batches.emplace_back( - sparrow::json_reader::build_record_batch_from_json(json_data, i) - ); + original_batches.emplace_back(sparrow::json_reader::build_record_batch_from_json(json_data, i)); } // Load final stream @@ -356,10 +438,149 @@ TEST_SUITE("Integration Tools Tests") std::filesystem::remove(final_stream); } + TEST_CASE("json_to_file - Convert JSON to stream file") + { + const std::filesystem::path json_file = tests_resources_files_path / "generated_primitive.json"; + + if (!std::filesystem::exists(json_file)) + { + MESSAGE("Skipping test: test file not found at " 
<< json_file);
+            return;
+        }
+
+        const std::filesystem::path output_stream = std::filesystem::temp_directory_path()
+                                                    / "json_to_file_output.stream";
+
+        // Execute json_to_file
+        const std::string command = "\"" + json_to_file_exe.string() + "\" \"" + json_file.string() + "\" \""
+                                    + output_stream.string() + "\"";
+        auto result = execute_command(command);
+
+        CHECK_EQ(result.exit_code, 0);
+        CHECK(std::filesystem::exists(output_stream));
+        CHECK_GT(std::filesystem::file_size(output_stream), 0);
+
+        // Verify the output is a valid stream by deserializing it
+        std::ifstream stream_file(output_stream, std::ios::binary);
+        REQUIRE(stream_file.is_open());
+
+        std::vector<uint8_t> stream_data(
+            (std::istreambuf_iterator<char>(stream_file)),
+            std::istreambuf_iterator<char>()
+        );
+        stream_file.close();
+
+        // Should be able to deserialize without errors
+        auto deserialized_batches = sparrow_ipc::deserialize_stream(std::span(stream_data));
+        CHECK_GT(deserialized_batches.size(), 0);
+
+        // Clean up
+        std::filesystem::remove(output_stream);
+    }
+
+    TEST_CASE("validate - Successful validation of matching files")
+    {
+        const std::filesystem::path json_file = tests_resources_files_path / "generated_primitive.json";
+
+        if (!std::filesystem::exists(json_file))
+        {
+            MESSAGE("Skipping test: test file not found at " << json_file);
+            return;
+        }
+
+        // First, create a stream file from the JSON
+        const std::filesystem::path stream_file = std::filesystem::temp_directory_path()
+                                                  / "validate_test.stream";
+        {
+            const std::string command = "\"" + json_to_file_exe.string() + "\" \"" + json_file.string()
+                                        + "\" \"" + stream_file.string() + "\"";
+            auto result = execute_command(command);
+            REQUIRE_EQ(result.exit_code, 0);
+            REQUIRE(std::filesystem::exists(stream_file));
+        }
+
+        // Now validate that the JSON and stream match
+        {
+            const std::string command = "\"" + validate_exe.string() + "\" \"" + json_file.string() + "\" \""
+                                        + stream_file.string() + "\"";
+            auto result = execute_command(command);
+
+            CHECK_EQ(result.exit_code, 0);
+            const bool validation_success = result.stdout_data.find("Validation successful") != std::string::npos
+                                            || result.stdout_data.find("identical data") != std::string::npos;
+            CHECK(validation_success);
+        }
+
+        // Clean up
+        std::filesystem::remove(stream_file);
+    }
+
+    TEST_CASE("validate - Validation with reference stream file")
+    {
+        const std::filesystem::path json_file = tests_resources_files_path / "generated_primitive.json";
+        const std::filesystem::path stream_file = tests_resources_files_path / "generated_primitive.stream";
+
+        if (!std::filesystem::exists(json_file))
+        {
+            MESSAGE("Skipping test: JSON file not found at " << json_file);
+            return;
+        }
+
+        if (!std::filesystem::exists(stream_file))
+        {
+            MESSAGE("Skipping test: Stream file not found at " << stream_file);
+            return;
+        }
+
+        // Validate that the JSON and reference stream match
+        const std::string command = "\"" + validate_exe.string() + "\" \"" + json_file.string() + "\" \""
+                                    + stream_file.string() + "\"";
+        auto result = execute_command(command);
+
+        CHECK_EQ(result.exit_code, 0);
+        const bool validation_success = result.stdout_data.find("Validation successful") != std::string::npos
+                                        || result.stdout_data.find("identical data") != std::string::npos;
+        CHECK(validation_success);
+    }
+
+    TEST_CASE("json_to_file and validate - Round-trip with validation")
+    {
+        const std::filesystem::path json_file = tests_resources_files_path / "generated_binary.json";
+
+        if (!std::filesystem::exists(json_file))
+        {
+            MESSAGE("Skipping test: 
test file not found at " << json_file); + return; + } + + const std::filesystem::path stream_file = std::filesystem::temp_directory_path() + / "roundtrip_validate.stream"; + + // Step 1: Convert JSON to stream + { + const std::string command = "\"" + json_to_file_exe.string() + "\" \"" + json_file.string() + + "\" \"" + stream_file.string() + "\""; + auto result = execute_command(command); + REQUIRE_EQ(result.exit_code, 0); + REQUIRE(std::filesystem::exists(stream_file)); + } + + // Step 2: Validate the stream against the JSON + { + const std::string command = "\"" + validate_exe.string() + "\" \"" + json_file.string() + "\" \"" + + stream_file.string() + "\""; + auto result = execute_command(command); + CHECK_EQ(result.exit_code, 0); + } + + // Clean up + std::filesystem::remove(stream_file); + } + TEST_CASE("Paths with spaces") { const std::filesystem::path json_file = tests_resources_files_path / "generated_primitive.json"; - + if (!std::filesystem::exists(json_file)) { MESSAGE("Skipping test: test file not found at " << json_file); @@ -375,7 +596,8 @@ TEST_SUITE("Integration Tools Tests") // Step 1: JSON -> stream with spaces in output path { - const std::string command = "\"" + file_to_stream_exe.string() + "\" \"" + json_file.string() + "\" > \"" + output_stream.string() + "\""; + const std::string command = "\"" + file_to_stream_exe.string() + "\" \"" + json_file.string() + + "\" > \"" + output_stream.string() + "\""; auto result = execute_command(command); CHECK_EQ(result.exit_code, 0); CHECK(std::filesystem::exists(output_stream)); @@ -383,7 +605,8 @@ TEST_SUITE("Integration Tools Tests") // Step 2: stream -> file with spaces in both paths { - const std::string command = "\"" + stream_to_file_exe.string() + "\" \"" + output_stream.string() + "\" \"" + final_stream.string() + "\""; + const std::string command = "\"" + stream_to_file_exe.string() + "\" \"" + output_stream.string() + + "\" \"" + final_stream.string() + "\""; auto result = execute_command(command); CHECK_EQ(result.exit_code, 0); CHECK(std::filesystem::exists(final_stream)); @@ -400,6 +623,24 @@ TEST_SUITE("Integration Tools Tests") CHECK_NOTHROW(sparrow_ipc::deserialize_stream(std::span(final_data))); + // Step 3: Test json_to_file with spaces in paths + const std::filesystem::path json_to_file_output = temp_dir / "json to file output.stream"; + { + const std::string command = "\"" + json_to_file_exe.string() + "\" \"" + json_file.string() + + "\" \"" + json_to_file_output.string() + "\""; + auto result = execute_command(command); + CHECK_EQ(result.exit_code, 0); + CHECK(std::filesystem::exists(json_to_file_output)); + } + + // Step 4: Test validate with spaces in paths + { + const std::string command = "\"" + validate_exe.string() + "\" \"" + json_file.string() + "\" \"" + + json_to_file_output.string() + "\""; + auto result = execute_command(command); + CHECK_EQ(result.exit_code, 0); + } + // Clean up std::filesystem::remove_all(temp_dir); } @@ -416,7 +657,7 @@ TEST_SUITE("Integration Tools Tests") for (const auto& test_file : test_files) { const std::filesystem::path json_file = tests_resources_files_path / (test_file + ".json"); - + if (!std::filesystem::exists(json_file)) { MESSAGE("Skipping test file: " << json_file); @@ -425,12 +666,14 @@ TEST_SUITE("Integration Tools Tests") SUBCASE(test_file.c_str()) { - const std::filesystem::path output_stream = std::filesystem::temp_directory_path() / (test_file + "_output.stream"); + const std::filesystem::path output_stream = std::filesystem::temp_directory_path() + / 
(test_file + "_output.stream");
 
                 // Convert JSON to stream
-                const std::string command = "\"" + file_to_stream_exe.string() + "\" \"" + json_file.string() + "\" > \"" + output_stream.string() + "\"";
+                const std::string command = "\"" + file_to_stream_exe.string() + "\" \"" + json_file.string()
+                                            + "\" > \"" + output_stream.string() + "\"";
                 auto result = execute_command(command);
-                
+
                 CHECK_EQ(result.exit_code, 0);
                 CHECK(std::filesystem::exists(output_stream));
 
@@ -447,8 +690,28 @@ TEST_SUITE("Integration Tools Tests")
                     CHECK_NOTHROW(sparrow_ipc::deserialize_stream(std::span(stream_data)));
                 }
 
+                // Test json_to_file with the same file
+                const std::filesystem::path json_to_file_output = std::filesystem::temp_directory_path()
+                                                                  / (test_file + "_json_to_file.stream");
+                {
+                    const std::string cmd = "\"" + json_to_file_exe.string() + "\" \"" + json_file.string()
+                                            + "\" \"" + json_to_file_output.string() + "\"";
+                    auto res = execute_command(cmd);
+                    CHECK_EQ(res.exit_code, 0);
+                    CHECK(std::filesystem::exists(json_to_file_output));
+                }
+
+                // Test validate with the json_to_file output
+                {
+                    const std::string cmd = "\"" + validate_exe.string() + "\" \"" + json_file.string()
+                                            + "\" \"" + json_to_file_output.string() + "\"";
+                    auto res = execute_command(cmd);
+                    CHECK_EQ(res.exit_code, 0);
+                }
+
                 // Clean up
                 std::filesystem::remove(output_stream);
+                std::filesystem::remove(json_to_file_output);
             }
         }
     }
From 3fc2bfe8e1b32bf0d730b4ea256875b7a04386e7 Mon Sep 17 00:00:00 2001
From: Alexis Placet
Date: Wed, 12 Nov 2025 13:38:44 +0100
Subject: [PATCH 07/24] try fix

---
 .github/workflows/integration_tests.yaml |  2 +-
 integration_tests/README.md              | 91 ++++++++++++++++++++++++
 2 files changed, 92 insertions(+), 1 deletion(-)
 create mode 100644 integration_tests/README.md

diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml
index 4ae0132..ed82cf1 100644
--- a/.github/workflows/integration_tests.yaml
+++ b/.github/workflows/integration_tests.yaml
@@ -51,7 +51,7 @@ jobs:
       - name: Run Integration tests
         run: |
           docker run --rm \
-            -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/RELEASE/ \
+            -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/RELEASE/integration_tests \
             -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_PRODUCER=true \
             -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_CONSUMER=true \
             -v ${{ github.workspace }}:/workspace \
             -w /arrow-integration sparrow/integration-tests \
             "/arrow-integration/ci/scripts/integration_arrow.sh /arrow-integration /build"
diff --git a/integration_tests/README.md b/integration_tests/README.md
new file mode 100644
index 0000000..136fcda
--- /dev/null
+++ b/integration_tests/README.md
@@ -0,0 +1,91 @@
+# Integration Tests
+
+This directory contains integration test tools for `sparrow-ipc`.
+
+## Tools
+
+### `arrow_file_to_stream`
+
+Reads a JSON file containing Arrow record batches and outputs the serialized Arrow IPC stream to stdout.
+
+**Usage:**
+```bash
+./arrow_file_to_stream <json_file> > output.stream
+```
+
+**Example:**
+```bash
+# Convert a JSON file to an Arrow IPC stream file
+./arrow_file_to_stream input.json > output.stream
+```
+
+### `arrow_stream_to_file`
+
+Reads an Arrow IPC stream from a file and writes it to another file.
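+
+Both tools exit with a non-zero status on error, so they can be chained in scripts. To sanity-check a produced stream programmatically, you can deserialize it the same way the test suite does (a sketch; error handling omitted, and `deserialize_stream` throws on malformed input):
+
+```cpp
+// Read the produced stream back and let sparrow-ipc parse it.
+std::ifstream f("output.stream", std::ios::binary);
+std::vector<uint8_t> data((std::istreambuf_iterator<char>(f)), std::istreambuf_iterator<char>());
+auto batches = sparrow_ipc::deserialize_stream(std::span(data));
+```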
+
+**Usage:**
+```bash
+./arrow_stream_to_file <input_stream> <output_file>
+```
+
+**Example:**
+```bash
+# Read stream from file and write to another file
+./arrow_stream_to_file input.stream output.stream
+```
+
+## Round-trip Example
+
+You can chain these tools to perform a round-trip conversion:
+
+```bash
+# JSON -> Stream file -> Stream file (verify)
+./arrow_file_to_stream input.json > intermediate.stream
+./arrow_stream_to_file intermediate.stream final.stream
+```
+
+Or test a complete pipeline:
+
+```bash
+# Convert JSON to stream and process it
+./arrow_file_to_stream test.json > test.stream
+./arrow_stream_to_file test.stream test_verified.stream
+```
+
+## Building
+
+To build these integration tests, enable the `SPARROW_IPC_BUILD_INTEGRATION_TESTS` CMake option:
+
+```bash
+cmake -DSPARROW_IPC_BUILD_INTEGRATION_TESTS=ON ..
+cmake --build .
+```
+
+The executables will be built in your build directory under `integration_tests/`.
+
+## Testing
+
+The integration tests include automated tests that verify the functionality of both tools:
+
+### Running Tests
+
+```bash
+# Run the test suite
+./test_integration_tools
+
+# Or using CTest
+ctest -R integration_tools_test
+```
+
+### Test Coverage
+
+The test suite includes:
+- **Argument validation**: Tests for missing or incorrect arguments
+- **File not found**: Tests error handling for non-existent files
+- **Valid conversions**: Tests successful JSON to stream conversions
+- **Stream processing**: Tests valid stream file processing
+- **Round-trip testing**: Tests JSON → stream → file → deserialize pipeline
+- **Paths with spaces**: Tests handling of file paths containing spaces
+- **Multiple file types**: Tests various Arrow data types (primitive, binary, zero-length, etc.)
+
+All tests automatically use the Arrow testing data from the `ARROW_TESTING_DATA_DIR` if available.
From ed889e6cb8019a71d645a145640e58d7e583f700 Mon Sep 17 00:00:00 2001
From: Alexis Placet
Date: Wed, 12 Nov 2025 13:39:04 +0100
Subject: [PATCH 08/24] delete

---
 integration_tests/README.md | 91 ------------------------------------
 1 file changed, 91 deletions(-)
 delete mode 100644 integration_tests/README.md

diff --git a/integration_tests/README.md b/integration_tests/README.md
deleted file mode 100644
index 136fcda..0000000
--- a/integration_tests/README.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Integration Tests
-
-This directory contains integration test tools for `sparrow-ipc`.
-
-## Tools
-
-### `arrow_file_to_stream`
-
-Reads a JSON file containing Arrow record batches and outputs the serialized Arrow IPC stream to stdout.
-
-**Usage:**
-```bash
-./arrow_file_to_stream <json_file> > output.stream
-```
-
-**Example:**
-```bash
-# Convert a JSON file to an Arrow IPC stream file
-./arrow_file_to_stream input.json > output.stream
-```
-
-### `arrow_stream_to_file`
-
-Reads an Arrow IPC stream from a file and writes it to another file.
-
-**Usage:**
-```bash
-./arrow_stream_to_file <input_stream> <output_file>
-```
-
-**Example:**
-```bash
-# Read stream from file and write to another file
-./arrow_stream_to_file input.stream output.stream
-```
-
-## Round-trip Example
-
-You can chain these tools to perform a round-trip conversion:
-
-```bash
-# JSON -> Stream file -> Stream file (verify)
-./arrow_file_to_stream input.json > intermediate.stream
-./arrow_stream_to_file intermediate.stream final.stream
-```
-
-Or test a complete pipeline:
-
-```bash
-# Convert JSON to stream and process it
-./arrow_file_to_stream test.json > test.stream
-./arrow_stream_to_file test.stream test_verified.stream
-```
-
-## Building
-
-To build these integration tests, enable the `SPARROW_IPC_BUILD_INTEGRATION_TESTS` CMake option:
-
-```bash
-cmake -DSPARROW_IPC_BUILD_INTEGRATION_TESTS=ON ..
-cmake --build .
-```
-
-The executables will be built in your build directory under `integration_tests/`.
-
-## Testing
-
-The integration tests include automated tests that verify the functionality of both tools:
-
-### Running Tests
-
-```bash
-# Run the test suite
-./test_integration_tools
-
-# Or using CTest
-ctest -R integration_tools_test
-```
-
-### Test Coverage
-
-The test suite includes:
-- **Argument validation**: Tests for missing or incorrect arguments
-- **File not found**: Tests error handling for non-existent files
-- **Valid conversions**: Tests successful JSON to stream conversions
-- **Stream processing**: Tests valid stream file processing
-- **Round-trip testing**: Tests JSON → stream → file → deserialize pipeline
-- **Paths with spaces**: Tests handling of file paths containing spaces
-- **Multiple file types**: Tests various Arrow data types (primitive, binary, zero-length, etc.)
-
-All tests automatically use the Arrow testing data from the `ARROW_TESTING_DATA_DIR` if available.
From 21b14488da2cb53e4c937de2aaa67445d54b3a96 Mon Sep 17 00:00:00 2001
From: Alexis Placet
Date: Wed, 12 Nov 2025 13:54:19 +0100
Subject: [PATCH 09/24] try fix

---
 .github/workflows/integration_tests.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml
index ed82cf1..1a9aef9 100644
--- a/.github/workflows/integration_tests.yaml
+++ b/.github/workflows/integration_tests.yaml
@@ -51,7 +51,7 @@ jobs:
       - name: Run Integration tests
         run: |
           docker run --rm \
-            -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/RELEASE/integration_tests \
+            -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/RELEASE/integration_tests/ \
             -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_PRODUCER=true \
             -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_CONSUMER=true \
             -v ${{ github.workspace }}:/workspace \
             -w /arrow-integration sparrow/integration-tests \
             "/arrow-integration/ci/scripts/integration_arrow.sh /arrow-integration /build"
From a5834a0281bfc3f5f1e425319875a5b367b9dc85 Mon Sep 17 00:00:00 2001
From: Alexis Placet
Date: Wed, 12 Nov 2025 14:24:42 +0100
Subject: [PATCH 10/24] try

---
 .github/workflows/integration_tests.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml
index 1a9aef9..c724b2d 100644
--- a/.github/workflows/integration_tests.yaml
+++ b/.github/workflows/integration_tests.yaml
@@ -48,10 +48,13 @@ jobs:
       - name: Build Docker image
         run: docker build -t sparrow/integration-tests -f ci/docker/integration.dockerfile . 
+ - name: List all folders in workspace + run: ls -la ${{ github.workspace }} + - name: Run Integration tests run: | docker run --rm \ - -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/RELEASE/integration_tests/ \ + -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/Release/integration_tests/ \ -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_PRODUCER=true \ -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_CONSUMER=true \ -v ${{ github.workspace }}:/workspace \ From 60af5b132250883070f3206e030d895045bb2c2e Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Wed, 12 Nov 2025 14:46:32 +0100 Subject: [PATCH 11/24] try --- .github/workflows/integration_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml index c724b2d..5beee1f 100644 --- a/.github/workflows/integration_tests.yaml +++ b/.github/workflows/integration_tests.yaml @@ -49,7 +49,7 @@ jobs: run: docker build -t sparrow/integration-tests -f ci/docker/integration.dockerfile . - name: List all folders in workspace - run: ls -la ${{ github.workspace }} + run: ls -la ${{ github.workspace }}/build/bin/Release/ - name: Run Integration tests run: | From cae63f69bddcbb8f6154f4978f95f92eac88fea0 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Wed, 12 Nov 2025 15:16:37 +0100 Subject: [PATCH 12/24] wip --- .github/workflows/integration_tests.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml index 5beee1f..9cff702 100644 --- a/.github/workflows/integration_tests.yaml +++ b/.github/workflows/integration_tests.yaml @@ -45,12 +45,14 @@ jobs: working-directory: build run: cmake --build . --config Release --target arrow_validate + - name: List all folders and subfolders + run: | + echo "Listing all folders and subfolders:" + find . -type d + - name: Build Docker image run: docker build -t sparrow/integration-tests -f ci/docker/integration.dockerfile . 
- - name: List all folders in workspace - run: ls -la ${{ github.workspace }}/build/bin/Release/ - - name: Run Integration tests run: | docker run --rm \ From 3e43db3a694ec6f85b0258ec3cda71452b383d0d Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Wed, 12 Nov 2025 16:44:02 +0100 Subject: [PATCH 13/24] fix --- CMakeLists.txt | 11 ++++++ integration_tests/CMakeLists.txt | 61 ++++++++++++++++++++++++-------- 2 files changed, 57 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c4ddbd2..05e1a76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,17 @@ MESSAGE(STATUS "šŸ”§ Enable coverage: ${SPARROW_IPC_ENABLE_COVERAGE}") include(external_dependencies) +# Build +# ===== +set(BINARY_BUILD_DIR "${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE}") + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG "${BINARY_BUILD_DIR}") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE "${BINARY_BUILD_DIR}") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG "${BINARY_BUILD_DIR}") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE "${BINARY_BUILD_DIR}") +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_DEBUG "${BINARY_BUILD_DIR}") +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE "${BINARY_BUILD_DIR}") + if(SPARROW_IPC_ENABLE_COVERAGE) include(code_coverage) endif() diff --git a/integration_tests/CMakeLists.txt b/integration_tests/CMakeLists.txt index 5555c4c..00418e9 100644 --- a/integration_tests/CMakeLists.txt +++ b/integration_tests/CMakeLists.txt @@ -12,9 +12,16 @@ target_link_libraries(arrow_file_to_stream set_target_properties(arrow_file_to_stream PROPERTIES - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CXX_EXTENSIONS OFF + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + VERSION ${SPARROW_IPC_BINARY_VERSION} + SOVERSION ${SPARROW_IPC_BINARY_CURRENT} + FOLDER "integration_tests" + INSTALL_RPATH_USE_LINK_PATH TRUE + BUILD_WITH_INSTALL_RPATH FALSE + INSTALL_RPATH "$<$:$ORIGIN>$<$:@loader_path>" + BUILD_RPATH_USE_ORIGIN TRUE ) target_include_directories(arrow_file_to_stream @@ -36,9 +43,17 @@ target_link_libraries(arrow_stream_to_file set_target_properties(arrow_stream_to_file PROPERTIES - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CXX_EXTENSIONS OFF + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + VERSION ${SPARROW_IPC_BINARY_VERSION} + SOVERSION ${SPARROW_IPC_BINARY_CURRENT} + FOLDER integration_tests + # RPATH settings for Unix-like systems + INSTALL_RPATH_USE_LINK_PATH TRUE + BUILD_WITH_INSTALL_RPATH FALSE + INSTALL_RPATH "$<$:$ORIGIN>$<$:@loader_path>" + BUILD_RPATH_USE_ORIGIN TRUE ) target_include_directories(arrow_stream_to_file @@ -61,9 +76,17 @@ target_link_libraries(arrow_json_to_file set_target_properties(arrow_json_to_file PROPERTIES - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CXX_EXTENSIONS OFF + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + VERSION ${SPARROW_IPC_BINARY_VERSION} + SOVERSION ${SPARROW_IPC_BINARY_CURRENT} + FOLDER integration_tests + # RPATH settings for Unix-like systems + INSTALL_RPATH_USE_LINK_PATH TRUE + BUILD_WITH_INSTALL_RPATH FALSE + INSTALL_RPATH "$<$:$ORIGIN>$<$:@loader_path>" + BUILD_RPATH_USE_ORIGIN TRUE ) target_include_directories(arrow_json_to_file @@ -79,16 +102,24 @@ add_executable(arrow_validate arrow_validate.cpp) target_link_libraries(arrow_validate PRIVATE - sparrow-ipc - sparrow::sparrow - sparrow::json_reader + sparrow-ipc + sparrow::sparrow + sparrow::json_reader ) set_target_properties(arrow_validate PROPERTIES - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CXX_EXTENSIONS OFF + CXX_STANDARD 20 + 
CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + VERSION ${SPARROW_IPC_BINARY_VERSION} + SOVERSION ${SPARROW_IPC_BINARY_CURRENT} + FOLDER integration_tests + # RPATH settings for Unix-like systems + INSTALL_RPATH_USE_LINK_PATH TRUE + BUILD_WITH_INSTALL_RPATH FALSE + INSTALL_RPATH "$<$:$ORIGIN>$<$:@loader_path>" + BUILD_RPATH_USE_ORIGIN TRUE ) target_include_directories(arrow_validate From 9cca88916908c53940cdaf5c4e8bf8ebcce1d779 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Wed, 12 Nov 2025 16:59:14 +0100 Subject: [PATCH 14/24] fix --- .github/workflows/integration_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml index 9cff702..da10261 100644 --- a/.github/workflows/integration_tests.yaml +++ b/.github/workflows/integration_tests.yaml @@ -56,7 +56,7 @@ jobs: - name: Run Integration tests run: | docker run --rm \ - -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/Release/integration_tests/ \ + -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/Release/ \ -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_PRODUCER=true \ -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_CONSUMER=true \ -v ${{ github.workspace }}:/workspace \ From dad279f4170f8b57b4c5975259a91250cb7981ef Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Wed, 12 Nov 2025 17:11:01 +0100 Subject: [PATCH 15/24] fix --- .github/workflows/integration_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml index da10261..ab07a7c 100644 --- a/.github/workflows/integration_tests.yaml +++ b/.github/workflows/integration_tests.yaml @@ -56,7 +56,7 @@ jobs: - name: Run Integration tests run: | docker run --rm \ - -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/Release/ \ + -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/RELEASE/ \ -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_PRODUCER=true \ -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_CONSUMER=true \ -v ${{ github.workspace }}:/workspace \ From ac3820eb55a2e54fd7e91785b6f6f1a449170473 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Wed, 12 Nov 2025 17:43:16 +0100 Subject: [PATCH 16/24] fix --- .github/workflows/integration_tests.yaml | 96 +++++++++++++----------- 1 file changed, 52 insertions(+), 44 deletions(-) diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml index ab07a7c..161d0d2 100644 --- a/.github/workflows/integration_tests.yaml +++ b/.github/workflows/integration_tests.yaml @@ -8,57 +8,65 @@ on: jobs: build_integration_container_and_run_tests: - runs-on: ubuntu-24.04 + runs-on: ubuntu-22.04 steps: - - name: Checkout code - uses: actions/checkout@v5 + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y mold libpthread-stubs0-dev libboost-thread-dev doctest-dev - - name: Create build environment - uses: mamba-org/setup-micromamba@v2 - with: - environment-file: ./environment-dev.yml - environment-name: build_env - cache-environment: true + - name: Install specific version of tzdata + run: sudo apt-get install tzdata + + - name: Checkout code + uses: actions/checkout@v5 - - name: Configure using CMake - run: | - cmake -G Ninja \ - -Bbuild \ - -DCMAKE_BUILD_TYPE:STRING=RELEASE \ - -DSPARROW_IPC_BUILD_INTEGRATION_TESTS=ON \ - -DFETCH_DEPENDENCIES_WITH_CMAKE=MISSING \ - -DSPARROW_IPC_BUILD_SHARED=ON + - name: Create build environment 
+ uses: mamba-org/setup-micromamba@v2 + with: + environment-file: ./environment-dev.yml + environment-name: build_env + cache-environment: true - - name: Build arrow_file_to_stream target - working-directory: build - run: cmake --build . --config Release --target arrow_file_to_stream + - name: Configure using CMake + run: | + cmake -G Ninja \ + -Bbuild \ + -DCMAKE_BUILD_TYPE:STRING=RELEASE \ + -DSPARROW_IPC_BUILD_INTEGRATION_TESTS=ON \ + -DFETCH_DEPENDENCIES_WITH_CMAKE=MISSING \ + -DSPARROW_IPC_BUILD_SHARED=ON - - name: Build arrow_stream_to_file target - working-directory: build - run: cmake --build . --config Release --target arrow_stream_to_file + - name: Build arrow_file_to_stream target + working-directory: build + run: cmake --build . --config Release --target arrow_file_to_stream - - name: Build arrow_json_to_file target - working-directory: build - run: cmake --build . --config Release --target arrow_json_to_file + - name: Build arrow_stream_to_file target + working-directory: build + run: cmake --build . --config Release --target arrow_stream_to_file - - name: Build arrow_validate target - working-directory: build - run: cmake --build . --config Release --target arrow_validate + - name: Build arrow_json_to_file target + working-directory: build + run: cmake --build . --config Release --target arrow_json_to_file - - name: List all folders and subfolders - run: | - echo "Listing all folders and subfolders:" - find . -type d + - name: Build arrow_validate target + working-directory: build + run: cmake --build . --config Release --target arrow_validate - - name: Build Docker image - run: docker build -t sparrow/integration-tests -f ci/docker/integration.dockerfile . + - name: List all folders and subfolders + run: | + echo "Listing all folders and subfolders:" + find . -type d - - name: Run Integration tests - run: | - docker run --rm \ - -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/RELEASE/ \ - -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_PRODUCER=true \ - -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_CONSUMER=true \ - -v ${{ github.workspace }}:/workspace \ - -w /arrow-integration sparrow/integration-tests \ - "/arrow-integration/ci/scripts/integration_arrow.sh /arrow-integration /build" + - name: Build Docker image + run: docker build -t sparrow/integration-tests -f ci/docker/integration.dockerfile . 
+ + - name: Run Integration tests + run: | + docker run --rm \ + -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/RELEASE/ \ + -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_PRODUCER=true \ + -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_CONSUMER=true \ + -v ${{ github.workspace }}:/workspace \ + -w /arrow-integration sparrow/integration-tests \ + "/arrow-integration/ci/scripts/integration_arrow.sh /arrow-integration /build" From b416a8aac2dce13b438f17272f3fdbfeb9f7688d Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Wed, 12 Nov 2025 17:55:37 +0100 Subject: [PATCH 17/24] fix --- src/utils.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/utils.cpp b/src/utils.cpp index 2fc2490..73db136 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -1,5 +1,7 @@ #include "sparrow_ipc/utils.hpp" +#include + namespace sparrow_ipc::utils { std::optional parse_format(std::string_view format_str, std::string_view sep) From e9174490458eeea77037db66cbd9da3689a5f30b Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Wed, 12 Nov 2025 18:00:08 +0100 Subject: [PATCH 18/24] fix --- include/sparrow_ipc/memory_output_stream.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/sparrow_ipc/memory_output_stream.hpp b/include/sparrow_ipc/memory_output_stream.hpp index 27e2e06..a245bd3 100644 --- a/include/sparrow_ipc/memory_output_stream.hpp +++ b/include/sparrow_ipc/memory_output_stream.hpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace sparrow_ipc { From affc18dda1514f0b057e1104d4c5cd12901689c1 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Wed, 12 Nov 2025 18:06:20 +0100 Subject: [PATCH 19/24] fix --- integration_tests/arrow_validate.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration_tests/arrow_validate.cpp b/integration_tests/arrow_validate.cpp index 4292e91..77ba32a 100644 --- a/integration_tests/arrow_validate.cpp +++ b/integration_tests/arrow_validate.cpp @@ -109,8 +109,8 @@ bool compare_record_batch( std::cerr << "Error: Batch " << batch_idx << ", column " << col_idx << " ('" << col_name1.value_or("unnamed") << "'), row " << row_idx << " has different value\n"; - std::cerr << " JSON value: " << col1[row_idx] << "\n"; - std::cerr << " Stream value: " << col2[row_idx] << "\n"; + std::cerr << " JSON value: " << std::format("{}", col1[row_idx]) << "\n"; + std::cerr << " Stream value: " << std::format("{}", col2[row_idx]) << "\n"; all_match = false; } } From 4692e6abfbf4a98c6c02f71c14a5e62f13507dbf Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Thu, 13 Nov 2025 08:48:51 +0100 Subject: [PATCH 20/24] fix --- integration_tests/arrow_validate.cpp | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/integration_tests/arrow_validate.cpp b/integration_tests/arrow_validate.cpp index 77ba32a..69e6927 100644 --- a/integration_tests/arrow_validate.cpp +++ b/integration_tests/arrow_validate.cpp @@ -1,17 +1,18 @@ #include #include +#include #include #include #include #include #include +#include + #include #include "sparrow/json_reader/json_parser.hpp" -#include - /** * @brief Helper function to compare two record batches for equality. 
* @@ -23,11 +24,7 @@ * @param batch_idx The index of the batch being compared (for error reporting) * @return true if the batches are identical, false otherwise */ -bool compare_record_batch( - const sparrow::record_batch& rb1, - const sparrow::record_batch& rb2, - size_t batch_idx -) +bool compare_record_batch(const sparrow::record_batch& rb1, const sparrow::record_batch& rb2, size_t batch_idx) { bool all_match = true; @@ -77,8 +74,8 @@ bool compare_record_batch( // Check column size if (col1.size() != col2.size()) { - std::cerr << "Error: Batch " << batch_idx << ", column " << col_idx << " has different size: " - << col1.size() << " vs " << col2.size() << "\n"; + std::cerr << "Error: Batch " << batch_idx << ", column " << col_idx + << " has different size: " << col1.size() << " vs " << col2.size() << "\n"; all_match = false; continue; } @@ -86,8 +83,7 @@ bool compare_record_batch( // Check column data type if (col1.data_type() != col2.data_type()) { - std::cerr << "Error: Batch " << batch_idx << ", column " << col_idx - << " has different data type\n"; + std::cerr << "Error: Batch " << batch_idx << ", column " << col_idx << " has different data type\n"; all_match = false; continue; } @@ -206,8 +202,8 @@ int main(int argc, char* argv[]) } catch (const std::exception& e) { - std::cerr << "Error: Failed to build record batch " << batch_idx << " from JSON: " - << e.what() << "\n"; + std::cerr << "Error: Failed to build record batch " << batch_idx << " from JSON: " << e.what() + << "\n"; return EXIT_FAILURE; } } From b4605ed2ef66bb814104554f344a60399371cd3f Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Thu, 13 Nov 2025 08:58:51 +0100 Subject: [PATCH 21/24] wip --- integration_tests/arrow_validate.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/integration_tests/arrow_validate.cpp b/integration_tests/arrow_validate.cpp index 69e6927..34b7a0d 100644 --- a/integration_tests/arrow_validate.cpp +++ b/integration_tests/arrow_validate.cpp @@ -1,10 +1,12 @@ #include #include -#include #include #include #include #include +#if defined(__cpp_lib_format) +# include +#endif #include #include @@ -105,8 +107,10 @@ bool compare_record_batch(const sparrow::record_batch& rb1, const sparrow::recor std::cerr << "Error: Batch " << batch_idx << ", column " << col_idx << " ('" << col_name1.value_or("unnamed") << "'), row " << row_idx << " has different value\n"; +#if defined(__cpp_lib_format) std::cerr << " JSON value: " << std::format("{}", col1[row_idx]) << "\n"; std::cerr << " Stream value: " << std::format("{}", col2[row_idx]) << "\n"; +#endif all_match = false; } } From 3e5f06b2f04df483f7de00d6226bd24c25fd691f Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Thu, 13 Nov 2025 09:25:48 +0100 Subject: [PATCH 22/24] try fix --- integration_tests/CMakeLists.txt | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/integration_tests/CMakeLists.txt b/integration_tests/CMakeLists.txt index 00418e9..7fcd415 100644 --- a/integration_tests/CMakeLists.txt +++ b/integration_tests/CMakeLists.txt @@ -18,10 +18,6 @@ set_target_properties(arrow_file_to_stream VERSION ${SPARROW_IPC_BINARY_VERSION} SOVERSION ${SPARROW_IPC_BINARY_CURRENT} FOLDER "integration_tests" - INSTALL_RPATH_USE_LINK_PATH TRUE - BUILD_WITH_INSTALL_RPATH FALSE - INSTALL_RPATH "$<$:$ORIGIN>$<$:@loader_path>" - BUILD_RPATH_USE_ORIGIN TRUE ) target_include_directories(arrow_file_to_stream @@ -49,11 +45,6 @@ set_target_properties(arrow_stream_to_file VERSION ${SPARROW_IPC_BINARY_VERSION} SOVERSION 
${SPARROW_IPC_BINARY_CURRENT} FOLDER integration_tests - # RPATH settings for Unix-like systems - INSTALL_RPATH_USE_LINK_PATH TRUE - BUILD_WITH_INSTALL_RPATH FALSE - INSTALL_RPATH "$<$:$ORIGIN>$<$:@loader_path>" - BUILD_RPATH_USE_ORIGIN TRUE ) target_include_directories(arrow_stream_to_file @@ -82,11 +73,6 @@ set_target_properties(arrow_json_to_file VERSION ${SPARROW_IPC_BINARY_VERSION} SOVERSION ${SPARROW_IPC_BINARY_CURRENT} FOLDER integration_tests - # RPATH settings for Unix-like systems - INSTALL_RPATH_USE_LINK_PATH TRUE - BUILD_WITH_INSTALL_RPATH FALSE - INSTALL_RPATH "$<$:$ORIGIN>$<$:@loader_path>" - BUILD_RPATH_USE_ORIGIN TRUE ) target_include_directories(arrow_json_to_file @@ -115,11 +101,6 @@ set_target_properties(arrow_validate VERSION ${SPARROW_IPC_BINARY_VERSION} SOVERSION ${SPARROW_IPC_BINARY_CURRENT} FOLDER integration_tests - # RPATH settings for Unix-like systems - INSTALL_RPATH_USE_LINK_PATH TRUE - BUILD_WITH_INSTALL_RPATH FALSE - INSTALL_RPATH "$<$:$ORIGIN>$<$:@loader_path>" - BUILD_RPATH_USE_ORIGIN TRUE ) target_include_directories(arrow_validate From 0c64129ba4ca494ff7b29e7b8e7d7b084273dac4 Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Thu, 13 Nov 2025 10:19:49 +0100 Subject: [PATCH 23/24] try fix --- integration_tests/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/integration_tests/CMakeLists.txt b/integration_tests/CMakeLists.txt index 7fcd415..46da543 100644 --- a/integration_tests/CMakeLists.txt +++ b/integration_tests/CMakeLists.txt @@ -18,6 +18,9 @@ set_target_properties(arrow_file_to_stream VERSION ${SPARROW_IPC_BINARY_VERSION} SOVERSION ${SPARROW_IPC_BINARY_CURRENT} FOLDER "integration_tests" + BUILD_RPATH_USE_ORIGIN ON + BUILD_RPATH "$ORIGIN" + INSTALL_RPATH "$ORIGIN" ) target_include_directories(arrow_file_to_stream @@ -45,6 +48,9 @@ set_target_properties(arrow_stream_to_file VERSION ${SPARROW_IPC_BINARY_VERSION} SOVERSION ${SPARROW_IPC_BINARY_CURRENT} FOLDER integration_tests + BUILD_RPATH_USE_ORIGIN ON + BUILD_RPATH "$ORIGIN" + INSTALL_RPATH "$ORIGIN" ) target_include_directories(arrow_stream_to_file @@ -73,6 +79,9 @@ set_target_properties(arrow_json_to_file VERSION ${SPARROW_IPC_BINARY_VERSION} SOVERSION ${SPARROW_IPC_BINARY_CURRENT} FOLDER integration_tests + BUILD_RPATH_USE_ORIGIN ON + BUILD_RPATH "$ORIGIN" + INSTALL_RPATH "$ORIGIN" ) target_include_directories(arrow_json_to_file @@ -101,6 +110,9 @@ set_target_properties(arrow_validate VERSION ${SPARROW_IPC_BINARY_VERSION} SOVERSION ${SPARROW_IPC_BINARY_CURRENT} FOLDER integration_tests + BUILD_RPATH_USE_ORIGIN ON + BUILD_RPATH "$ORIGIN" + INSTALL_RPATH "$ORIGIN" ) target_include_directories(arrow_validate From 265a553aca509e87ead087ac06f3474828440b6e Mon Sep 17 00:00:00 2001 From: Alexis Placet Date: Thu, 13 Nov 2025 10:45:56 +0100 Subject: [PATCH 24/24] try fix --- .github/copilot-instructions.md | 60 ++++++++++++++++++++++++ .github/workflows/integration_tests.yaml | 9 +--- 2 files changed, 61 insertions(+), 8 deletions(-) create mode 100644 .github/copilot-instructions.md diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..908383e --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,60 @@ +# Sparrow-IPC AI Agent Instructions + +C++20 library for Arrow IPC serialization/deserialization using FlatBuffers. See [../examples/write_and_read_streams.cpp](../examples/write_and_read_streams.cpp) for usage patterns. 
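+
+A minimal read-side sketch (assumptions: the `deserialize_stream` entry point and span-based memory model described below; the include path shown is hypothetical — the real header layout lives under `include/sparrow_ipc/`):
+
+```cpp
+#include <fstream>
+#include <iterator>
+#include <span>
+#include <vector>
+#include <sparrow_ipc/deserialize.hpp>  // hypothetical path; see include/sparrow_ipc/
+
+std::ifstream in("input.stream", std::ios::binary);
+std::vector<uint8_t> stream_data((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
+// The returned batches are views over `stream_data`; keep the buffer alive while using them.
+auto batches = sparrow_ipc::deserialize_stream(std::span(stream_data));
+```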
+ +## Architecture + +- **Serialization**: `record_batch` → `serializer` → FlatBuffer metadata + body → stream (continuation bytes + length + message + padding + data) +- **Deserialization**: Binary stream → `extract_encapsulated_message()` → parse FlatBuffer → reconstruct `record_batch` +- **Critical**: All record batches in a stream must have identical schemas (validated in `serialize_record_batches_to_ipc_stream`) +- **Memory model**: Deserialized arrays use `std::span` - source buffer must outlive arrays + +## Build System + +**Dependency fetching** (unique pattern in `cmake/external_dependencies.cmake`): +- `FETCH_DEPENDENCIES_WITH_CMAKE=OFF` - require via `find_package()` (CI default) +- `FETCH_DEPENDENCIES_WITH_CMAKE=MISSING` - auto-fetch missing (local dev) +- All binaries/libs → `${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE}` (not standard locations) + +**FlatBuffer schemas**: Auto-downloaded from Apache Arrow during configure → `${CMAKE_BINARY_DIR}/generated/*_generated.h`. Never edit generated headers. + +**Build**: +```bash +mamba env create -f environment-dev.yml && mamba activate sparrow-ipc +cmake -B build -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DSPARROW_IPC_BUILD_TESTS=ON +cmake --build build -j12 +cmake --build build --target run_tests +``` + +## Platform-Specific Patterns + +**Linux executables linking sparrow-ipc**: Must set RPATH (libs in same dir): +```cmake +set_target_properties(my_exe PROPERTIES + BUILD_RPATH_USE_ORIGIN ON + BUILD_RPATH "$ORIGIN" + INSTALL_RPATH "$ORIGIN") +``` +See `integration_tests/CMakeLists.txt` for examples. Missing this causes "cannot open shared object file" errors. + +**Windows**: Explicit DLL copying in CMakeLists (see `tests/CMakeLists.txt:32-47`). + +## Testing + +- Arrow test data: Auto-fetched from `apache/arrow-testing`, `.json.gz` files extracted during configure +- Unit tests: `cmake --build build --target run_tests` +- Integration tests: `integration_tests/` tools integrate with Apache Arrow's Archery framework via Docker + +## Naming & Style + +- `snake_case` for everything (types, functions) +- `m_` prefix for members +- Namespace: `sparrow_ipc` +- Format: `cmake --build build --target clang-format` (requires `ACTIVATE_LINTER=ON`) + +## Common Issues + +1. Schema mismatches in stream → `std::invalid_argument` +2. Deallocating source buffer while arrays in use → undefined behavior +3. Missing RPATH on Linux → runtime linking errors +4. Only LZ4 compression supported (not ZSTD yet) diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml index 161d0d2..0ecd755 100644 --- a/.github/workflows/integration_tests.yaml +++ b/.github/workflows/integration_tests.yaml @@ -21,13 +21,6 @@ jobs: - name: Checkout code uses: actions/checkout@v5 - - name: Create build environment - uses: mamba-org/setup-micromamba@v2 - with: - environment-file: ./environment-dev.yml - environment-name: build_env - cache-environment: true - - name: Configure using CMake run: | cmake -G Ninja \ @@ -35,7 +28,7 @@ jobs: -DCMAKE_BUILD_TYPE:STRING=RELEASE \ -DSPARROW_IPC_BUILD_INTEGRATION_TESTS=ON \ -DFETCH_DEPENDENCIES_WITH_CMAKE=MISSING \ - -DSPARROW_IPC_BUILD_SHARED=ON + -DSPARROW_IPC_BUILD_SHARED=OFF - name: Build arrow_file_to_stream target working-directory: build