Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions .github/copilot-instructions.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Sparrow-IPC AI Agent Instructions

C++20 library for Arrow IPC serialization/deserialization using FlatBuffers. See [../examples/write_and_read_streams.cpp](../examples/write_and_read_streams.cpp) for usage patterns.

## Architecture

- **Serialization**: `record_batch` → `serializer` → FlatBuffer metadata + body → stream (continuation bytes + length + message + padding + data)
- **Deserialization**: Binary stream → `extract_encapsulated_message()` → parse FlatBuffer → reconstruct `record_batch`
- **Critical**: All record batches in a stream must have identical schemas (validated in `serialize_record_batches_to_ipc_stream`)
- **Memory model**: Deserialized arrays reference the source buffer through non-owning `std::span<const uint8_t>` views, so the source buffer must outlive every array created from it

## Build System

**Dependency fetching** (unique pattern in `cmake/external_dependencies.cmake`):
- `FETCH_DEPENDENCIES_WITH_CMAKE=OFF` - require via `find_package()` (CI default)
- `FETCH_DEPENDENCIES_WITH_CMAKE=MISSING` - auto-fetch missing (local dev)
- Build outputs: all binaries/libs → `${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE}` (not the standard CMake locations)

**FlatBuffer schemas**: Auto-downloaded from Apache Arrow during configure → `${CMAKE_BINARY_DIR}/generated/*_generated.h`. Never edit generated headers.

**Build**:
```bash
mamba env create -f environment-dev.yml && mamba activate sparrow-ipc
cmake -B build -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DSPARROW_IPC_BUILD_TESTS=ON
cmake --build build -j12
cmake --build build --target run_tests
```

## Platform-Specific Patterns

**Linux executables linking sparrow-ipc**: Must set their RPATH to `$ORIGIN`, because the shared libraries are placed in the same directory as the executable:
```cmake
set_target_properties(my_exe PROPERTIES
BUILD_RPATH_USE_ORIGIN ON
BUILD_RPATH "$ORIGIN"
INSTALL_RPATH "$ORIGIN")
```
See `integration_tests/CMakeLists.txt` for examples. Missing this causes "cannot open shared object file" errors.

**Windows**: Explicit DLL copying in CMakeLists (see `tests/CMakeLists.txt:32-47`).

## Testing

- Arrow test data: Auto-fetched from `apache/arrow-testing`, `.json.gz` files extracted during configure
- Unit tests: `cmake --build build --target run_tests`
- Integration tests: `integration_tests/` tools integrate with Apache Arrow's Archery framework via Docker

## Naming & Style

- `snake_case` for everything (types, functions)
- `m_` prefix for members
- Namespace: `sparrow_ipc`
- Format: `cmake --build build --target clang-format` (requires `ACTIVATE_LINTER=ON`)

## Common Issues

1. Schema mismatches in stream → `std::invalid_argument`
2. Deallocating source buffer while arrays in use → undefined behavior
3. Missing RPATH on Linux → runtime linking errors
4. Only LZ4 compression supported (not ZSTD yet)
65 changes: 65 additions & 0 deletions .github/workflows/integration_tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Builds the sparrow-ipc integration tools on the runner, then runs the Apache
# Arrow Archery integration script against them inside a Docker container.
name: Integration tests

on:
  workflow_dispatch:
  pull_request:
  push:
    branches: [main]

jobs:
  build_integration_container_and_run_tests:
    runs-on: ubuntu-22.04
    steps:
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y mold libpthread-stubs0-dev libboost-thread-dev doctest-dev

      # No version is pinned, so this installs whatever tzdata the package
      # index currently offers; -y keeps the step non-interactive.
      - name: Install tzdata
        run: sudo apt-get install -y tzdata

      - name: Checkout code
        uses: actions/checkout@v5

      # The build type is spelled RELEASE (upper case) because the output
      # directory is derived from it and the docker step below expects
      # build/bin/RELEASE/.
      - name: Configure using CMake
        run: |
          cmake -G Ninja \
            -Bbuild \
            -DCMAKE_BUILD_TYPE:STRING=RELEASE \
            -DSPARROW_IPC_BUILD_INTEGRATION_TESTS=ON \
            -DFETCH_DEPENDENCIES_WITH_CMAKE=MISSING \
            -DSPARROW_IPC_BUILD_SHARED=OFF

      - name: Build arrow_file_to_stream target
        working-directory: build
        run: cmake --build . --config Release --target arrow_file_to_stream

      - name: Build arrow_stream_to_file target
        working-directory: build
        run: cmake --build . --config Release --target arrow_stream_to_file

      - name: Build arrow_json_to_file target
        working-directory: build
        run: cmake --build . --config Release --target arrow_json_to_file

      - name: Build arrow_validate target
        working-directory: build
        run: cmake --build . --config Release --target arrow_validate

      # Diagnostic aid: prints the directory tree so the location of the
      # built tools can be checked in the job log.
      - name: List all folders and subfolders
        run: |
          echo "Listing all folders and subfolders:"
          find . -type d

      - name: Build Docker image
        run: docker build -t sparrow/integration-tests -f ci/docker/integration.dockerfile .

      # The checkout (including build/) is mounted at /workspace so the
      # container can reach the tools built in the steps above.
      - name: Run Integration tests
        run: |
          docker run --rm \
            -e ARCHERY_INTEGRATION_WITH_EXTERNAL_LIBRARY=/workspace/build/bin/RELEASE/ \
            -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_PRODUCER=true \
            -e ARCHERY_INTEGRATION_EXTERNAL_LIBRARY_IPC_CONSUMER=true \
            -v "${{ github.workspace }}:/workspace" \
            -w /arrow-integration sparrow/integration-tests \
            "/arrow-integration/ci/scripts/integration_arrow.sh /arrow-integration /build"
25 changes: 23 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ include(CMakeDependentOption)
# Put the project's own cmake/ directory at the front of the module search
# path so that include(<module>) resolves project modules first.
list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
message(DEBUG "CMake module path: ${CMAKE_MODULE_PATH}")

# NOTE(review): external_dependencies is also included further down, after the
# option() declarations; the hunk header reports two removed lines here, so
# this occurrence looks like the pre-change position -- confirm.
include(external_dependencies)

# Cache entry holding the public compile definitions of the sparrow-ipc target
# (empty by default).
set(SPARROW_IPC_COMPILE_DEFINITIONS "" CACHE STRING "List of public compile definitions of the sparrow-ipc target")

# Root of the project's include/ tree.
set(SPARROW_IPC_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
Expand Down Expand Up @@ -85,11 +83,27 @@ MESSAGE(STATUS "🔧 Build docs: ${SPARROW_IPC_BUILD_DOCS}")
# Optional components: each option is declared OFF by default and its
# effective value is echoed at configure time.
option(SPARROW_IPC_BUILD_EXAMPLES "Build sparrow-ipc examples" OFF)
message(STATUS "🔧 Build examples: ${SPARROW_IPC_BUILD_EXAMPLES}")

option(SPARROW_IPC_BUILD_INTEGRATION_TESTS "Build sparrow-ipc integration tests" OFF)
message(STATUS "🔧 Build integration tests: ${SPARROW_IPC_BUILD_INTEGRATION_TESTS}")

# Code coverage
# =============
option(SPARROW_IPC_ENABLE_COVERAGE "Enable sparrow-ipc test coverage" OFF)
message(STATUS "🔧 Enable coverage: ${SPARROW_IPC_ENABLE_COVERAGE}")

# Included after the option declarations above: the dependency module tests
# SPARROW_IPC_BUILD_INTEGRATION_TESTS (and SPARROW_IPC_BUILD_TESTS) to decide
# what to fetch.
include(external_dependencies)

# Build
# =====
# Single directory that receives every build artifact.
set(BINARY_BUILD_DIR "${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE}")

# Route runtime, library and archive outputs of the Debug and Release
# configurations to that one directory.
foreach(sparrow_ipc_output_kind IN ITEMS RUNTIME LIBRARY ARCHIVE)
    foreach(sparrow_ipc_output_config IN ITEMS DEBUG RELEASE)
        set(CMAKE_${sparrow_ipc_output_kind}_OUTPUT_DIRECTORY_${sparrow_ipc_output_config} "${BINARY_BUILD_DIR}")
    endforeach()
endforeach()

# Coverage support is only pulled in on request.
if(SPARROW_IPC_ENABLE_COVERAGE)
    include(code_coverage)
endif()
Expand Down Expand Up @@ -284,6 +298,13 @@ if(SPARROW_IPC_BUILD_EXAMPLES)
add_subdirectory(examples)
endif()

# Integration tests
# =================
# When SPARROW_IPC_BUILD_INTEGRATION_TESTS is enabled, log a status line and
# add the integration_tests/ subdirectory to the build.
if(SPARROW_IPC_BUILD_INTEGRATION_TESTS)
message(STATUS "🔨 Create integration tests targets")
add_subdirectory(integration_tests)
endif()

# Installation
# ============
include(GNUInstallDirs)
Expand Down
42 changes: 42 additions & 0 deletions ci/docker/integration.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Image used to run the Arrow integration script: starts from the Arrow
# conda integration image, clones an Arrow checkout and builds it.
FROM apache/arrow-dev:amd64-conda-integration

# Settings read by the Arrow integration build/run scripts: the C++
# integration is switched ON, the other listed language integrations are
# switched off.
ENV ARROW_USE_CCACHE=OFF \
    ARROW_CPP_EXE_PATH=/build/cpp/debug \
    BUILD_DOCS_CPP=OFF \
    ARROW_INTEGRATION_CPP=ON \
    ARROW_INTEGRATION_CSHARP=OFF \
    ARROW_INTEGRATION_GO=OFF \
    ARROW_INTEGRATION_JAVA=OFF \
    ARROW_INTEGRATION_JS=OFF \
    ARCHERY_INTEGRATION_WITH_NANOARROW="0" \
    ARCHERY_INTEGRATION_WITH_RUST="0"

# Refresh the package index and install in the same layer, so a cached
# "update" layer can never be paired with a later "install" against stale
# lists. apt-get is used instead of apt because apt's command-line interface
# is intended for interactive use. The lists are removed afterwards since they
# are not needed in the final image.
RUN apt-get update && \
    apt-get install -y build-essential git && \
    rm -rf /var/lib/apt/lists/*

# Clone the arrow monorepo // TODO: change to the official repo
RUN git clone --depth 1 --branch archery_supports_external_libraries https://github.com/Alex-PLACET/arrow.git /arrow-integration --recurse-submodules

# Build all the integrations
RUN conda run --no-capture-output \
    /arrow-integration/ci/scripts/integration_arrow_build.sh \
    /arrow-integration \
    /build
6 changes: 3 additions & 3 deletions cmake/external_dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ function(find_package_or_fetch)
endfunction()

# Mirror the sparrow-ipc shared/static choice into SPARROW_BUILD_SHARED
# (presumably read by the sparrow dependency resolved below -- confirm).
set(SPARROW_BUILD_SHARED ${SPARROW_IPC_BUILD_SHARED})

# Request the JSON reader target when either test suite is built.
# The options are named rather than expanded with ${...}: if() dereferences
# the names itself, and an unset or empty option is then simply false instead
# of collapsing the condition to `if( OR )`, which is a configure error.
if(SPARROW_IPC_BUILD_TESTS OR SPARROW_IPC_BUILD_INTEGRATION_TESTS)
    set(CREATE_JSON_READER_TARGET ON)
endif()
find_package_or_fetch(
Expand All @@ -81,7 +81,7 @@ unset(CREATE_JSON_READER_TARGET)
if(NOT TARGET sparrow::sparrow)
add_library(sparrow::sparrow ALIAS sparrow)
endif()
if(${SPARROW_IPC_BUILD_TESTS})
if(${SPARROW_IPC_BUILD_TESTS} OR ${SPARROW_IPC_BUILD_INTEGRATION_TESTS})
find_package_or_fetch(
PACKAGE_NAME sparrow-json-reader
)
Expand Down Expand Up @@ -123,7 +123,7 @@ if(NOT TARGET lz4::lz4)
add_library(lz4::lz4 ALIAS lz4)
endif()

if(SPARROW_IPC_BUILD_TESTS)
if(${SPARROW_IPC_BUILD_TESTS} OR ${SPARROW_IPC_BUILD_INTEGRATION_TESTS})
find_package_or_fetch(
PACKAGE_NAME doctest
GIT_REPOSITORY https://github.com/doctest/doctest.git
Expand Down
1 change: 1 addition & 0 deletions include/sparrow_ipc/memory_output_stream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <cstdint>
#include <functional>
#include <ranges>
#include <span>

namespace sparrow_ipc
{
Expand Down
Loading
Loading